00001
00008 #ifndef H_UNROLL_PREFETCH_SIMD_DEF_H
00009 #define H_UNROLL_PREFETCH_SIMD_DEF_H
00010
00019
/**
 * One 4x-unrolled step of a kernel working on a result and two input vectors.
 *
 * Calls MDOP on the slots at offsets 0, ADV, 2*ADV and 3*ADV and leaves
 * i reduced by 4*ADV and res, v1, v2 advanced by 4*ADV.  The names res, v1,
 * v2, i, f1 and f2 must be visible at the expansion site.
 *
 * v1 and v2 are bumped in the middle of the sequence, which is why the later
 * calls address them with negative offsets.
 *
 * The three EL_PER_CL(T) branches are identical here; presumably they are
 * placeholders for prefetch variants tuned per cache-line occupancy
 * (TODO confirm against the project history).
 *
 * @param MDOP  statement macro, called as MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2)
 * @param ADV   pointer/counter step between two consecutive MDOP calls
 * @param T     element type, only handed to EL_PER_CL()
 * @param SUF,UNA1,UNA2  forwarded untouched to MDOP
 */
#define UNROLL4_PREF_KERNEL5_SIMD(MDOP,ADV,T,SUF,UNA1,UNA2) \
if (EL_PER_CL(T) <= 1) { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*(ADV); \
    MDOP(res+(ADV),v1+(ADV),v2+(ADV),f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*(ADV); \
    MDOP(res+2*(ADV),v1-2*(ADV),v2+2*(ADV),f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*(ADV); \
    MDOP(res+3*(ADV),v1-(ADV),v2-(ADV),f1,f2,SUF,UNA1,UNA2); \
    res += 4*(ADV); \
} else if (EL_PER_CL(T) <= 2) { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*(ADV); \
    MDOP(res+(ADV),v1+(ADV),v2+(ADV),f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*(ADV); \
    MDOP(res+2*(ADV),v1-2*(ADV),v2+2*(ADV),f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*(ADV); \
    MDOP(res+3*(ADV),v1-(ADV),v2-(ADV),f1,f2,SUF,UNA1,UNA2); \
    res += 4*(ADV); \
} else { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*(ADV); \
    MDOP(res+(ADV),v1+(ADV),v2+(ADV),f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*(ADV); \
    MDOP(res+2*(ADV),v1-2*(ADV),v2+2*(ADV),f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*(ADV); \
    /* fourth slot is res+3*ADV; res+4*ADV would skip it and overrun */ \
    MDOP(res+3*(ADV),v1-(ADV),v2-(ADV),f1,f2,SUF,UNA1,UNA2); \
    res += 4*(ADV); \
}
00049
00050
/**
 * One 4x-unrolled step of a kernel working on a result and two input vectors.
 *
 * Calls MDOP on the slots at offsets 0, ADV, 2*ADV and 3*ADV and leaves
 * i reduced by 4*ADV and res, v1, v2 advanced by 4*ADV.  The names res, v1,
 * v2, i, f1 and f2 must be visible at the expansion site.  The expansion has
 * no trailing semicolon; the caller supplies it.
 *
 * ADV is parenthesised at every use so that an expression argument
 * (e.g. 1+1) keeps its value inside 4*ADV and friends.
 *
 * @param MDOP  statement macro, called as MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2)
 * @param ADV   pointer/counter step between two consecutive MDOP calls
 * @param SUF,UNA1,UNA2  forwarded untouched to MDOP
 */
#define UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2) \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*(ADV); \
    MDOP(res+(ADV),v1+(ADV),v2+(ADV),f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*(ADV); \
    /* v1 already advanced, v2 not yet: hence -2*ADV versus +2*ADV */ \
    MDOP(res+2*(ADV),v1-2*(ADV),v2+2*(ADV),f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*(ADV); \
    MDOP(res+3*(ADV),v1-(ADV),v2-(ADV),f1,f2,SUF,UNA1,UNA2); \
    res += 4*(ADV)
00061
00062
00063
/**
 * Unrolled main loop for three-vector kernels.
 *
 * While at least 4*ADV elements remain in i, repeats
 * UNROLL4_KERNEL5_SIMD.  Nothing happens when fewer than 4*ADV elements are
 * left; the remainder stays in i for the caller.  Inside the guarded block
 * two locals of type STP, named by the TMP and LD macros, are declared
 * (presumably scratch registers for MDOP -- TODO confirm).
 *
 * ADV is parenthesised here and when forwarded, so expression arguments
 * keep their value.
 *
 * @param MDOP  per-step operation, forwarded to UNROLL4_KERNEL5_SIMD
 * @param ADV   step of one MDOP call
 * @param STP   type of the TMP / LD locals
 * @param SUF,UNA1,UNA2  forwarded untouched
 */
#define VKERN_TEMPL_3V_NP_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
if (LIKELY(i >= 4*(ADV))) { \
    STP TMP, LD UNUSED; \
    do { \
        UNROLL4_KERNEL5_SIMD(MDOP,(ADV),SUF,UNA1,UNA2); \
    } while (i >= 4*(ADV)); \
}
00071
00072
/**
 * Non-unrolled loop for three-vector kernels.
 *
 * As long as at least ADV elements remain in i, calls MDOP once and moves
 * i, res, v1 and v2 on by ADV.  Two locals of type STP, named by the TMP
 * and LD macros, are declared in every iteration (presumably scratch for
 * MDOP -- TODO confirm).
 *
 * ADV is parenthesised so that a conditional-expression argument does not
 * capture the loop comparison.
 *
 * @param MDOP  statement macro, called as MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2)
 * @param ADV   step of one MDOP call
 * @param STP   type of the TMP / LD locals
 * @param SUF,UNA1,UNA2  forwarded untouched to MDOP
 */
#define VKERN_TEMPL_3V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
while (i >= (ADV)) { \
    STP TMP, LD UNUSED; \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= (ADV); \
    res += (ADV); \
    v1 += (ADV); \
    v2 += (ADV); \
}
00079
00080
/**
 * Element-by-element loop for three-vector kernels.
 *
 * Runs while COND holds and i is nonzero; each pass calls SDOP once (with
 * the two trailing arguments left empty) and steps i down and res, v1, v2
 * up by one.  Two locals of type STP, named by the TMP and LD macros, are
 * declared in every iteration (presumably scratch for SDOP -- TODO confirm).
 *
 * COND is parenthesised so that a condition containing || cannot absorb
 * the "&& i" bound check.
 *
 * @param SDOP  statement macro, called as SDOP(res,v1,v2,f1,f2,SUF,,)
 * @param COND  extra loop condition, re-evaluated before each pass
 * @param STP   type of the TMP / LD locals
 * @param SUF   forwarded untouched to SDOP
 */
#define VKERN_TEMPL_3V_SISD(SDOP,COND,STP,SUF) \
while ((COND) && i) { \
    STP TMP, LD UNUSED; \
    SDOP(res,v1,v2,f1,f2,SUF,,); \
    --i; \
    ++res; \
    ++v1; \
    ++v2; \
}
00087
00088
00089
/**
 * One 4x-unrolled step of a kernel working on a result and one input vector.
 *
 * Calls MDOP on the slots at offsets 0, ADV, 2*ADV and 3*ADV and leaves
 * i reduced by 4*ADV and res, v1 advanced by 4*ADV.  The names res, v1, i,
 * f1 and f2 must be visible at the expansion site.
 *
 * The three EL_PER_CL(T) branches are identical here; presumably they are
 * placeholders for prefetch variants tuned per cache-line occupancy
 * (TODO confirm against the project history).
 *
 * ADV is parenthesised at every use so expression arguments keep their
 * value.
 *
 * @param MDOP  statement macro, called as MDOP(res,v1,f1,f2,SUF,UNA)
 * @param ADV   pointer/counter step between two consecutive MDOP calls
 * @param T     element type, only handed to EL_PER_CL()
 * @param SUF,UNA  forwarded untouched to MDOP
 */
#define UNROLL4_PREF_KERNEL4_SIMD(MDOP,ADV,T,SUF,UNA) \
if (EL_PER_CL(T) <= 1) { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*(ADV); \
    MDOP(res+(ADV),v1+(ADV),f1,f2,SUF,UNA); \
    MDOP(res+2*(ADV),v1+2*(ADV),f1,f2,SUF,UNA); \
    v1 += 4*(ADV); \
    MDOP(res+3*(ADV),v1-(ADV),f1,f2,SUF,UNA); \
    res += 4*(ADV); \
} else if (EL_PER_CL(T) <= 2) { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*(ADV); \
    MDOP(res+(ADV),v1+(ADV),f1,f2,SUF,UNA); \
    MDOP(res+2*(ADV),v1+2*(ADV),f1,f2,SUF,UNA); \
    v1 += 4*(ADV); \
    MDOP(res+3*(ADV),v1-(ADV),f1,f2,SUF,UNA); \
    res += 4*(ADV); \
} else { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*(ADV); \
    MDOP(res+(ADV),v1+(ADV),f1,f2,SUF,UNA); \
    MDOP(res+2*(ADV),v1+2*(ADV),f1,f2,SUF,UNA); \
    v1 += 4*(ADV); \
    MDOP(res+3*(ADV),v1-(ADV),f1,f2,SUF,UNA); \
    res += 4*(ADV); \
}
00117
00118
/**
 * One 4x-unrolled step of a kernel working on a result and one input vector.
 *
 * Calls MDOP on the slots at offsets 0, ADV, 2*ADV and 3*ADV, then leaves
 * i reduced by 4*ADV and v1, res advanced by 4*ADV.  The names res, v1, i,
 * f1 and f2 must be visible at the expansion site.  The expansion has no
 * trailing semicolon; the caller supplies it.
 *
 * ADV is parenthesised at every use so expression arguments keep their
 * value.
 *
 * @param MDOP  statement macro, called as MDOP(res,v1,f1,f2,SUF,UNA)
 * @param ADV   pointer/counter step between two consecutive MDOP calls
 * @param SUF,UNA  forwarded untouched to MDOP
 */
#define UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA) \
    MDOP(res, v1, f1, f2, SUF, UNA); \
    MDOP(res+(ADV), v1+(ADV), f1, f2, SUF, UNA); \
    i -= 4*(ADV); \
    MDOP(res+2*(ADV), v1+2*(ADV), f1, f2, SUF, UNA); \
    MDOP(res+3*(ADV), v1+3*(ADV), f1, f2, SUF, UNA); \
    v1 += 4*(ADV); \
    res += 4*(ADV)
00127
00128
00129
/**
 * Unrolled main loop for two-vector kernels.
 *
 * While at least 4*ADV elements remain in i, repeats
 * UNROLL4_KERNEL4_SIMD.  Nothing happens when fewer than 4*ADV elements are
 * left; the remainder stays in i for the caller.  Inside the guarded block
 * two locals of type STP, named by the TMP and LD macros, are declared
 * (presumably scratch registers for MDOP -- TODO confirm).
 *
 * ADV is parenthesised here and when forwarded, so expression arguments
 * keep their value.
 *
 * @param MDOP  per-step operation, forwarded to UNROLL4_KERNEL4_SIMD
 * @param ADV   step of one MDOP call
 * @param STP   type of the TMP / LD locals
 * @param SUF,UNA  forwarded untouched
 */
#define VKERN_TEMPL_2V_NP_SIMD(MDOP,ADV,STP,SUF,UNA) \
if (LIKELY(i >= 4*(ADV))) { \
    STP TMP, LD UNUSED; \
    do { \
        UNROLL4_KERNEL4_SIMD(MDOP,(ADV),SUF,UNA); \
    } while (i >= 4*(ADV)); \
}
00137
00138
/**
 * Non-unrolled loop for two-vector kernels.
 *
 * As long as at least ADV elements remain in i, calls MDOP once and moves
 * i, v1 and res on by ADV.  Two locals of type STP, named by the TMP and LD
 * macros, are declared in every iteration (presumably scratch for MDOP --
 * TODO confirm).
 *
 * ADV is parenthesised so that a conditional-expression argument does not
 * capture the loop comparison.
 *
 * @param MDOP  statement macro, called as MDOP(res,v1,f1,f2,SUF,UNA)
 * @param ADV   step of one MDOP call
 * @param STP   type of the TMP / LD locals
 * @param SUF,UNA  forwarded untouched to MDOP
 */
#define VKERN_TEMPL_2V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA) \
while (i >= (ADV)) { \
    STP TMP, LD UNUSED; \
    MDOP(res, v1, f1, f2, SUF, UNA); \
    i -= (ADV); \
    v1 += (ADV); \
    res += (ADV); \
}
00145
00146
/**
 * Element-by-element loop for two-vector kernels.
 *
 * Runs while COND holds and i is nonzero; each pass calls SDOP once (with
 * the trailing argument left empty) and steps i down and v1, res up by one.
 * Two locals of type STP, named by the TMP and LD macros, are declared in
 * every iteration (presumably scratch for SDOP -- TODO confirm).
 *
 * COND is parenthesised so that a condition containing || cannot absorb
 * the "&& i" bound check.
 *
 * @param SDOP  statement macro, called as SDOP(res,v1,f1,f2,SUF,)
 * @param COND  extra loop condition, re-evaluated before each pass
 * @param STP   type of the TMP / LD locals
 * @param SUF   forwarded untouched to SDOP
 */
#define VKERN_TEMPL_2V_SISD(SDOP,COND,STP,SUF) \
while ((COND) && i) { \
    STP TMP, LD UNUSED; \
    SDOP(res,v1,f1,f2,SUF,); \
    --i; \
    ++v1; \
    ++res; \
}
00153
00154
00155
00156
/**
 * One 4x-unrolled step of a kernel working on a single vector.
 *
 * Calls MDOP on res at offsets 0, ADV, 2*ADV and 3*ADV and leaves i reduced
 * by 4*ADV and res advanced by 4*ADV.  The names res, i, f1 and f2 must be
 * visible at the expansion site.
 *
 * The three EL_PER_CL(T) branches are identical here; presumably they are
 * placeholders for prefetch variants tuned per cache-line occupancy
 * (TODO confirm against the project history).
 *
 * ADV is parenthesised at every use so expression arguments keep their
 * value.
 *
 * @param MDOP  statement macro, called as MDOP(res,f1,f2,SUF)
 * @param ADV   pointer/counter step between two consecutive MDOP calls
 * @param T     element type, only handed to EL_PER_CL()
 * @param SUF   forwarded untouched to MDOP
 */
#define UNROLL4_PREF_KERNEL3_SIMD(MDOP,ADV,T,SUF) \
if (EL_PER_CL(T) <= 1) { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+(ADV), f1, f2, SUF); \
    i -= 4*(ADV); \
    MDOP(res+2*(ADV), f1, f2, SUF); \
    MDOP(res+3*(ADV), f1, f2, SUF); \
    res += 4*(ADV); \
} else if (EL_PER_CL(T) <= 2) { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+(ADV), f1, f2, SUF); \
    i -= 4*(ADV); \
    MDOP(res+2*(ADV), f1, f2, SUF); \
    MDOP(res+3*(ADV), f1, f2, SUF); \
    res += 4*(ADV); \
} else { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+(ADV), f1, f2, SUF); \
    i -= 4*(ADV); \
    MDOP(res+2*(ADV), f1, f2, SUF); \
    MDOP(res+3*(ADV), f1, f2, SUF); \
    res += 4*(ADV); \
}
00184
00185
/**
 * One 4x-unrolled step of a kernel working on a single vector.
 *
 * Calls MDOP on res at offsets 0, ADV, 2*ADV and 3*ADV and leaves i reduced
 * by 4*ADV and res advanced by 4*ADV.  The names res, i, f1 and f2 must be
 * visible at the expansion site.  The expansion has no trailing semicolon;
 * the caller supplies it.
 *
 * ADV is parenthesised at every use so expression arguments keep their
 * value.
 *
 * @param MDOP  statement macro, called as MDOP(res,f1,f2,SUF)
 * @param ADV   pointer/counter step between two consecutive MDOP calls
 * @param SUF   forwarded untouched to MDOP
 */
#define UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF) \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+(ADV), f1, f2, SUF); \
    i -= 4*(ADV); \
    MDOP(res+2*(ADV), f1, f2, SUF); \
    MDOP(res+3*(ADV), f1, f2, SUF); \
    res += 4*(ADV)
00194
00195
/**
 * Unrolled main loop for single-vector kernels.
 *
 * While at least 4*ADV elements remain in i, repeats
 * UNROLL4_KERNEL3_SIMD.  Nothing happens when fewer than 4*ADV elements are
 * left; the remainder stays in i for the caller.  Inside the guarded block
 * one local of type STP, named by the TMP macro, is declared (presumably a
 * scratch register for MDOP -- TODO confirm).
 *
 * ADV is parenthesised here and when forwarded, so expression arguments
 * keep their value.
 *
 * @param MDOP  per-step operation, forwarded to UNROLL4_KERNEL3_SIMD
 * @param ADV   step of one MDOP call
 * @param STP   type of the TMP local
 * @param SUF   forwarded untouched
 */
#define VKERN_TEMPL_1V_NP_SIMD(MDOP,ADV,STP,SUF) \
if (LIKELY(i >= 4*(ADV))) { \
    STP TMP UNUSED; \
    do { \
        UNROLL4_KERNEL3_SIMD(MDOP,(ADV),SUF); \
    } while (i >= 4*(ADV)); \
}
00203
00204
/**
 * Non-unrolled loop for single-vector kernels.
 *
 * As long as at least ADV elements remain in i, calls MDOP once and moves
 * i and res on by ADV.  One local of type STP, named by the TMP macro, is
 * declared in every iteration (presumably scratch for MDOP -- TODO confirm).
 *
 * ADV is parenthesised so that a conditional-expression argument does not
 * capture the loop comparison.
 *
 * @param MDOP  statement macro, called as MDOP(res,f1,f2,SUF)
 * @param ADV   step of one MDOP call
 * @param STP   type of the TMP local
 * @param SUF   forwarded untouched to MDOP
 */
#define VKERN_TEMPL_1V_PLAIN_SIMD(MDOP,ADV,STP,SUF) \
while (i >= (ADV)) { \
    STP TMP UNUSED; \
    MDOP(res,f1,f2,SUF); \
    i -= (ADV); \
    res += (ADV); \
}
00211
00212
/**
 * Element-by-element loop for single-vector kernels.
 *
 * Runs while COND holds and i is nonzero; each pass calls SDOP once and
 * steps i down and res up by one.  One local of type STP, named by the TMP
 * macro, is declared in every iteration (presumably scratch for SDOP --
 * TODO confirm).
 *
 * COND is parenthesised so that a condition containing || cannot absorb
 * the "&& i" bound check.
 *
 * @param SDOP  statement macro, called as SDOP(res,f1,f2,SUF)
 * @param COND  extra loop condition, re-evaluated before each pass
 * @param STP   type of the TMP local
 * @param SUF   forwarded untouched to SDOP
 */
#define VKERN_TEMPL_1V_SISD(SDOP,COND,STP,SUF) \
while ((COND) && i) { \
    STP TMP UNUSED; \
    SDOP(res,f1,f2,SUF); \
    --i; \
    ++res; \
}
00219
00220
00221
00222
00223
00224
00225
00226
00227
/*
 * MISALIGNMENT_CHECK(x): nonzero when the low four address bits of x are
 * set, i.e. when x is not a multiple of 16.
 *
 * Off x86-64, when MALLOC_CACHE is not defined or SSE_VARS_MAY_BE_UNALIGNED
 * is, the test is emitted without a branch hint and GCC / Intel builds get
 * a compile-time warning.  In every other configuration the test is wrapped
 * in UNLIKELY() (presumably because allocations are then expected to be
 * 16-byte aligned -- TODO confirm).
 *
 * The argument is parenthesised before the cast so that an expression such
 * as p + 4 is converted as a whole instead of having only p cast.
 */
#if !defined(__x86_64__) && (!defined(MALLOC_CACHE) || defined(SSE_VARS_MAY_BE_UNALIGNED))
# define MISALIGNMENT_CHECK(x) ((unsigned long)(x) & 0x0f)
# if defined(__GNUC__) || defined(__INTEL_COMPILER)
#  warning May have to use slow unaligned SSE insns
# endif
#else
# define MISALIGNMENT_CHECK(x) (UNLIKELY((unsigned long)(x) & 0x0f))
#endif
00241
/*
 * WARN_UNALIGN(v): with WARN_UNALIGNED defined, writes a one-line warning
 * naming the expression v, its address and the enclosing function to cerr;
 * otherwise expands to an empty statement.
 *
 * Both forms are do { } while (0) statements that need a trailing
 * semicolon, so the macro behaves the same in either configuration (also
 * as the sole body of an if/else).  v is printed through a const void *
 * cast so that character pointers show their address, not their contents.
 */
#ifdef WARN_UNALIGNED
# define WARN_UNALIGN(v) \
    do { \
        STD__ cerr << "TBCI WARN: Unaligned access to " #v " at " \
                   << static_cast<const void *>(v) \
                   << " from " << __FUNCTION__ << "\n"; \
    } while (0)
#else
# define WARN_UNALIGN(v) do {} while (0)
#endif
00253
00258
00259
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1, _v2).
 *
 * Generated body, in order:
 *  1. PREP (used as a plain statement);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF); the `u` marker is passed for
 *     each of v1 / v2 that MISALIGNMENT_CHECK flags, and WARN_UNALIGN is
 *     invoked for it;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN (used as a plain statement).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP,SFIN,FIN  code inserted before, between and after the loops
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_3V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1, \
        const TYPE* RESTRICT const _v2)) \
{ \
    const TYPE *v1 = _v1, *v2 = _v2; \
    TYPE *res = _res; \
    PREP; \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on which inputs are misaligned */ \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN; \
}
00307
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1, _v2).
 *
 * NOTE(review): the expansion is the same as that of VKERN_TEMPL_3V_SIMD;
 * the _UA suffix presumably stands for "unaligned", but nothing in this
 * macro treats alignment differently -- confirm whether that is intended.
 *
 * Generated body, in order:
 *  1. PREP (used as a plain statement);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF); the `u` marker is passed for
 *     each of v1 / v2 that MISALIGNMENT_CHECK flags, and WARN_UNALIGN is
 *     invoked for it;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN (used as a plain statement).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP,SFIN,FIN  code inserted before, between and after the loops
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_3V_SIMD_UA(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1, \
        const TYPE* RESTRICT const _v2)) \
{ \
    const TYPE *v1 = _v1, *v2 = _v2; \
    TYPE *res = _res; \
    PREP; \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on which inputs are misaligned */ \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN; \
}
00342
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1, _v2, _f2), i.e. the three-vector kernel with one
 * extra LCTYPED(TYPE) argument.
 *
 * Generated body, in order:
 *  1. PREP(_f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF); the `u` marker is passed for
 *     each of v1 / v2 that MISALIGNMENT_CHECK flags, and WARN_UNALIGN is
 *     invoked for it;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f2).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_3V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1, \
        const TYPE* RESTRICT const _v2, \
        LCTYPED(TYPE) _f2)) \
{ \
    const TYPE *v1 = _v1, *v2 = _v2; \
    TYPE *res = _res; \
    PREP(_f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on which inputs are misaligned */ \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}
00377
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1, _v2, _f1, _f2), i.e. the three-vector kernel with
 * two extra LCTYPED(TYPE) arguments.
 *
 * Generated body, in order:
 *  1. PREP(_f1, _f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF); the `u` marker is passed for
 *     each of v1 / v2 that MISALIGNMENT_CHECK flags, and WARN_UNALIGN is
 *     invoked for it;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f1, _f2).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f1, _f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f1, _f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_3V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1, \
        const TYPE* RESTRICT const _v2, \
        LCTYPED(TYPE) _f1, \
        LCTYPED(TYPE) _f2)) \
{ \
    const TYPE *v1 = _v1, *v2 = _v2; \
    TYPE *res = _res; \
    PREP(_f1, _f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on which inputs are misaligned */ \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_NP_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN(_f1, _f2); \
}
00413
00414
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1).
 *
 * Generated body, in order:
 *  1. PREP (used as a plain statement);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF), with the `u` marker and a
 *     WARN_UNALIGN call when MISALIGNMENT_CHECK(v1) is set;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN (used as a plain statement).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.  The
 * unaligned loop invocation is now terminated with a semicolon like the
 * aligned one.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP,SFIN,FIN  code inserted before, between and after the loops
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_2V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1)) \
{ \
    TYPE *res = _res; \
    const TYPE *v1 = _v1; \
    PREP; \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on whether v1 is misaligned */ \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,u); \
    } else { \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    FIN; \
}
00439
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1, _f2), i.e. the two-vector kernel with one extra
 * LCTYPED(TYPE) argument.
 *
 * Generated body, in order:
 *  1. PREP(_f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF), with the `u` marker and a
 *     WARN_UNALIGN call when MISALIGNMENT_CHECK(v1) is set;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f2).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.  The
 * unaligned loop invocation is now terminated with a semicolon like the
 * aligned one.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_2V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1, \
        LCTYPED(TYPE) _f2)) \
{ \
    const TYPE *v1 = _v1; \
    TYPE *res = _res; \
    PREP(_f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on whether v1 is misaligned */ \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,u); \
    } else { \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}
00465
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1, _f1, _f2), i.e. the two-vector kernel with two
 * extra LCTYPED(TYPE) arguments.
 *
 * Generated body, in order:
 *  1. PREP(_f1, _f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF), with the `u` marker and a
 *     WARN_UNALIGN call when MISALIGNMENT_CHECK(v1) is set;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f1, _f2).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.  The
 * unaligned loop invocation is now terminated with a semicolon like the
 * aligned one.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f1, _f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f1, _f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_2V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1, \
        LCTYPED(TYPE) _f1, \
        LCTYPED(TYPE) _f2)) \
{ \
    const TYPE *v1 = _v1; \
    TYPE *res = _res; \
    PREP(_f1, _f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on whether v1 is misaligned */ \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,u); \
    } else { \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    FIN(_f1, _f2); \
}
00492
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _v1, _f2) in which both vectors are read-only and _f2 is
 * a non-const TYPE reference (presumably the output of a reduction that
 * PREP / FIN set up and store -- TODO confirm against the callers).
 *
 * Generated body, in order:
 *  1. PREP(_f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF), with the `u` marker and a
 *     WARN_UNALIGN call when MISALIGNMENT_CHECK(v1) is set;
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f2).
 *
 * The locals no longer carry the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.  The
 * unaligned loop invocation is now terminated with a semicolon like the
 * aligned one.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch locals declared inside the loops
 */
#define VKERN_TEMPL_2V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        const TYPE* RESTRICT const _res, \
        const TYPE* RESTRICT const _v1, \
        TYPE &_f2)) \
{ \
    const TYPE *res = _res, *v1 = _v1; \
    PREP(_f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    /* unrolled part, specialised on whether v1 is misaligned */ \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,u); \
    } else { \
        VKERN_TEMPL_2V_NP_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}
00517
/**
 * Emits the TYPE specialization of the kernel template FNAME(sz, _res).
 *
 * Generated body, in order:
 *  1. PREP (used as a plain statement);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF);
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN (used as a plain statement).
 *
 * The local no longer carries the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP,SFIN,FIN  code inserted before, between and after the loops
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch local declared inside the loops
 */
#define VKERN_TEMPL_1V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res)) \
{ \
    TYPE *res = _res; \
    PREP; \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_NP_SIMD(OP,ADV,STP,MSUF); \
    \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    FIN; \
}
00534
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _f2), i.e. the single-vector kernel with one extra
 * LCTYPED(TYPE) argument.
 *
 * Generated body, in order:
 *  1. PREP(_f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF);
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f2).
 *
 * The local no longer carries the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch local declared inside the loops
 */
#define VKERN_TEMPL_1V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        LCTYPED(TYPE) _f2)) \
{ \
    TYPE *res = _res; \
    PREP(_f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_NP_SIMD(OP,ADV,STP,MSUF); \
    \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}
00552
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _f1, _f2), i.e. the single-vector kernel with two extra
 * LCTYPED(TYPE) arguments.
 *
 * Generated body, in order:
 *  1. PREP(_f1, _f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF);
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f1, _f2).
 *
 * The local no longer carries the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f1, _f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f1, _f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch local declared inside the loops
 */
#define VKERN_TEMPL_1V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        TYPE* RESTRICT const _res, \
        LCTYPED(TYPE) _f1, \
        LCTYPED(TYPE) _f2)) \
{ \
    TYPE *res = _res; \
    PREP(_f1, _f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_NP_SIMD(OP,ADV,STP,MSUF); \
    \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    FIN(_f1, _f2); \
}
00571
/**
 * Emits the TYPE specialization of the kernel template
 * FNAME(sz, _res, _f2) in which the vector is read-only and _f2 is a
 * non-const TYPE reference (presumably the output of a reduction that
 * PREP / FIN set up and store -- TODO confirm against the callers).
 *
 * Generated body, in order:
 *  1. PREP(_f2);
 *  2. scalar OP steps (suffix SSUF) while MISALIGNMENT_CHECK(res) is set
 *     and elements remain;
 *  3. the 4x-unrolled OP loop (suffix MSUF);
 *  4. SFIN;
 *  5. scalar OP steps for whatever is left in i;
 *  6. FIN(_f2).
 *
 * The local no longer carries the `register` storage class: it has been
 * removed from the language in C++17 and was only a hint before.
 *
 * @param FNAME  function template being specialized
 * @param OP     operation macro used for both scalar and SIMD steps
 * @param SSUF   suffix handed to OP in the scalar loops
 * @param MSUF   suffix handed to OP in the unrolled loop
 * @param PREP   function-like macro, invoked as PREP(_f2)
 * @param SFIN   code inserted between the SIMD loop and the scalar tail
 * @param FIN    function-like macro, invoked as FIN(_f2)
 * @param ADV    step of one SIMD OP call
 * @param TYPE   element type
 * @param STP    type of the scratch local declared inside the loops
 */
#define VKERN_TEMPL_1V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
        const TYPE* const _res, \
        TYPE &_f2)) \
{ \
    const TYPE *res = _res; \
    PREP(_f2); \
    long i = sz; \
    \
    /* scalar lead-in until res passes the alignment check */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_NP_SIMD(OP,ADV,STP,MSUF); \
    \
    SFIN; \
    /* scalar tail for the remaining elements */ \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}
00589
00590 #endif // H_UNROLL_PREFETCH_SIMD_DEF_H
00591