00001
00009 #ifndef H_VEC_KERN_SPECIAL_H
00010 #define H_VEC_KERN_SPECIAL_H
00011
00064 #if defined(__SSE2__) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
00065 ( defined(__x86_64__) || defined(__i386__) )
00066
00067 #include <emmintrin.h>
00068
00069 #if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
00070 # include <pmmintrin.h>
00071 #else
00072 # undef __SSE3__
00073 #endif
00074
00075 #include "unroll_prefetch_simd_def.h"
00076
00077
00078
00079 #if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
00080 # include "vec_kern_special_gd.h"
00081 #else
00082
00083 NAMESPACE_TBCI
00084
00085 #if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
00086 # warning Info: Using unrolled SSE2 vector kernels
00087 #endif
00088
00089
00090
00091
00092
00093
/**
 * No-op statement placeholders accepting zero, one or two (ignored) arguments.
 * They are handed to the VKERN_TEMPL_*_SIMD generators further down in the
 * argument slots where a kernel needs no extra code.
 */
00094 #define SIMD_EMPTY0 do {} while (0)
00095 #define SIMD_EMPTY1(x) do {} while (0)
00096 #define SIMD_EMPTY2(x,y) do {} while (0)
00097
/**
 * Setup snippets that broadcast scalar kernel constants into SSE registers.
 *
 * The single-constant forms declare a local `f2` holding x in every lane;
 * the two-constant forms declare `f1` (from x) and `f2` (from y).  The kernel
 * macros in this file refer to these locals by exactly those names.
 *
 * The former `register` storage class was dropped: it is only a hint,
 * deprecated since C++11 and ill-formed from C++17 on.
 */
#define SIMD_CONST_DOUBLE_PREP(x) __m128d f2 = _mm_set1_pd(x)
#define SIMD_2CONST_DOUBLE_PREP(x,y) __m128d f1 = _mm_set1_pd(x), f2 = _mm_set1_pd(y)

#define SIMD_CONST_FLOAT_PREP(x) __m128 f2 = _mm_set1_ps(x)
#define SIMD_2CONST_FLOAT_PREP(x,y) __m128 f1 = _mm_set1_ps(x), f2 = _mm_set1_ps(y)
00103
00104
00105
00106
/**
 * _MM_STORE(mem, reg, SUF, UNA): store `reg` to `mem` through the intrinsic
 * `_mm_store<UNA>_<SUF>` (e.g. SUF=pd with empty UNA gives _mm_store_pd).
 *
 * For GCC 4.0.0 exactly, an empty asm statement that names `reg` as an "x"
 * (SSE register) input operand is emitted before the store.
 * NOTE(review): presumably a workaround for a code generation problem in
 * that compiler release -- confirm against the project history.
 *
 * The version test uses only the macros GCC really predefines (__GNUC__,
 * __GNUC_MINOR__, __GNUC_PATCHLEVEL__).  The previous test additionally
 * compared a non-existent `__GNUC_MAJOR__` macro with 0, which was always
 * true for an undefined identifier and triggered -Wundef warnings.
 */
#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 0 && \
	(! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
# define _MM_STORE(mem, reg, SUF, UNA) \
	asm(""::"x"(reg)); \
	_mm_store##UNA##_##SUF(mem, reg)
#else
# define _MM_STORE(mem, reg, SUF, UNA) \
	_mm_store##UNA##_##SUF(mem, reg)
#endif
00117
00118
00119
00120
00121 #ifndef C_MEMALLOC
00123
00124
/**
 * COPY2_SIMD: one SIMD step of r = v1 (load from v1, store to r).
 * UNA1 is pasted into the load intrinsic name; the store leaves UNA empty.
 * f1/f2 are unused.  Instantiated as _tbci_copy for double and float.
 * VKERN_TEMPL_2V_SIMD is defined elsewhere -- presumably it provides TMP and
 * the surrounding loop.
 */
00125 #define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
00126 TMP = _mm_load##UNA1##_##SUF(v1); \
00127 _MM_STORE(r, TMP, SUF,)
00128 VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, sd, pd,
00129 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00130 2, double, __m128d)
00131 VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, ss, ps,
00132 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00133 4, float, __m128)
00134
00135
00136
00137
00138
00139
00140
00142
/**
 * FILL1_SIMD: one SIMD step of r = f2, where f2 is the register declared by
 * SIMD_CONST_*_PREP.  f1 is unused.  Instantiated as _tbci_fill for double
 * and float through VKERN_TEMPL_1V_C_SIMD (defined elsewhere).
 */
00143 #define FILL1_SIMD(r,f1,f2,SUF) \
00144 _MM_STORE(r, f2, SUF,)
00145 VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, sd, pd,
00146 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00147 2, double, __m128d)
00148 VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, ss, ps,
00149 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00150 4, float, __m128)
00151
00152 #endif
00153
00154
00156
/**
 * ADD3_SIMD: one SIMD step of r = v1 + v2.
 * UNA1/UNA2 are pasted into the names of the v1/v2 load intrinsics; the
 * result is stored with empty UNA.  f1/f2 are unused.
 * Instantiated as do_vec_vec_add for double and float; TMP and LD are not
 * declared here -- presumably VKERN_TEMPL_3V_SIMD supplies them.
 */
00157 #define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
00158 TMP = _mm_load##UNA1##_##SUF(v1); \
00159 LD = _mm_load##UNA2##_##SUF(v2); \
00160 TMP = _mm_add_##SUF(TMP, LD); \
00161 _MM_STORE(r, TMP, SUF,)
00162 VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, sd, pd,
00163 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00164 2, double, __m128d)
00165 VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, ss, ps,
00166 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00167 4, float, __m128)
00168
00170
/**
 * SUB3_SIMD: one SIMD step of r = v1 - v2 (f1/f2 unused).
 * Instantiated as do_vec_vec_sub for double and float.
 */
00171 #define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
00172 TMP = _mm_load##UNA1##_##SUF(v1); \
00173 LD = _mm_load##UNA2##_##SUF(v2); \
00174 TMP = _mm_sub_##SUF(TMP, LD); \
00175 _MM_STORE(r, TMP, SUF,)
00176 VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, sd, pd,
00177 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00178 2, double, __m128d)
00179 VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, ss, ps,
00180 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00181 4, float, __m128)
00182
00184
/**
 * MUL3_SIMD: one SIMD step of the element-wise product r = v1 * v2
 * (f1/f2 unused).  Instantiated as do_vec_vec_mul for double and float.
 */
00185 #define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
00186 TMP = _mm_load##UNA1##_##SUF(v1); \
00187 LD = _mm_load##UNA2##_##SUF(v2); \
00188 TMP = _mm_mul_##SUF(TMP, LD); \
00189 _MM_STORE(r, TMP, SUF,)
00190 VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, sd, pd,
00191 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00192 2, double, __m128d)
00193 VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, ss, ps,
00194 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00195 4, float, __m128)
00196
00197 template <> inline void do_vec_vec_cmul<double>(const unsigned long sz,
00198 double* RESTRICT const res, const double* RESTRICT const v1,
00199 const double* RESTRICT const v2)
00200 {
00201 do_vec_vec_mul<double>(sz, res, v1, v2);
00202 }
00203 template <> inline void do_vec_vec_cmul<float>(const unsigned long sz,
00204 float* RESTRICT const res, const float* RESTRICT const v1,
00205 const float* RESTRICT const v2)
00206 {
00207 do_vec_vec_mul<float>(sz, res, v1, v2);
00208 }
00209
00211
/**
 * DIV3_SIMD: one SIMD step of the element-wise quotient r = v1 / v2
 * (f1/f2 unused).  Instantiated as do_vec_vec_div for double and float.
 */
00212 #define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
00213 TMP = _mm_load##UNA1##_##SUF(v1); \
00214 LD = _mm_load##UNA2##_##SUF(v2); \
00215 TMP = _mm_div_##SUF(TMP, LD); \
00216 _MM_STORE(r, TMP, SUF,)
00217 VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, sd, pd,
00218 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00219 2, double, __m128d)
00220 VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, ss, ps,
00221 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00222 4, float, __m128)
00223
00224 template <> inline void do_vec_vec_cdiv<double>(const unsigned long sz,
00225 double* RESTRICT const res, const double* RESTRICT const v1,
00226 const double* RESTRICT const v2)
00227 {
00228 do_vec_vec_div<double>(sz, res, v1, v2);
00229 }
00230 template <> inline void do_vec_vec_cdiv<float>(const unsigned long sz,
00231 float* RESTRICT const res, const float* RESTRICT const v1,
00232 const float* RESTRICT const v2)
00233 {
00234 do_vec_vec_div<float>(sz, res, v1, v2);
00235 }
00236
00237
00239
/**
 * ADD2_SIMD: one SIMD step of the in-place update r = r + v1.
 * r is always read with the plain `_mm_load_<SUF>`; UNA1 is pasted into the
 * v1 load.  f1/f2 are unused.  Instantiated as do_vec_add_vec.
 */
00240 #define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
00241 TMP = _mm_load_##SUF(r); \
00242 LD = _mm_load##UNA1##_##SUF(v1); \
00243 TMP = _mm_add_##SUF(TMP, LD); \
00244 _MM_STORE(r, TMP, SUF,)
00245 VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, sd, pd,
00246 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00247 2, double, __m128d)
00248 VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, ss, ps,
00249 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00250 4, float, __m128)
00251
00253
/**
 * SUB2_SIMD: one SIMD step of the in-place update r = r - v1
 * (f1/f2 unused).  Instantiated as do_vec_sub_vec.
 */
00254 #define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
00255 TMP = _mm_load_##SUF(r); \
00256 LD = _mm_load##UNA1##_##SUF(v1); \
00257 TMP = _mm_sub_##SUF(TMP, LD); \
00258 _MM_STORE(r, TMP, SUF,)
00259 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, sd, pd,
00260 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00261 2, double, __m128d)
00262 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, ss, ps,
00263 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00264 4, float, __m128)
00265
00267
/**
 * SUB2I_SIMD: one SIMD step of the reversed in-place difference r = v1 - r
 * (f1/f2 unused).  Instantiated as do_vec_sub_vec_inv.
 */
00268 #define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
00269 TMP = _mm_load_##SUF(r); \
00270 LD = _mm_load##UNA1##_##SUF(v1); \
00271 LD = _mm_sub_##SUF(LD, TMP); \
00272 _MM_STORE(r, LD, SUF,)
00273 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, sd, pd,
00274 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00275 2, double, __m128d)
00276 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, ss, ps,
00277 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00278 4, float, __m128)
00279
00281
/**
 * MUL2_SIMD: one SIMD step of the in-place element-wise product r = r * v1
 * (f1/f2 unused).  Instantiated as do_vec_mul_vec.
 */
00282 #define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
00283 TMP = _mm_load_##SUF(r); \
00284 LD = _mm_load##UNA1##_##SUF(v1); \
00285 TMP = _mm_mul_##SUF(TMP, LD); \
00286 _MM_STORE(r, TMP, SUF,)
00287 VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, sd, pd,
00288 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00289 2, double, __m128d)
00290 VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, ss, ps,
00291 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00292 4, float, __m128)
00293
00295
00296 template <> inline void do_vec_cmul_vec<double>(const unsigned long sz,
00297 double* RESTRICT const res, const double* RESTRICT const v1)
00298 {
00299 do_vec_mul_vec<double>(sz, res, v1);
00300 }
00301 template <> inline void do_vec_cmul_vec<float>(const unsigned long sz,
00302 float* RESTRICT const res, const float* RESTRICT const v1)
00303 {
00304 do_vec_mul_vec<float>(sz, res, v1);
00305 }
00307
00308 template <> inline void do_vec_cmul_vec_inv<double>(const unsigned long sz,
00309 double* RESTRICT const res, const double* RESTRICT const v1)
00310 {
00311 do_vec_mul_vec<double>(sz, res, v1);
00312 }
00313 template <> inline void do_vec_cmul_vec_inv<float>(const unsigned long sz,
00314 float* RESTRICT const res, const float* RESTRICT const v1)
00315 {
00316 do_vec_mul_vec<float>(sz, res, v1);
00317 }
00318
00320
/**
 * DIV2_SIMD: one SIMD step of the in-place quotient r = r / v1
 * (f1/f2 unused).  Instantiated as do_vec_div_vec.
 */
00321 #define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
00322 TMP = _mm_load_##SUF(r); \
00323 LD = _mm_load##UNA1##_##SUF(v1); \
00324 TMP = _mm_div_##SUF(TMP, LD); \
00325 _MM_STORE(r, TMP, SUF,)
00326 VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, sd, pd,
00327 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00328 2, double, __m128d)
00329 VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, ss, ps,
00330 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00331 4, float, __m128)
00332
00334
/**
 * DIV2I_SIMD: one SIMD step of the reversed in-place quotient r = v1 / r
 * (f1/f2 unused).  Instantiated as do_vec_div_vec_inv.
 */
00335 #define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
00336 TMP = _mm_load_##SUF(r); \
00337 LD = _mm_load##UNA1##_##SUF(v1); \
00338 LD = _mm_div_##SUF(LD, TMP); \
00339 _MM_STORE(r, LD, SUF,)
00340 VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, sd, pd,
00341 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00342 2, double, __m128d)
00343 VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, ss, ps,
00344 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
00345 4, float, __m128)
00346
00348
00349 template <> inline void do_vec_cdiv_vec<double>(const unsigned long sz,
00350 double* RESTRICT const res, const double* RESTRICT const v1)
00351 {
00352 do_vec_div_vec<double>(sz, res, v1);
00353 }
00354 template <> inline void do_vec_cdiv_vec<float>(const unsigned long sz,
00355 float* RESTRICT const res, const float* RESTRICT const v1)
00356 {
00357 do_vec_div_vec<float>(sz, res, v1);
00358 }
00359
00360
00362
00363 template <> inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz,
00364 double* RESTRICT const res, const double* RESTRICT const v1)
00365 {
00366 do_vec_div_vec_inv<double>(sz, res, v1);
00367 }
00368 template <> inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz,
00369 float* RESTRICT const res, const float* RESTRICT const v1)
00370 {
00371 do_vec_div_vec_inv<float>(sz, res, v1);
00372 }
00373
00375
/**
 * ADD2NV_SIMD: one SIMD step of r = v1 + f2, where f2 is the broadcast
 * constant declared by SIMD_CONST_*_PREP (f1 unused).
 * Instantiated as do_vec_val_add for double and float.
 */
00376 #define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
00377 TMP = _mm_load##UNA1##_##SUF(v1); \
00378 TMP = _mm_add_##SUF(TMP, f2); \
00379 _MM_STORE(r, TMP, SUF,)
00380 VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, sd, pd,
00381 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00382 2, double, __m128d)
00383 VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, ss, ps,
00384 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00385 4, float, __m128)
00386
00388
/**
 * SUB2NV_SIMD: one SIMD step of r = v1 - f2 with the broadcast constant f2
 * (f1 unused).  Instantiated as do_vec_val_sub.
 */
00389 #define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
00390 TMP = _mm_load##UNA1##_##SUF(v1); \
00391 TMP = _mm_sub_##SUF(TMP, f2); \
00392 _MM_STORE(r, TMP, SUF,)
00393 VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, sd, pd,
00394 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00395 2, double, __m128d)
00396 VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, ss, ps,
00397 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00398 4, float, __m128)
00399
00401
00402
/**
 * MUL2NV_SIMD: one SIMD step of r = v1 * f2 with the broadcast constant f2
 * (f1 unused).  Instantiated as do_vec_val_mul.
 */
00403 #define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
00404 TMP = _mm_load##UNA1##_##SUF(v1); \
00405 TMP = _mm_mul_##SUF(TMP, f2); \
00406 _MM_STORE(r, TMP, SUF,)
00407 VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, sd, pd,
00408 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00409 2, double, __m128d)
00410 VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, ss, ps,
00411 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00412 4, float, __m128)
00413
00414
00416
00417 template <> inline void do_val_vec_add<double>(const unsigned long sz,
00418 double* RESTRICT const res, const double* RESTRICT const v1,
00419 LCTYPED(double) _f2)
00420 {
00421 do_vec_val_add<double>(sz, res, v1, _f2);
00422 }
00423 template <> inline void do_val_vec_add<float>(const unsigned long sz,
00424 float* RESTRICT const res, const float* RESTRICT const v1,
00425 LCTYPED(float) _f2)
00426 {
00427 do_vec_val_add<float>(sz, res, v1, _f2);
00428 }
00429
00431
/**
 * SUB2RV_SIMD: one SIMD step of r = f2 - v1 with the broadcast constant f2
 * (f1 unused).  Instantiated as do_val_vec_sub.
 */
00432 #define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
00433 TMP = _mm_load##UNA1##_##SUF(v1); \
00434 TMP = _mm_sub_##SUF(f2, TMP); \
00435 _MM_STORE(r, TMP, SUF,)
00436 VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, sd, pd,
00437 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00438 2, double, __m128d)
00439 VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, ss, ps,
00440 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00441 4, float, __m128)
00442
00444
00445 template <> inline void do_val_vec_mul<double>(const unsigned long sz,
00446 double* RESTRICT const res, const double* RESTRICT const v1,
00447 LCTYPED(double) _f2)
00448 {
00449 do_vec_val_mul<double>(sz, res, v1, _f2);
00450 }
00451 template <> inline void do_val_vec_mul<float>(const unsigned long sz,
00452 float* RESTRICT const res, const float* RESTRICT const v1,
00453 LCTYPED(float) _f2)
00454 {
00455 do_vec_val_mul<float>(sz, res, v1, _f2);
00456 }
00457
00459
/**
 * DIV2RV_SIMD: one SIMD step of r = f2 / v1 with the broadcast constant f2
 * (f1 unused).  Instantiated as do_val_vec_div.
 */
00460 #define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
00461 TMP = _mm_load##UNA1##_##SUF(v1); \
00462 TMP = _mm_div_##SUF(f2, TMP); \
00463 _MM_STORE(r, TMP, SUF,)
00464 VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, sd, pd,
00465 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00466 2, double, __m128d)
00467 VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, ss, ps,
00468 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00469 4, float, __m128)
00470
00471
00472
00473
00474
00476
/**
 * ADD1NV_SIMD: one SIMD step of the in-place update r = r + f2 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_vec_add_val.
 */
00477 #define ADD1NV_SIMD(r,f1,f2,SUF) \
00478 TMP = _mm_load_##SUF(r); \
00479 TMP = _mm_add_##SUF(TMP, f2); \
00480 _MM_STORE(r, TMP, SUF,)
00481 VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, sd, pd,
00482 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00483 2, double, __m128d)
00484 VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, ss, ps,
00485 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00486 4, float, __m128)
00487
00489
/**
 * SUB1NV_SIMD: one SIMD step of the in-place update r = r - f2 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_vec_sub_val.
 */
00490 #define SUB1NV_SIMD(r,f1,f2,SUF) \
00491 TMP = _mm_load_##SUF(r); \
00492 TMP = _mm_sub_##SUF(TMP, f2); \
00493 _MM_STORE(r, TMP, SUF,)
00494 VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, sd, pd,
00495 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00496 2, double, __m128d)
00497 VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, ss, ps,
00498 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00499 4, float, __m128)
00500
00502
/**
 * SUB1RV_SIMD: one SIMD step of the reversed in-place update r = f2 - r with
 * the broadcast constant f2 (f1 unused).  Instantiated as do_val_sub_vec.
 */
00503 #define SUB1RV_SIMD(r,f1,f2,SUF) \
00504 TMP = _mm_load_##SUF(r); \
00505 TMP = _mm_sub_##SUF(f2, TMP); \
00506 _MM_STORE(r, TMP, SUF,)
00507 VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, sd, pd,
00508 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00509 2, double, __m128d)
00510 VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, ss, ps,
00511 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00512 4, float, __m128)
00513
00515
/**
 * MUL1NV_SIMD: one SIMD step of the in-place scaling r = r * f2 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_vec_mul_val.
 */
00516 #define MUL1NV_SIMD(r,f1,f2,SUF) \
00517 TMP = _mm_load_##SUF(r); \
00518 TMP = _mm_mul_##SUF(TMP, f2); \
00519 _MM_STORE(r, TMP, SUF,)
00520 VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, sd, pd,
00521 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00522 2, double, __m128d)
00523 VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, ss, ps,
00524 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00525 4, float, __m128)
00526
00528
/**
 * DIV1NV_SIMD: one SIMD step of the in-place division r = r / f2 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_vec_div_val.
 */
00529 #define DIV1NV_SIMD(r,f1,f2,SUF) \
00530 TMP = _mm_load_##SUF(r); \
00531 TMP = _mm_div_##SUF(TMP, f2); \
00532 _MM_STORE(r, TMP, SUF,)
00533 VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, sd, pd,
00534 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00535 2, double, __m128d)
00536 VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, ss, ps,
00537 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00538 4, float, __m128)
00539
00541
/**
 * DIV1RV_SIMD: one SIMD step of the reversed in-place division r = f2 / r
 * with the broadcast constant f2 (f1 unused).  Instantiated as
 * do_val_div_vec.
 */
00542 #define DIV1RV_SIMD(r,f1,f2,SUF) \
00543 TMP = _mm_load_##SUF(r); \
00544 TMP = _mm_div_##SUF(f2, TMP); \
00545 _MM_STORE(r, TMP, SUF,)
00546 VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, sd, pd,
00547 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00548 2, double, __m128d)
00549 VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, ss, ps,
00550 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00551 4, float, __m128)
00552
00554
00555 template <> inline void do_val_add_vec<double>(const unsigned long sz,
00556 double* RESTRICT const res, LCTYPED(double) _f2)
00557 {
00558 do_vec_add_val<double>(sz, res, _f2);
00559 }
00560 template <> inline void do_val_add_vec<float>(const unsigned long sz,
00561 float* RESTRICT const res, LCTYPED(float) _f2)
00562 {
00563 do_vec_add_val<float>(sz, res, _f2);
00564 }
00565
00566
00567
00568
00569
00570
00571
00572
00574
/**
 * ADD2NS_SIMD: one SIMD step of the in-place update r = r + f2 * v1 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_vec_add_svc.
 */
00575 #define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
00576 LD = _mm_load##UNA1##_##SUF(v1); \
00577 TMP = _mm_load_##SUF(r); \
00578 LD = _mm_mul_##SUF(LD, f2); \
00579 TMP = _mm_add_##SUF(TMP, LD); \
00580 _MM_STORE(r, TMP, SUF,)
00581 VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, sd, pd,
00582 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00583 2, double, __m128d)
00584 VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, ss, ps,
00585 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00586 4, float, __m128)
00587
00589
/**
 * SUB2NS_SIMD: one SIMD step of the in-place update r = r - f2 * v1 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_vec_sub_svc.
 */
00590 #define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
00591 LD = _mm_load##UNA1##_##SUF(v1); \
00592 TMP = _mm_load_##SUF(r); \
00593 LD = _mm_mul_##SUF(LD, f2); \
00594 TMP = _mm_sub_##SUF(TMP, LD); \
00595 _MM_STORE(r, TMP, SUF,)
00596 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, sd, pd,
00597 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00598 2, double, __m128d)
00599 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, ss, ps,
00600 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00601 4, float, __m128)
00602
00604
/**
 * SUB2RS_SIMD: one SIMD step of the reversed in-place update
 * r = f2 * v1 - r with the broadcast constant f2 (f1 unused).
 * Instantiated as do_vec_sub_svc_inv.
 */
00605 #define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
00606 LD = _mm_load##UNA1##_##SUF(v1); \
00607 TMP = _mm_load_##SUF(r); \
00608 LD = _mm_mul_##SUF(LD, f2); \
00609 LD = _mm_sub_##SUF(LD, TMP); \
00610 _MM_STORE(r, LD, SUF,)
00611 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, sd, pd,
00612 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00613 2, double, __m128d)
00614 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, ss, ps,
00615 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00616 4, float, __m128)
00617
00619
/**
 * ADD3NS_SIMD: one SIMD step of r = v1 + f2 * v2 with the broadcast constant
 * f2 (f1 unused).  Instantiated as do_vec_svc_add.
 */
00620 #define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
00621 LD = _mm_load##UNA2##_##SUF(v2); \
00622 TMP = _mm_load##UNA1##_##SUF(v1); \
00623 LD = _mm_mul_##SUF(LD, f2); \
00624 TMP = _mm_add_##SUF(TMP, LD); \
00625 _MM_STORE(r, TMP, SUF,)
00626 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, sd, pd,
00627 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00628 2, double, __m128d)
00629 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, ss, ps,
00630 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00631 4, float, __m128)
00632
00634
/**
 * SUB3NS_SIMD: one SIMD step of r = v1 - f2 * v2 with the broadcast constant
 * f2 (f1 unused).  Instantiated as do_vec_svc_sub.
 */
00635 #define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
00636 LD = _mm_load##UNA2##_##SUF(v2); \
00637 TMP = _mm_load##UNA1##_##SUF(v1); \
00638 LD = _mm_mul_##SUF(LD, f2); \
00639 TMP = _mm_sub_##SUF(TMP, LD); \
00640 _MM_STORE(r, TMP, SUF,)
00641 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, sd, pd,
00642 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00643 2, double, __m128d)
00644 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, ss, ps,
00645 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00646 4, float, __m128)
00647
00648
00650
/**
 * ADD3SN_SIMD: one SIMD step of r = f2 * v1 + v2 with the broadcast constant
 * f2 (f1 unused).  Instantiated as do_svc_vec_add.
 */
00651 #define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
00652 LD = _mm_load##UNA1##_##SUF(v1); \
00653 TMP = _mm_load##UNA2##_##SUF(v2); \
00654 LD = _mm_mul_##SUF(LD, f2); \
00655 TMP = _mm_add_##SUF(TMP, LD); \
00656 _MM_STORE(r, TMP, SUF,)
00657 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, sd, pd,
00658 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00659 2, double, __m128d)
00660 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, ss, ps,
00661 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00662 4, float, __m128)
00663
00665
/**
 * SUB3SN_SIMD: one SIMD step of r = f2 * v1 - v2 with the broadcast constant
 * f2 (f1 unused).  Instantiated as do_svc_vec_sub.
 */
00666 #define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
00667 LD = _mm_load##UNA1##_##SUF(v1); \
00668 TMP = _mm_load##UNA2##_##SUF(v2); \
00669 LD = _mm_mul_##SUF(LD, f2); \
00670 LD = _mm_sub_##SUF(LD, TMP); \
00671 _MM_STORE(r, LD, SUF,)
00672 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, sd, pd,
00673 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00674 2, double, __m128d)
00675 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, ss, ps,
00676 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00677 4, float, __m128)
00678
00679
00681
/**
 * ADD3SS_SIMD: one SIMD step of r = f1 * v1 + f2 * v2, where f1 and f2 are
 * the broadcast constants declared by SIMD_2CONST_*_PREP.
 * Instantiated as do_svc_svc_add.
 */
00682 #define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
00683 LD = _mm_load##UNA1##_##SUF(v1); \
00684 TMP = _mm_load##UNA2##_##SUF(v2); \
00685 LD = _mm_mul_##SUF(LD, f1); \
00686 TMP = _mm_mul_##SUF(TMP, f2); \
00687 LD = _mm_add_##SUF(LD, TMP); \
00688 _MM_STORE(r, LD, SUF,)
00689 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, sd, pd,
00690 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00691 2, double, __m128d)
00692 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, ss, ps,
00693 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00694 4, float, __m128)
00695
00697
/**
 * SUB3SS_SIMD: one SIMD step of r = f1 * v1 - f2 * v2 with the two broadcast
 * constants f1 and f2.  Instantiated as do_svc_svc_sub.
 */
00698 #define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
00699 LD = _mm_load##UNA1##_##SUF(v1); \
00700 TMP = _mm_load##UNA2##_##SUF(v2); \
00701 LD = _mm_mul_##SUF(LD, f1); \
00702 TMP = _mm_mul_##SUF(TMP, f2); \
00703 LD = _mm_sub_##SUF(LD, TMP); \
00704 _MM_STORE(r, LD, SUF,)
00705 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, sd, pd,
00706 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00707 2, double, __m128d)
00708 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, ss, ps,
00709 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00710 4, float, __m128)
00711
00712
00714
/**
 * ADD2SN_SIMD: one SIMD step of the in-place update r = f2 * r + v1 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_svc_add_vec.
 */
00715 #define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
00716 LD = _mm_load_##SUF(r); \
00717 TMP = _mm_load##UNA1##_##SUF(v1); \
00718 LD = _mm_mul_##SUF(LD, f2); \
00719 TMP = _mm_add_##SUF(TMP, LD); \
00720 _MM_STORE(r, TMP, SUF,)
00721 VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, sd, pd,
00722 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00723 2, double, __m128d)
00724 VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, ss, ps,
00725 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00726 4, float, __m128)
00727
00729
/**
 * SUB2SN_SIMD: one SIMD step of the in-place update r = f2 * r - v1 with the
 * broadcast constant f2 (f1 unused).  Instantiated as do_svc_sub_vec.
 */
00730 #define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
00731 LD = _mm_load_##SUF(r); \
00732 TMP = _mm_load##UNA1##_##SUF(v1); \
00733 LD = _mm_mul_##SUF(LD, f2); \
00734 LD = _mm_sub_##SUF(LD, TMP); \
00735 _MM_STORE(r, LD, SUF,)
00736 VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, sd, pd,
00737 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00738 2, double, __m128d)
00739 VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, ss, ps,
00740 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
00741 4, float, __m128)
00742
00744
/**
 * ADD2SS_SIMD: one SIMD step of the in-place update r = f1 * r + f2 * v1
 * with the two broadcast constants f1 and f2.
 * Instantiated as do_svc_add_svc.
 */
00745 #define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
00746 LD = _mm_load_##SUF(r); \
00747 TMP = _mm_load##UNA1##_##SUF(v1); \
00748 LD = _mm_mul_##SUF(LD, f1); \
00749 TMP = _mm_mul_##SUF(TMP, f2); \
00750 LD = _mm_add_##SUF(LD, TMP); \
00751 _MM_STORE(r, LD, SUF,)
00752 VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, sd, pd,
00753 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00754 2, double, __m128d)
00755 VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, ss, ps,
00756 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00757 4, float, __m128)
00758
00760
/**
 * SUB2SS_SIMD: one SIMD step of the in-place update r = f1 * r - f2 * v1
 * with the two broadcast constants f1 and f2.
 * Instantiated as do_svc_sub_svc.
 */
00761 #define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
00762 LD = _mm_load_##SUF(r); \
00763 TMP = _mm_load##UNA1##_##SUF(v1); \
00764 LD = _mm_mul_##SUF(LD, f1); \
00765 TMP = _mm_mul_##SUF(TMP, f2); \
00766 LD = _mm_sub_##SUF(LD, TMP); \
00767 _MM_STORE(r, LD, SUF,)
00768 VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, sd, pd,
00769 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00770 2, double, __m128d)
00771 VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, ss, ps,
00772 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00773 4, float, __m128)
00774
00775
00777
/**
 * ADD2SV_SIMD: one SIMD step of r = f1 * v1 + f2 with the two broadcast
 * constants f1 and f2.  Instantiated as do_svc_val_add.
 */
00778 #define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
00779 TMP = _mm_load##UNA1##_##SUF(v1); \
00780 TMP = _mm_mul_##SUF(TMP, f1); \
00781 TMP = _mm_add_##SUF(TMP, f2); \
00782 _MM_STORE(r, TMP, SUF,)
00783 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, sd, pd,
00784 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00785 2, double, __m128d)
00786 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, ss, ps,
00787 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00788 4, float, __m128)
00789
00791
/**
 * SUB2SV_SIMD: one SIMD step of r = f1 * v1 - f2 with the two broadcast
 * constants f1 and f2.  Instantiated as do_svc_val_sub.
 */
00792 #define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
00793 TMP = _mm_load##UNA1##_##SUF(v1); \
00794 TMP = _mm_mul_##SUF(TMP, f1); \
00795 TMP = _mm_sub_##SUF(TMP, f2); \
00796 _MM_STORE(r, TMP, SUF,)
00797 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, sd, pd,
00798 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00799 2, double, __m128d)
00800 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, ss, ps,
00801 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00802 4, float, __m128)
00803
00804
00806
/**
 * ADD1SV_SIMD: one SIMD step of the in-place update r = f1 * r + f2 with the
 * two broadcast constants f1 and f2.  Instantiated as do_svc_add_val.
 */
00807 #define ADD1SV_SIMD(r,f1,f2,SUF) \
00808 TMP = _mm_load_##SUF(r); \
00809 TMP = _mm_mul_##SUF(TMP, f1); \
00810 TMP = _mm_add_##SUF(TMP, f2); \
00811 _MM_STORE(r, TMP, SUF,)
00812 VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, sd, pd,
00813 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00814 2, double, __m128d)
00815 VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, ss, ps,
00816 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00817 4, float, __m128)
00818
00820
/**
 * SUB1SV_SIMD: one SIMD step of the in-place update r = f1 * r - f2 with the
 * two broadcast constants f1 and f2.  Instantiated as do_svc_sub_val.
 */
00821 #define SUB1SV_SIMD(r,f1,f2,SUF) \
00822 TMP = _mm_load_##SUF(r); \
00823 TMP = _mm_mul_##SUF(TMP, f1); \
00824 TMP = _mm_sub_##SUF(TMP, f2); \
00825 _MM_STORE(r, TMP, SUF,)
00826 VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, sd, pd,
00827 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00828 2, double, __m128d)
00829 VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, ss, ps,
00830 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00831 4, float, __m128)
00832
00833
00835
00836 template <> inline void do_val_svc_add<double>(const unsigned long sz,
00837 double* RESTRICT const res, const double* RESTRICT const v1,
00838 LCTYPED(double) f1, LCTYPED(double) f2)
00839 {
00840 do_svc_val_add<double>(sz, res, v1, f2, f1);
00841 }
00842 template <> inline void do_val_svc_add<float>(const unsigned long sz,
00843 float* RESTRICT const res, const float* RESTRICT const v1,
00844 LCTYPED(float) f1, LCTYPED(float) f2)
00845 {
00846 do_svc_val_add<float>(sz, res, v1, f2, f1);
00847 }
00848
00850
/**
 * SUB2VS_SIMD: one SIMD step of r = f1 - f2 * v1 with the two broadcast
 * constants f1 and f2.  Instantiated as do_val_svc_sub.
 */
00851 #define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
00852 TMP = _mm_load##UNA1##_##SUF(v1); \
00853 TMP = _mm_mul_##SUF(TMP, f2); \
00854 TMP = _mm_sub_##SUF(f1, TMP); \
00855 _MM_STORE(r, TMP, SUF,)
00856 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, sd, pd,
00857 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00858 2, double, __m128d)
00859 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, ss, ps,
00860 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00861 4, float, __m128)
00862
00864
/**
 * DIV2VS_SIMD: one SIMD step of r = f1 / (f2 * v1) with the two broadcast
 * constants f1 and f2.  Instantiated as do_val_svc_div.
 */
00865 #define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
00866 TMP = _mm_load##UNA1##_##SUF(v1); \
00867 TMP = _mm_mul_##SUF(TMP, f2); \
00868 TMP = _mm_div_##SUF(f1, TMP); \
00869 _MM_STORE(r, TMP, SUF,)
00870 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, sd, pd,
00871 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00872 2, double, __m128d)
00873 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, ss, ps,
00874 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
00875 4, float, __m128)
00876
00877
00878
00879
00880
00881
/**
 * Setup snippets for the negation kernels: each declares a function-local
 * static union `negmask`, initialised through its integer member with only
 * the most significant bit of every floating-point element set, and loads it
 * into a local SSE register named `neg`.
 *
 * NEG_DOUBLE_PREP has two spellings.  With HAVE_LONG_LONG the mask is given
 * as two 64-bit words; otherwise as four 32-bit words with the set bit in
 * the second word of each pair, i.e. the high half of each double on a
 * little-endian layout.
 *
 * NOTE(review): the mask is written through the integer member and read
 * through the floating-point member; this relies on the compiler tolerating
 * union type punning -- confirm for any new toolchain.
 */
00882 #ifdef HAVE_LONG_LONG
00883 #define NEG_DOUBLE_PREP \
00884 static union _negmask { \
00885 unsigned LONG_LONG lng[2]; \
00886 double dbl[2]; \
00887 __m128d m128d; \
00888 } ALIGN(16) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
00889 __m128d neg = _mm_load_pd(negmask.dbl)
00890 #else
00891 #define NEG_DOUBLE_PREP \
00892 static union _negmask { \
00893 unsigned int lng[4]; \
00894 double dbl[2]; \
00895 __m128d m128d; \
00896 } ALIGN(16) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
00897 __m128d neg = _mm_load_pd(negmask.dbl)
00898 #endif
00899 #define NEG_FLOAT_PREP \
00900 static union _negmask { \
00901 unsigned int itg[4]; \
00902 float flt[4]; \
00903 __m128 m128s; \
00904 } ALIGN(16) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
00905 __m128 neg = _mm_load_ps(negmask.flt)
00906
00907
/**
 * Aliases that map the names `_mm_xor_sd` / `_mm_xor_ss` onto the packed XOR
 * intrinsics.  The negation kernels build the intrinsic name by pasting SUF
 * onto `_mm_xor_`.
 * NOTE(review): presumably needed because the generators also expand the
 * kernels with the scalar suffixes sd/ss -- confirm.
 */
00908 #define _mm_xor_sd _mm_xor_pd
00909 #define _mm_xor_ss _mm_xor_ps
00910
00912
/**
 * NEG2_SIMD: one SIMD step of r = -v1, implemented by XOR-ing the loaded
 * value with the sign-bit mask `neg` set up by NEG_*_PREP (f1/f2 unused).
 * Instantiated as do_vec_neg_vec for double and float.
 */
00913 #define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
00914 TMP = _mm_load##UNA1##_##SUF(v1); \
00915 TMP = _mm_xor_##SUF(TMP, neg); \
00916 _MM_STORE(r, TMP, SUF,)
00917 VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, sd, pd,
00918 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
00919 2, double, __m128d)
00920 VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, ss, ps,
00921 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
00922 4, float, __m128)
00923
00925
/**
 * NEG1_SIMD: one SIMD step of the in-place negation r = -r, implemented by
 * XOR-ing with the sign-bit mask `neg` set up by NEG_*_PREP (f1/f2 unused).
 * Instantiated as do_vec_neg for double and float.
 */
00926 #define NEG1_SIMD(r,f1,f2,SUF) \
00927 TMP = _mm_load_##SUF(r); \
00928 TMP = _mm_xor_##SUF(TMP, neg); \
00929 _MM_STORE(r, TMP, SUF,)
00930 VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, sd, pd,
00931 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
00932 2, double, __m128d)
00933 VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, ss, ps,
00934 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
00935 4, float, __m128)
00936
00937
00938
00940
00941
00942
00943
00944
00945
/**
 * DECL_DOUBLE / DECL_FLOAT declare an extra SSE temporary named TM2; they are
 * passed as the setup snippet of the multiply-accumulate kernel below.
 *
 * The two `_mm_loadu_s*` defines alias those names to the plain scalar loads.
 * NOTE(review): presumably so that a load name pasted together from `u` and a
 * scalar suffix resolves to an existing intrinsic -- confirm.
 */
00946 #define DECL_DOUBLE __m128d TM2
00947 #define DECL_FLOAT __m128 TM2
00948
00949 #define _mm_loadu_sd _mm_load_sd
00950 #define _mm_loadu_ss _mm_load_ss
00951
00952
/**
 * SUMMULT3_SIMD: one SIMD step of the multiply-accumulate r = r + v1 * v2.
 * r is read with the plain `_mm_load_<SUF>` into TM2, the temporary declared
 * by DECL_DOUBLE / DECL_FLOAT; f1/f2 are unused.
 * Instantiated as do_add_vec_vec_mul for double and float.
 */
00953 #define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
00954 TMP = _mm_load##UNA1##_##SUF(v1); \
00955 LD = _mm_load##UNA2##_##SUF(v2); \
00956 TM2 = _mm_load_##SUF(r); \
00957 TMP = _mm_mul_##SUF(TMP, LD); \
00958 TM2 = _mm_add_##SUF(TM2, TMP); \
00959 _MM_STORE(r, TM2, SUF,)
00960
00961
00962
00963
00964 VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
00965 DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
00966 2, double, __m128d)
00967 VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
00968 DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
00969 4, float, __m128)
00970
00971
00972 template <> inline void do_add_vec_vec_cmul<double>(const unsigned long sz,
00973 double* RESTRICT const r, const double* RESTRICT const v1,
00974 const double* RESTRICT const v2)
00975 {
00976 do_add_vec_vec_mul<double>(sz, r, v1, v2);
00977 }
00978 template <> inline void do_add_vec_vec_cmul<float>(const unsigned long sz,
00979 float* RESTRICT const r, const float* RESTRICT const v1,
00980 const float* RESTRICT const v2)
00981 {
00982 do_add_vec_vec_mul<float>(sz, r, v1, v2);
00983 }
00984
00985
01002 #ifdef TBCI_SIMD_SUM
01003
01004 #if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
01005 # warning Info: Using unrolled SSE2 vector kernels for sums (reductions)
01006 #endif
01007
/**
 * Setup snippets for the reduction kernels: declare the accumulator register
 * `f2` with x in its lowest element and zero in the remaining elements, so
 * the initial value is counted exactly once when the lanes are summed.
 *
 * The former `register` storage class was dropped: it is only a hint,
 * deprecated since C++11 and ill-formed from C++17 on.
 */
#define SUM_DOUBLE_PREP(x) __m128d f2 = _mm_set_sd(x)
#define SUM_FLOAT_PREP(x) __m128 f2 = _mm_set_ss(x)
01010
/**
 * Finalisation snippets for the reduction kernels: add up the partial sums
 * held in the elements of `f2` so that its lowest element holds the total.
 *
 * With __SSE3__ the horizontal-add intrinsics are used (once for two
 * doubles, twice for four floats).  Otherwise a scratch register DUM is
 * declared and the elements are brought together with unpack/shuffle
 * followed by ordinary adds; for floats, the first shuffle (0xb1) swaps
 * neighbouring elements and the second (0x1b) reverses the element order.
 */
01016 #ifdef __SSE3__
01017 # define SUM_DOUBLE_SIMD_FIN \
01018 f2 = _mm_hadd_pd(f2, f2)
01019 # define SUM_FLOAT_SIMD_FIN \
01020 f2 = _mm_hadd_ps(f2, f2); \
01021 f2 = _mm_hadd_ps(f2, f2)
01022 #else // __SSE3__
01023 # define SUM_DOUBLE_SIMD_FIN \
01024 __m128d DUM = f2; \
01025 DUM = _mm_unpackhi_pd(DUM, f2); \
01026 f2 = _mm_add_sd(f2, DUM)
01027 # define SUM_FLOAT_SIMD_FIN \
01028 __m128 DUM = f2; \
01029 DUM = _mm_shuffle_ps(DUM, f2, 0xb1); \
01030 f2 = _mm_add_ps(f2, DUM); \
01031 DUM = f2; \
01032 DUM = _mm_shuffle_ps(DUM, f2, 0x1b); \
01033 f2 = _mm_add_ss(f2, DUM)
01034 #endif // __SSE3__
01035
/**
 * Write-back snippets for the reduction kernels: store the lowest element of
 * the accumulator register `f2` into the scalar lvalue x.
 *
 * The argument is parenthesised before its address is taken so that any
 * lvalue expression (not just a plain identifier) can be passed safely.
 */
#define SUM_DOUBLE_FINAL(x) \
	_mm_store_sd(&(x), f2)
#define SUM_FLOAT_FINAL(x) \
	_mm_store_ss(&(x), f2)
01040
01041
01043
/**
 * MULT2_SIMD: one SIMD step of the dot-product accumulation
 * f2 = f2 + r * v1.  Here r is the first input vector (read with the plain
 * `_mm_load_<SUF>`), v1 the second; nothing is stored to memory.  f1 is
 * unused.
 * Instantiated as do_vec_mult for double and float, using the SUM_*_PREP,
 * SUM_*_SIMD_FIN and SUM_*_FINAL snippets for setup, lane reduction and
 * write-back.
 */
01044 #define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
01045 TMP = _mm_load_##SUF(r); \
01046 LD = _mm_load##UNA1##_##SUF(v1); \
01047 TMP = _mm_mul_##SUF(TMP, LD); \
01048 f2 = _mm_add_##SUF(f2, TMP)
01049
01050 VKERN_TEMPL_2V_T_SIMD(do_vec_mult, MULT2_SIMD, sd, pd,
01051 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
01052 2, double, __m128d)
01053 VKERN_TEMPL_2V_T_SIMD(do_vec_mult, MULT2_SIMD, ss, ps,
01054 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
01055 4, float, __m128)
01056
01057 template <> inline void do_vec_dot<double>(const unsigned long sz,
01058 const double * RESTRICT const _v1, const double * RESTRICT const _v2,
01059 double& _f2)
01060 {
01061 do_vec_mult<double>(sz, _v1, _v2, _f2);
01062 }
01063 template <> inline void do_vec_dot<float>(const unsigned long sz,
01064 const float * RESTRICT const _v1, const float * RESTRICT const _v2,
01065 float& _f2)
01066 {
01067 do_vec_mult<float>(sz, _v1, _v2, _f2);
01068 }
01069
/**
 * Instantiates do_vec_mult_unaligned from the VKERN_TEMPL_2V_T generator with
 * the MULT2 kernel and the type argument T.  Neither the generator nor MULT2
 * is defined in this part of the file.
 * NOTE(review): presumably this is the generic (non-SSE) multiply-sum used
 * when the operands are not suitably aligned -- confirm.
 */
01071 VKERN_TEMPL_2V_T(do_vec_mult_unaligned, MULT2, T)
01072
01073
01074
/**
 * SQR1_SIMD: one SIMD step of the sum-of-squares accumulation
 * f2 = f2 + r * r.  r is read with the plain `_mm_load_<SUF>`; nothing is
 * stored to memory and f1 is unused.
 *
 * The macro ends without a semicolon, like the other reduction kernels
 * (SUM1_SIMD, MULT2_SIMD); the expansion site supplies the terminating `;`.
 */
#define SQR1_SIMD(r,f1,f2,SUF) \
	TMP = _mm_load_##SUF(r); \
	TMP = _mm_mul_##SUF(TMP, TMP); \
	f2 = _mm_add_##SUF(f2, TMP)
01079
/**
 * Instantiates do_vec_sumsqr for double and float from the SQR1_SIMD kernel,
 * using the SUM_*_PREP, SUM_*_SIMD_FIN and SUM_*_FINAL snippets for
 * accumulator setup, lane reduction and write-back.
 * VKERN_TEMPL_1V_T_SIMD itself is defined elsewhere.
 */
01080 VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr, SQR1_SIMD, sd, pd,
01081 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
01082 2, double, __m128d)
01083 VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr, SQR1_SIMD, ss, ps,
01084 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
01085 4, float, __m128)
01086
01087 #ifndef TBCI_NO_SIMD_FABSSQR
01088 template <> inline void do_vec_fabssqr<double>(const unsigned long sz,
01089 const double * const _v1, LONG_DOUBLE& _f2)
01090 {
01091 double F2 = _f2;
01092 do_vec_sumsqr<double>(sz, _v1, F2);
01093 _f2 = F2;
01094 }
01095 #endif // TBCI_NO_SIMD_FABSSQR
01096 #ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
01097 template <> inline void do_vec_fabssqr<float>(const unsigned long sz,
01098 const float * const _v1, LONG_DOUBLE& _f2)
01099 {
01100 float F2 = _f2;
01101 do_vec_sumsqr<float>(sz, _v1, F2);
01102 _f2 = F2;
01103 }
01104 #endif // TBCI_SIMD_FABSSQR_FLOAT
01105
01107
/**
 * SUM1_SIMD: one SIMD step of the plain sum accumulation f2 = f2 + r.
 * r is read with the plain `_mm_load_<SUF>`; nothing is stored to memory and
 * f1 is unused.
 * Instantiated as do_vec_sum for double and float, using the SUM_*_PREP,
 * SUM_*_SIMD_FIN and SUM_*_FINAL snippets.
 */
01108 #define SUM1_SIMD(r,f1,f2,SUF) \
01109 TMP = _mm_load_##SUF(r); \
01110 f2 = _mm_add_##SUF(f2, TMP)
01111 VKERN_TEMPL_1V_T_SIMD(do_vec_sum, SUM1_SIMD, sd, pd,
01112 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
01113 2, double, __m128d)
01114 VKERN_TEMPL_1V_T_SIMD(do_vec_sum, SUM1_SIMD, ss, ps,
01115 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
01116 4, float, __m128)
01117
01118 #endif // TBCI_SIMD_SUM
01119
01120 NAMESPACE_END
01121
01122 #endif // TBCI_SELECTIVE_INST
01123
01124 #endif // __SSE2__
01125
01126 #endif // H_VEC_KERN_SPECIAL_H