00001
00008 #ifndef TBCI_UNROLL_PREFETCH_DEF2_H
00009 #define TBCI_UNROLL_PREFETCH_DEF2_H
00010
00011
00012
/**
 * Type spelling for a loop-constant operand of element type T.
 * Both macros prepend the `register` storage-class specifier to
 * tbci_traits<T>::loop_const_refval_type.  LCTYPE(T) adds `typename` and is
 * used in this file for parameters of the generated function templates;
 * LCTYPED(T) omits `typename` and is used inside the INST() declarations.
 */
00014 #define LCTYPE(T) register typename tbci_traits<T>::loop_const_refval_type
00015 #define LCTYPED(T) register tbci_traits<T>::loop_const_refval_type
00016
/**
 * Unroll factor used by the kernel loops.  Defaults to 4 when the including
 * code has not defined it; the selection block later in this file accepts
 * only the values 1, 2, 4 and 8.
 */
00042 #ifndef UNROLL_DEPTH
00043 # define UNROLL_DEPTH 4
00044 #endif
00045
00046
00047
00048
00049
00050
/**
 * One step of a three-array kernel (res, v1, v2) with prefetching.
 * Decrements i, applies OPER(res[0], v1[0], v2[0], f1, f2), issues a read
 * prefetch for v1 and v2 and a write prefetch for res at PREF_OFFS(T)
 * elements past the current position, and advances each pointer by one.
 *   OPER        element operation macro taking (res, v1, v2, f1, f2)
 *   T           element type; only forwarded to PREF_OFFS()
 *   CA0/CA1/CA2 second argument passed to the prefetch macro for res/v1/v2
 * Expects i, v1, v2, res, f1 and f2 to be visible where the macro expands.
 * Expands to a bare statement list whose last statement has no semicolon.
 */
00052 #define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
00053 --i; \
00054 OPER(res[0], v1[0], v2[0], f1, f2); \
00055 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00056 ++v1; \
00057 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00058 ++v2; \
00059 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00060 ++res
00061
/**
 * One step of the three-array kernel without prefetching: decrements i,
 * applies OPER(res[0], v1[0], v2[0], f1, f2) and advances v1, v2 and res by
 * one element.  Uses i, v1, v2, res, f1 and f2 from the expansion site; the
 * caller supplies the final semicolon.
 */
00063 #define UNROLL1_KERNEL5(OPER) \
00064 --i; \
00065 OPER(res[0], v1[0], v2[0], f1, f2); \
00066 ++v1; ++v2; ++res
00067
/** Hooks placed before and after the depth-1 three-array loop; both expand to an empty statement. */
00068 #define UNROLL1_KERNEL5_PREPARE do {} while(0)
00069 #define UNROLL1_KERNEL5_FIXUP do {} while(0)
00070
00071
/**
 * Two steps of the three-array kernel (res, v1, v2) with prefetching.
 * Applies OPER to elements 0 and 1, subtracts 2 from i and advances all three
 * pointers by 2.  Prefetch density depends on EL_PER_CL(T):
 *   <= 1 : two prefetches per array (offsets PREF_OFFS(T) and PREF_OFFS(T)+1)
 *   else : one prefetch per array (offset PREF_OFFS(T))
 * EL_PER_CL(T) is presumably the number of T elements per cache line --
 * confirm against its definition.
 * Arguments are as for UNROLL1_PREF_KERNEL5.  The expansion is a complete
 * if/else statement, so a semicolon written after the macro is an extra empty
 * statement.  The trailing backslash on the last line continues the
 * definition onto the following line.
 */
00073 #define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
00074 if (EL_PER_CL(T) <= 1) { \
00075 OPER(res[0], v1[0], v2[0], f1, f2); \
00076 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00077 PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
00078 i -= 2; \
00079 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00080 PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
00081 OPER(res[1], v1[1], v2[1], f1, f2); \
00082 v1 += 2; v2 += 2; \
00083 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00084 PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
00085 res += 2; \
00086 } else { \
00087 OPER(res[0], v1[0], v2[0], f1, f2); \
00088 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00089 i -= 2; \
00090 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00091 OPER(res[1], v1[1], v2[1], f1, f2); \
00092 v1 += 2; v2 += 2; \
00093 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00094 res += 2; \
00095 } \
00096
00097
/**
 * Two steps of the three-array kernel without prefetching: applies OPER to
 * elements 0 and 1, subtracts 2 from i and advances v1, v2 and res by 2.
 * Uses i, v1, v2, res, f1 and f2 from the expansion site.
 */
00099 #define UNROLL2_KERNEL5(OPER) \
00100 OPER(res[0], v1[0], v2[0], f1, f2); \
00101 i -= 2; \
00102 OPER(res[1], v1[1], v2[1], f1, f2); \
00103 v1 += 2; v2 += 2; res += 2
00104
/** Hooks placed before and after the depth-2 three-array loop; both expand to an empty statement. */
00105 #define UNROLL2_KERNEL5_PREPARE do {} while(0)
00106 #define UNROLL2_KERNEL5_FIXUP do {} while(0)
00107
00108
/**
 * Four steps of the three-array kernel (res, v1, v2) with prefetching.
 * Applies OPER to elements 0..3, subtracts 4 from i and advances all three
 * pointers by 4.  Prefetches per array, relative to PREF_OFFS(T):
 *   EL_PER_CL(T) <= 1 : offsets +0, +1, +2, +3
 *   EL_PER_CL(T) <= 2 : offsets +0, +2
 *   otherwise         : offset  +0
 * v1/v2 use PREFETCH_R with CA1/CA2, res uses PREFETCH_W with CA0.
 * Prefetches are interleaved with the OPER calls; keep the statement order.
 * Expands to a complete if/else-if/else statement.
 */
00110 #define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
00111 if (EL_PER_CL(T) <= 1) { \
00112 OPER(res[0], v1[0], v2[0], f1, f2); \
00113 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00114 PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
00115 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00116 PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
00117 OPER(res[1], v1[1], v2[1], f1, f2); \
00118 i -= 4; \
00119 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00120 PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
00121 OPER(res[2], v1[2], v2[2], f1, f2); \
00122 PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
00123 PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
00124 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00125 PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
00126 OPER(res[3], v1[3], v2[3], f1, f2); \
00127 v1 += 4; v2 += 4; \
00128 PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
00129 PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
00130 res += 4; \
00131 } else if (EL_PER_CL(T) <= 2) { \
00132 OPER(res[0], v1[0], v2[0], f1, f2); \
00133 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00134 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00135 OPER(res[1], v1[1], v2[1], f1, f2); \
00136 i -= 4; \
00137 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00138 PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
00139 OPER(res[2], v1[2], v2[2], f1, f2); \
00140 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00141 PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
00142 OPER(res[3], v1[3], v2[3], f1, f2); \
00143 v1 += 4; v2 += 4; \
00144 res += 4; \
00145 } else { \
00146 OPER(res[0], v1[0], v2[0], f1, f2); \
00147 i -= 4; \
00148 OPER(res[1], v1[1], v2[1], f1, f2); \
00149 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00150 OPER(res[2], v1[2], v2[2], f1, f2); \
00151 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00152 OPER(res[3], v1[3], v2[3], f1, f2); \
00153 v1 += 4; v2 += 4; \
00154 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00155 res += 4; \
00156 }
00157
/**
 * Four steps of the three-array kernel without prefetching: applies OPER to
 * elements 0..3, subtracts 4 from i and advances v1, v2 and res by 4.
 * Uses i, v1, v2, res, f1 and f2 from the expansion site.
 */
00159 #define UNROLL4_KERNEL5(OPER) \
00160 OPER(res[0], v1[0], v2[0], f1, f2); \
00161 OPER(res[1], v1[1], v2[1], f1, f2); \
00162 i -= 4; \
00163 OPER(res[2], v1[2], v2[2], f1, f2); \
00164 OPER(res[3], v1[3], v2[3], f1, f2); \
00165 v1 += 4; v2 += 4; \
00166 res += 4
00167
/** Hooks placed before and after the depth-4 three-array loop; both expand to an empty statement. */
00168 #define UNROLL4_KERNEL5_PREPARE do {} while(0)
00169 #define UNROLL4_KERNEL5_FIXUP do {} while(0)
00170
00171
/**
 * Eight steps of the three-array kernel (res, v1, v2) with prefetching.
 * Applies OPER to elements 0..7, subtracts 8 from i and advances all three
 * pointers by 8.  Prefetches per array, relative to PREF_OFFS(T):
 *   EL_PER_CL(T) <= 1 : offsets +0 .. +7 (one per element)
 *   EL_PER_CL(T) <= 2 : offsets +0, +2, +4, +6
 *   EL_PER_CL(T) <= 4 : offsets +0, +4
 *   otherwise         : offset  +0
 * v1/v2 use PREFETCH_R with CA1/CA2, res uses PREFETCH_W with CA0.
 * Prefetches are interleaved with the OPER calls; keep the statement order.
 * Expands to a complete if/else-if/else statement.
 */
00173 #define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
00174 if (EL_PER_CL(T) <= 1) { \
00175 OPER(res[0], v1[0], v2[0], f1, f2); \
00176 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00177 PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
00178 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00179 OPER(res[1], v1[1], v2[1], f1, f2); \
00180 PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
00181 PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
00182 PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
00183 OPER(res[2], v1[2], v2[2], f1, f2); \
00184 PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
00185 PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
00186 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00187 OPER(res[3], v1[3], v2[3], f1, f2); \
00188 i -= 8; \
00189 PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
00190 PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
00191 PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
00192 OPER(res[4], v1[4], v2[4], f1, f2); \
00193 PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
00194 PREFETCH_R(v2 +PREF_OFFS(T)+5, CA2); \
00195 PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
00196 OPER(res[5], v1[5], v2[5], f1, f2); \
00197 PREFETCH_R(v2 +PREF_OFFS(T)+7, CA2); \
00198 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00199 PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
00200 OPER(res[6], v1[6], v2[6], f1, f2); \
00201 PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
00202 PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
00203 PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
00204 OPER(res[7], v1[7], v2[7], f1, f2); \
00205 v1 += 8; v2 += 8; \
00206 PREFETCH_W(res+PREF_OFFS(T)+5, CA0); \
00207 PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
00208 PREFETCH_W(res+PREF_OFFS(T)+7, CA0); \
00209 res += 8; \
00210 } else if (EL_PER_CL(T) <= 2) { \
00211 OPER(res[0], v1[0], v2[0], f1, f2); \
00212 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00213 OPER(res[1], v1[1], v2[1], f1, f2); \
00214 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00215 PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
00216 OPER(res[2], v1[2], v2[2], f1, f2); \
00217 PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
00218 OPER(res[3], v1[3], v2[3], f1, f2); \
00219 i -= 8; \
00220 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00221 PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
00222 OPER(res[4], v1[4], v2[4], f1, f2); \
00223 PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
00224 PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
00225 OPER(res[5], v1[5], v2[5], f1, f2); \
00226 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00227 PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
00228 OPER(res[6], v1[6], v2[6], f1, f2); \
00229 PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
00230 PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
00231 OPER(res[7], v1[7], v2[7], f1, f2); \
00232 v1 += 8; v2 += 8; \
00233 res += 8; \
00234 } else if (EL_PER_CL(T) <= 4) { \
00235 OPER(res[0], v1[0], v2[0], f1, f2); \
00236 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00237 OPER(res[1], v1[1], v2[1], f1, f2); \
00238 PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
00239 OPER(res[2], v1[2], v2[2], f1, f2); \
00240 i -= 8; \
00241 OPER(res[3], v1[3], v2[3], f1, f2); \
00242 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00243 OPER(res[4], v1[4], v2[4], f1, f2); \
00244 PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
00245 OPER(res[5], v1[5], v2[5], f1, f2); \
00246 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00247 OPER(res[6], v1[6], v2[6], f1, f2); \
00248 PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
00249 OPER(res[7], v1[7], v2[7], f1, f2); \
00250 v1 += 8; v2 += 8; \
00251 res += 8; \
00252 } else { \
00253 OPER(res[0], v1[0], v2[0], f1, f2); \
00254 OPER(res[1], v1[1], v2[1], f1, f2); \
00255 i -= 8; \
00256 OPER(res[2], v1[2], v2[2], f1, f2); \
00257 OPER(res[3], v1[3], v2[3], f1, f2); \
00258 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00259 OPER(res[4], v1[4], v2[4], f1, f2); \
00260 OPER(res[5], v1[5], v2[5], f1, f2); \
00261 PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
00262 OPER(res[6], v1[6], v2[6], f1, f2); \
00263 OPER(res[7], v1[7], v2[7], f1, f2); \
00264 v1 += 8; v2 += 8; \
00265 PREFETCH_W(res+PREF_OFFS(T), CA0); \
00266 res += 8; \
00267 }
00268
00269
/**
 * Eight steps of the three-array kernel without prefetching: applies OPER to
 * elements 0..7, subtracts 8 from i and advances v1, v2 and res by 8.
 * Uses i, v1, v2, res, f1 and f2 from the expansion site.
 */
00271 #define UNROLL8_KERNEL5(OPER) \
00272 OPER(res[0], v1[0], v2[0], f1, f2); \
00273 OPER(res[1], v1[1], v2[1], f1, f2); \
00274 OPER(res[2], v1[2], v2[2], f1, f2); \
00275 OPER(res[3], v1[3], v2[3], f1, f2); \
00276 i -= 8; \
00277 OPER(res[4], v1[4], v2[4], f1, f2); \
00278 OPER(res[5], v1[5], v2[5], f1, f2); \
00279 OPER(res[6], v1[6], v2[6], f1, f2); \
00280 OPER(res[7], v1[7], v2[7], f1, f2); \
00281 v1 += 8; v2 += 8; \
00282 res += 8
00283
/** Hooks placed before and after the depth-8 three-array loop; both expand to an empty statement. */
00284 #define UNROLL8_KERNEL5_PREPARE do {} while(0)
00285 #define UNROLL8_KERNEL5_FIXUP do {} while(0)
00286
/**
 * Up-front prefetch burst for the three-array kernels (v1, v2 read; res write).
 * Issues prefetches at multiples k of EL_PER_CL(T) elements past the current
 * v1, v2 and res, with k chosen by PREFETCH_AHEAD:
 *   >= 16 : k = 1 .. 15
 *   >=  8 : k = 1 .. 7
 *   >=  4 : k = 1 .. 3
 *   >=  2 : k = 1
 *   <   2 : nothing
 * Offset 0 is never prefetched here.  v1/v2 use PREFETCH_R with CA1/CA2 and
 * res uses PREFETCH_W with CA0.  T is only forwarded to EL_PER_CL().
 * Expands to an if/else-if chain with no final else.
 */
00288 #define PREF_AHEAD3(T,CA0,CA1,CA2) \
00289 if (PREFETCH_AHEAD >= 16) { \
00290 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00291 PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
00292 PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
00293 PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
00294 PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
00295 PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
00296 PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
00297 PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
00298 PREFETCH_R(v2 +EL_PER_CL(T)* 2, CA2); \
00299 PREFETCH_R(v2 +EL_PER_CL(T)* 3, CA2); \
00300 PREFETCH_R(v2 +EL_PER_CL(T)* 4, CA2); \
00301 PREFETCH_R(v2 +EL_PER_CL(T)* 5, CA2); \
00302 PREFETCH_R(v2 +EL_PER_CL(T)* 6, CA2); \
00303 PREFETCH_R(v2 +EL_PER_CL(T)* 7, CA2); \
00304 PREFETCH_W(res+EL_PER_CL(T), CA0); \
00305 PREFETCH_W(res+EL_PER_CL(T)* 2, CA0); \
00306 PREFETCH_W(res+EL_PER_CL(T)* 3, CA0); \
00307 PREFETCH_W(res+EL_PER_CL(T)* 4, CA0); \
00308 PREFETCH_W(res+EL_PER_CL(T)* 5, CA0); \
00309 PREFETCH_W(res+EL_PER_CL(T)* 6, CA0); \
00310 PREFETCH_W(res+EL_PER_CL(T)* 7, CA0); \
00311 PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
00312 PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
00313 PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
00314 PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
00315 PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
00316 PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
00317 PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
00318 PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
00319 PREFETCH_R(v2 +EL_PER_CL(T)* 8, CA2); \
00320 PREFETCH_R(v2 +EL_PER_CL(T)* 9, CA2); \
00321 PREFETCH_R(v2 +EL_PER_CL(T)*10, CA2); \
00322 PREFETCH_R(v2 +EL_PER_CL(T)*11, CA2); \
00323 PREFETCH_R(v2 +EL_PER_CL(T)*12, CA2); \
00324 PREFETCH_R(v2 +EL_PER_CL(T)*13, CA2); \
00325 PREFETCH_R(v2 +EL_PER_CL(T)*14, CA2); \
00326 PREFETCH_R(v2 +EL_PER_CL(T)*15, CA2); \
00327 PREFETCH_W(res+EL_PER_CL(T)* 8, CA0); \
00328 PREFETCH_W(res+EL_PER_CL(T)* 9, CA0); \
00329 PREFETCH_W(res+EL_PER_CL(T)*10, CA0); \
00330 PREFETCH_W(res+EL_PER_CL(T)*11, CA0); \
00331 PREFETCH_W(res+EL_PER_CL(T)*12, CA0); \
00332 PREFETCH_W(res+EL_PER_CL(T)*13, CA0); \
00333 PREFETCH_W(res+EL_PER_CL(T)*14, CA0); \
00334 PREFETCH_W(res+EL_PER_CL(T)*15, CA0); \
00335 } else if (PREFETCH_AHEAD >= 8) { \
00336 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00337 PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
00338 PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
00339 PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
00340 PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
00341 PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
00342 PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
00343 PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
00344 PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
00345 PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
00346 PREFETCH_R(v2 +EL_PER_CL(T)*4, CA2); \
00347 PREFETCH_R(v2 +EL_PER_CL(T)*5, CA2); \
00348 PREFETCH_R(v2 +EL_PER_CL(T)*6, CA2); \
00349 PREFETCH_R(v2 +EL_PER_CL(T)*7, CA2); \
00350 PREFETCH_W(res+EL_PER_CL(T), CA0); \
00351 PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
00352 PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
00353 PREFETCH_W(res+EL_PER_CL(T)*4, CA0); \
00354 PREFETCH_W(res+EL_PER_CL(T)*5, CA0); \
00355 PREFETCH_W(res+EL_PER_CL(T)*6, CA0); \
00356 PREFETCH_W(res+EL_PER_CL(T)*7, CA0); \
00357 } else if (PREFETCH_AHEAD >= 4) { \
00358 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00359 PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
00360 PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
00361 PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
00362 PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
00363 PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
00364 PREFETCH_W(res+EL_PER_CL(T), CA0); \
00365 PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
00366 PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
00367 } else if (PREFETCH_AHEAD >= 2) { \
00368 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00369 PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
00370 PREFETCH_W(res+EL_PER_CL(T), CA0); \
00371 }
00372
00373
00374
00375
00376
00377
/**
 * One step of a two-array kernel (res, v1) with prefetching.
 * Applies OPER(res[0], v1[0], f1, f2), decrements i, prefetches v1 with
 * PREFETCH_R and res with the caller-chosen PREFETCH_X at PREF_OFFS(T)
 * elements ahead, and advances both pointers by one.
 *   OPER        element operation macro taking (res, v1, f1, f2)
 *   T           element type; only forwarded to PREF_OFFS()
 *   PREFETCH_X  name of the prefetch macro to use for res
 *   CA0/CA1     second argument passed to the prefetch macro for res/v1
 * Expects i, v1, res, f1 and f2 to be visible where the macro expands.
 */
00379 #define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
00380 OPER(res[0], v1[0], f1, f2); \
00381 --i; \
00382 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00383 ++v1; \
00384 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00385 ++res
00386
/**
 * One step of the two-array kernel without prefetching: decrements i,
 * applies OPER(res[0], v1[0], f1, f2) and advances v1 and res by one.
 * Uses i, v1, res, f1 and f2 from the expansion site.
 */
00388 #define UNROLL1_KERNEL4(OPER) \
00389 --i; \
00390 OPER(res[0], v1[0], f1, f2); \
00391 ++v1; ++res
00392
/** Hooks placed before and after the depth-1 two-array loop; both expand to an empty statement. */
00393 #define UNROLL1_KERNEL4_PREPARE do {} while(0)
00394 #define UNROLL1_KERNEL4_FIXUP do {} while(0)
00395
00396
/**
 * Two steps of the two-array kernel (res, v1) with prefetching.
 * Applies OPER to elements 0 and 1, subtracts 2 from i and advances v1 and
 * res by 2.  With EL_PER_CL(T) <= 1 each array gets prefetches at
 * PREF_OFFS(T) and PREF_OFFS(T)+1; otherwise one prefetch at PREF_OFFS(T).
 * Arguments are as for UNROLL1_PREF_KERNEL4.  Expands to a complete if/else
 * statement; the trailing backslash on the last line continues the
 * definition onto the following line.
 */
00398 #define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
00399 if (EL_PER_CL(T) <= 1) { \
00400 OPER(res[0], v1[0], f1, f2); \
00401 i -= 2; \
00402 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00403 PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
00404 OPER(res[1], v1[1], f1, f2); \
00405 v1 += 2; \
00406 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00407 PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
00408 res += 2; \
00409 } else { \
00410 OPER(res[0], v1[0], f1, f2); \
00411 i -= 2; \
00412 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00413 OPER(res[1], v1[1], f1, f2); \
00414 v1 += 2; \
00415 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00416 res += 2; \
00417 } \
00418
00419
/**
 * Two steps of the two-array kernel without prefetching.  v1 is advanced by 2
 * between the two operations, so the second source element is read as v1[-1]
 * (the element that was v1[1] before the increment).  Also subtracts 2 from i
 * and advances res by 2.
 */
00421 #define UNROLL2_KERNEL4(OPER) \
00422 OPER(res[0], v1[0], f1, f2); \
00423 v1 += 2; i -= 2; \
00424 OPER(res[1], v1[-1],f1, f2); \
00425 res += 2
00426
/** Hooks placed before and after the depth-2 two-array loop; both expand to an empty statement. */
00427 #define UNROLL2_KERNEL4_PREPARE do {} while(0)
00428 #define UNROLL2_KERNEL4_FIXUP do {} while(0)
00429
00430
/**
 * Four steps of the two-array kernel (res, v1) with prefetching.
 * Applies OPER to elements 0..3, subtracts 4 from i and advances v1 and res
 * by 4.  v1 is advanced before the last operation, which therefore reads the
 * fourth source element as v1[-1].  Prefetches per array, relative to
 * PREF_OFFS(T):
 *   EL_PER_CL(T) <= 1 : offsets +0, +1, +2, +3
 *   EL_PER_CL(T) <= 2 : offsets +0, +2
 *   otherwise         : offset  +0
 * All v1 prefetches are issued before v1 is advanced; res prefetches use the
 * not-yet-advanced res.  Arguments are as for UNROLL1_PREF_KERNEL4.
 */
00432 #define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
00433 if (EL_PER_CL(T) <= 1) { \
00434 OPER(res[0], v1[0], f1, f2); \
00435 i -= 4; \
00436 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00437 PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
00438 OPER(res[1], v1[1], f1, f2); \
00439 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00440 PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
00441 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00442 OPER(res[2], v1[2], f1, f2); \
00443 v1 += 4; \
00444 PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
00445 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00446 PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
00447 OPER(res[3], v1[-1], f1, f2); \
00448 res += 4; \
00449 } else if (EL_PER_CL(T) <= 2) { \
00450 OPER(res[0], v1[0], f1, f2); \
00451 i -= 4; \
00452 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00453 OPER(res[1], v1[1], f1, f2); \
00454 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00455 OPER(res[2], v1[2], f1, f2); \
00456 v1 += 4; \
00457 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00458 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00459 OPER(res[3], v1[-1], f1, f2); \
00460 res += 4; \
00461 } else { \
00462 OPER(res[0], v1[0], f1, f2); \
00463 i -= 4; \
00464 OPER(res[1], v1[1], f1, f2); \
00465 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00466 OPER(res[2], v1[2], f1, f2); \
00467 v1 += 4; \
00468 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00469 OPER(res[3], v1[-1], f1, f2); \
00470 res += 4; \
00471 }
00472
/**
 * Four steps of the two-array kernel without prefetching.  v1 is advanced by
 * 4 after the first two operations, so source elements 2 and 3 are read as
 * v1[-2] and v1[-1].  Also subtracts 4 from i and advances res by 4.
 */
00474 #define UNROLL4_KERNEL4(OPER) \
00475 OPER(res[0], v1[0], f1, f2); \
00476 OPER(res[1], v1[1], f1, f2); \
00477 v1 += 4; i -= 4; \
00478 OPER(res[2], v1[-2], f1, f2); \
00479 OPER(res[3], v1[-1], f1, f2); \
00480 res += 4
00481
/** Hooks placed before and after the depth-4 two-array loop; both expand to an empty statement. */
00482 #define UNROLL4_KERNEL4_PREPARE do {} while(0)
00483 #define UNROLL4_KERNEL4_FIXUP do {} while(0)
00484
00485
/**
 * Eight steps of the two-array kernel (res, v1) with prefetching.
 * Applies OPER to elements 0..7, subtracts 8 from i and advances v1 and res
 * by 8.  v1 is advanced part-way through each branch; operations after that
 * point address the remaining source elements with negative indices
 * (v1[-4] .. v1[-1]).  Prefetches per array, relative to PREF_OFFS(T):
 *   EL_PER_CL(T) <= 1 : offsets +0 .. +7
 *   EL_PER_CL(T) <= 2 : offsets +0, +2, +4, +6
 *   EL_PER_CL(T) <= 4 : offsets +0, +4
 *   otherwise         : offset  +0
 * All v1 prefetches are issued before v1 is advanced.  Arguments are as for
 * UNROLL1_PREF_KERNEL4.  Expands to a complete if/else-if/else statement.
 */
00487 #define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
00488 if (EL_PER_CL(T) <= 1) { \
00489 OPER(res[0], v1[0], f1, f2); \
00490 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00491 PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
00492 OPER(res[1], v1[1], f1, f2); \
00493 i -= 8; \
00494 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00495 PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
00496 OPER(res[2], v1[2], f1, f2); \
00497 PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
00498 PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
00499 OPER(res[3], v1[3], f1, f2); \
00500 PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
00501 PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
00502 OPER(res[4], v1[4], f1, f2); \
00503 v1 += 8; \
00504 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00505 PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
00506 OPER(res[5], v1[-3], f1, f2); \
00507 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00508 PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
00509 OPER(res[6], v1[-2], f1, f2); \
00510 PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
00511 PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
00512 OPER(res[7], v1[-1], f1, f2); \
00513 PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
00514 PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
00515 res += 8; \
00516 } else if (EL_PER_CL(T) <= 2) { \
00517 OPER(res[0], v1[0], f1, f2); \
00518 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00519 OPER(res[1], v1[1], f1, f2); \
00520 i -= 8; \
00521 PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
00522 OPER(res[2], v1[2], f1, f2); \
00523 PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
00524 OPER(res[3], v1[3], f1, f2); \
00525 PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
00526 OPER(res[4], v1[4], f1, f2); \
00527 v1 += 8; \
00528 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00529 OPER(res[5], v1[-3], f1, f2); \
00530 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00531 OPER(res[6], v1[-2], f1, f2); \
00532 PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
00533 OPER(res[7], v1[-1], f1, f2); \
00534 PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
00535 res += 8; \
00536 } else if (EL_PER_CL(T) <= 4) { \
00537 OPER(res[0], v1[0], f1, f2); \
00538 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00539 OPER(res[1], v1[1], f1, f2); \
00540 OPER(res[2], v1[2], f1, f2); \
00541 PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
00542 OPER(res[3], v1[3], f1, f2); \
00543 OPER(res[4], v1[4], f1, f2); \
00544 v1 += 8; \
00545 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00546 OPER(res[5], v1[-3], f1, f2); \
00547 i -= 8; \
00548 OPER(res[6], v1[-2], f1, f2); \
00549 PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
00550 OPER(res[7], v1[-1], f1, f2); \
00551 res += 8; \
00552 } else { \
00553 OPER(res[0], v1[0], f1, f2); \
00554 OPER(res[1], v1[1], f1, f2); \
00555 PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
00556 OPER(res[2], v1[2], f1, f2); \
00557 OPER(res[3], v1[3], f1, f2); \
00558 v1 += 8; \
00559 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00560 OPER(res[4], v1[-4], f1, f2); \
00561 OPER(res[5], v1[-3], f1, f2); \
00562 i -= 8; \
00563 OPER(res[6], v1[-2], f1, f2); \
00564 OPER(res[7], v1[-1], f1, f2); \
00565 res += 8; \
00566 }
00567
00568
/**
 * Eight steps of the two-array kernel without prefetching.  v1 is advanced by
 * 8 after the first four operations, so source elements 4..7 are read as
 * v1[-4] .. v1[-1].  Also subtracts 8 from i and advances res by 8.
 */
00570 #define UNROLL8_KERNEL4(OPER) \
00571 OPER(res[0], v1[0], f1, f2); \
00572 OPER(res[1], v1[1], f1, f2); \
00573 OPER(res[2], v1[2], f1, f2); \
00574 OPER(res[3], v1[3], f1, f2); \
00575 v1 += 8; i -= 8; \
00576 OPER(res[4], v1[-4], f1, f2); \
00577 OPER(res[5], v1[-3], f1, f2); \
00578 OPER(res[6], v1[-2], f1, f2); \
00579 OPER(res[7], v1[-1], f1, f2); \
00580 res += 8
00581
/** Hooks placed before and after the depth-8 two-array loop; both expand to an empty statement. */
00582 #define UNROLL8_KERNEL4_PREPARE do {} while(0)
00583 #define UNROLL8_KERNEL4_FIXUP do {} while(0)
00584
00585
/**
 * Up-front prefetch burst for the two-array kernels (v1 read; res via the
 * caller-chosen PREFETCH_X).  Issues prefetches at multiples k of
 * EL_PER_CL(T) elements past the current v1 and res, with k chosen by
 * PREFETCH_AHEAD:
 *   >= 16 : k = 1 .. 15
 *   >=  8 : k = 1 .. 7
 *   >=  4 : k = 1 .. 3
 *   >=  2 : k = 1
 *   <   2 : nothing
 * Offset 0 is never prefetched here.  CA1 goes to the v1 prefetches, CA0 to
 * the res prefetches.  Expands to an if/else-if chain with no final else.
 */
00587 #define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1) \
00588 if (PREFETCH_AHEAD >= 16) { \
00589 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00590 PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
00591 PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
00592 PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
00593 PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
00594 PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
00595 PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
00596 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00597 PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
00598 PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
00599 PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
00600 PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
00601 PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
00602 PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
00603 PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
00604 PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
00605 PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
00606 PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
00607 PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
00608 PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
00609 PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
00610 PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
00611 PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
00612 PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
00613 PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
00614 PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
00615 PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
00616 PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
00617 PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
00618 PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
00619 } else if (PREFETCH_AHEAD >= 8) { \
00620 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00621 PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
00622 PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
00623 PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
00624 PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
00625 PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
00626 PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
00627 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00628 PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
00629 PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
00630 PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
00631 PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
00632 PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
00633 PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
00634 } else if (PREFETCH_AHEAD >= 4) { \
00635 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00636 PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
00637 PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
00638 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00639 PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
00640 PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
00641 } else if (PREFETCH_AHEAD >= 2) { \
00642 PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
00643 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00644 }
00645
00646
00647
00648
00649
00650
/**
 * One step of a single-array kernel (res only) with prefetching.
 * Applies OPER(res[0], f1, f2), decrements i, prefetches res with the
 * caller-chosen PREFETCH_X at PREF_OFFS(T) elements ahead and advances res.
 *   OPER        element operation macro taking (res, f1, f2)
 *   T           element type; only forwarded to PREF_OFFS()
 *   PREFETCH_X  name of the prefetch macro to use for res
 *   CA0         second argument passed to PREFETCH_X
 * Expects i, res, f1 and f2 to be visible where the macro expands.
 */
00652 #define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
00653 OPER(res[0], f1, f2); \
00654 --i; \
00655 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00656 ++res
00657
/**
 * One step of the single-array kernel without prefetching: decrements i,
 * applies OPER(res[0], f1, f2) and advances res by one.
 */
00659 #define UNROLL1_KERNEL3(OPER) \
00660 --i; \
00661 OPER(res[0], f1, f2); \
00662 ++res
00663
/** Hooks placed before and after the depth-1 single-array loop; both expand to an empty statement. */
00664 #define UNROLL1_KERNEL3_PREPARE do {} while(0)
00665 #define UNROLL1_KERNEL3_FIXUP do {} while(0)
00666
00667
/**
 * Two steps of the single-array kernel with prefetching.  Applies OPER to
 * res[0] and res[1], subtracts 2 from i and advances res by 2.  With
 * EL_PER_CL(T) <= 1 it prefetches res at PREF_OFFS(T) and PREF_OFFS(T)+1;
 * otherwise once at PREF_OFFS(T).  Arguments are as for
 * UNROLL1_PREF_KERNEL3.  Expands to a complete if/else statement; the
 * trailing backslash on the last line continues the definition onto the
 * following line.
 */
00669 #define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
00670 if (EL_PER_CL(T) <= 1) { \
00671 OPER(res[0], f1, f2); \
00672 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00673 i -= 2; \
00674 OPER(res[1], f1, f2); \
00675 PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
00676 res += 2; \
00677 } else { \
00678 OPER(res[0], f1, f2); \
00679 i -= 2; \
00680 OPER(res[1], f1, f2); \
00681 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00682 res += 2; \
00683 } \
00684
00685
/**
 * Two steps of the single-array kernel without prefetching: applies OPER to
 * res[0] and res[1], subtracts 2 from i and advances res by 2.
 */
00687 #define UNROLL2_KERNEL3(OPER) \
00688 OPER(res[0], f1, f2); \
00689 i -= 2; \
00690 OPER(res[1], f1, f2); \
00691 res += 2
00692
/** Hooks placed before and after the depth-2 single-array loop; both expand to an empty statement. */
00693 #define UNROLL2_KERNEL3_PREPARE do {} while(0)
00694 #define UNROLL2_KERNEL3_FIXUP do {} while(0)
00695
00696
/**
 * Four steps of the single-array kernel with prefetching.  Applies OPER to
 * res[0..3], subtracts 4 from i and advances res by 4.  Prefetches of res,
 * relative to PREF_OFFS(T):
 *   EL_PER_CL(T) <= 1 : offsets +0, +1, +2, +3
 *   EL_PER_CL(T) <= 2 : offsets +0, +2
 *   otherwise         : offset  +0
 * Arguments are as for UNROLL1_PREF_KERNEL3.  Expands to a complete
 * if/else-if/else statement.
 */
00698 #define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
00699 if (EL_PER_CL(T) <= 1) { \
00700 OPER(res[0], f1, f2); \
00701 i -= 4; \
00702 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00703 OPER(res[1], f1, f2); \
00704 PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
00705 OPER(res[2], f1, f2); \
00706 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00707 OPER(res[3], f1, f2); \
00708 PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
00709 res += 4; \
00710 } else if (EL_PER_CL(T) <= 2) { \
00711 OPER(res[0], f1, f2); \
00712 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00713 OPER(res[1], f1, f2); \
00714 i -= 4; \
00715 OPER(res[2], f1, f2); \
00716 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00717 OPER(res[3], f1, f2); \
00718 res += 4; \
00719 } else { \
00720 OPER(res[0], f1, f2); \
00721 i -= 4; \
00722 OPER(res[1], f1, f2); \
00723 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00724 OPER(res[2], f1, f2); \
00725 OPER(res[3], f1, f2); \
00726 res += 4; \
00727 }
00728
/**
 * Four steps of the single-array kernel without prefetching: applies OPER to
 * res[0..3], subtracts 4 from i and advances res by 4.
 */
00730 #define UNROLL4_KERNEL3(OPER) \
00731 OPER(res[0], f1, f2); \
00732 OPER(res[1], f1, f2); \
00733 i -= 4; \
00734 OPER(res[2], f1, f2); \
00735 OPER(res[3], f1, f2); \
00736 res += 4
00737
/** Hooks placed before and after the depth-4 single-array loop; both expand to an empty statement. */
00738 #define UNROLL4_KERNEL3_PREPARE do {} while(0)
00739 #define UNROLL4_KERNEL3_FIXUP do {} while(0)
00740
00741
/**
 * Eight steps of the single-array kernel with prefetching.  Applies OPER to
 * res[0..7], subtracts 8 from i and advances res by 8.  Prefetches of res,
 * relative to PREF_OFFS(T):
 *   EL_PER_CL(T) <= 1 : offsets +0 .. +7
 *   EL_PER_CL(T) <= 2 : offsets +0, +2, +4, +6
 *   EL_PER_CL(T) <= 4 : offsets +0, +4
 *   otherwise         : offset  +0
 * Arguments are as for UNROLL1_PREF_KERNEL3.  Expands to a complete
 * if/else-if/else statement.
 */
00743 #define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
00744 if (EL_PER_CL(T) <= 1) { \
00745 OPER(res[0], f1, f2); \
00746 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00747 OPER(res[1], f1, f2); \
00748 PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
00749 OPER(res[2], f1, f2); \
00750 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00751 OPER(res[3], f1, f2); \
00752 PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
00753 OPER(res[4], f1, f2); \
00754 i -= 8; \
00755 PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
00756 OPER(res[5], f1, f2); \
00757 PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
00758 OPER(res[6], f1, f2); \
00759 PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
00760 OPER(res[7], f1, f2); \
00761 PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
00762 res += 8; \
00763 } else if (EL_PER_CL(T) <= 2) { \
00764 OPER(res[0], f1, f2); \
00765 OPER(res[1], f1, f2); \
00766 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00767 OPER(res[2], f1, f2); \
00768 OPER(res[3], f1, f2); \
00769 PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
00770 OPER(res[4], f1, f2); \
00771 i -= 8; \
00772 OPER(res[5], f1, f2); \
00773 PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
00774 OPER(res[6], f1, f2); \
00775 PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
00776 OPER(res[7], f1, f2); \
00777 res += 8; \
00778 } else if (EL_PER_CL(T) <= 4) { \
00779 OPER(res[0], f1, f2); \
00780 OPER(res[1], f1, f2); \
00781 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00782 OPER(res[2], f1, f2); \
00783 OPER(res[3], f1, f2); \
00784 i -= 8; \
00785 OPER(res[4], f1, f2); \
00786 OPER(res[5], f1, f2); \
00787 PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
00788 OPER(res[6], f1, f2); \
00789 OPER(res[7], f1, f2); \
00790 res += 8; \
00791 } else { \
00792 OPER(res[0], f1, f2); \
00793 OPER(res[1], f1, f2); \
00794 OPER(res[2], f1, f2); \
00795 PREFETCH_X(res+PREF_OFFS(T), CA0); \
00796 OPER(res[3], f1, f2); \
00797 OPER(res[4], f1, f2); \
00798 OPER(res[5], f1, f2); \
00799 i -= 8; \
00800 OPER(res[6], f1, f2); \
00801 OPER(res[7], f1, f2); \
00802 res += 8; \
00803 }
00804
00805
/**
 * Eight steps of the single-array kernel without prefetching: applies OPER to
 * res[0..7], subtracts 8 from i and advances res by 8.
 */
00807 #define UNROLL8_KERNEL3(OPER) \
00808 OPER(res[0], f1, f2); \
00809 OPER(res[1], f1, f2); \
00810 OPER(res[2], f1, f2); \
00811 OPER(res[3], f1, f2); \
00812 i -= 8; \
00813 OPER(res[4], f1, f2); \
00814 OPER(res[5], f1, f2); \
00815 OPER(res[6], f1, f2); \
00816 OPER(res[7], f1, f2); \
00817 res += 8
00818
/** Hooks placed before and after the depth-8 single-array loop; both expand to an empty statement. */
00819 #define UNROLL8_KERNEL3_PREPARE do {} while(0)
00820 #define UNROLL8_KERNEL3_FIXUP do {} while(0)
00821
00822
/**
 * Up-front prefetch burst for the single-array kernels.  Issues PREFETCH_X
 * on res at multiples k of EL_PER_CL(T) elements past the current res, with
 * k chosen by PREFETCH_AHEAD:
 *   >= 16 : k = 1 .. 15
 *   >=  8 : k = 1 .. 7
 *   >=  4 : k = 1 .. 3
 *   >=  2 : k = 1
 *   <   2 : nothing
 * Offset 0 is never prefetched here.  CA0 is passed as the second argument
 * of every prefetch.  Expands to an if/else-if chain with no final else.
 */
00824 #define PREF_AHEAD1(T,PREFETCH_X,CA0) \
00825 if (PREFETCH_AHEAD >= 16) { \
00826 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00827 PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
00828 PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
00829 PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
00830 PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
00831 PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
00832 PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
00833 PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
00834 PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
00835 PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
00836 PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
00837 PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
00838 PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
00839 PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
00840 PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
00841 } else if (PREFETCH_AHEAD >= 8) { \
00842 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00843 PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
00844 PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
00845 PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
00846 PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
00847 PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
00848 PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
00849 } else if (PREFETCH_AHEAD >= 4) { \
00850 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00851 PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
00852 PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
00853 } else if (PREFETCH_AHEAD >= 2) { \
00854 PREFETCH_X(res+EL_PER_CL(T), CA0); \
00855 }
00856
00857
00858
00859
/**
 * Binds the depth-independent names used by the kernel templates to the
 * UNROLL<n>_* macros matching UNROLL_DEPTH.  For each family X in {5, 4, 3}:
 *   UNR_PREF_KERNELX  -> UNROLL<n>_PREF_KERNELX
 *   UNR_KERNELX       -> UNROLL<n>_KERNELX
 *   UNR_KERNELX_PREP  -> UNROLL<n>_KERNELX_PREPARE
 *   UNR_KERNELX_FIX   -> UNROLL<n>_KERNELX_FIXUP
 * Any UNROLL_DEPTH other than 1, 2, 4 or 8 stops compilation with #error.
 */
00860 #if UNROLL_DEPTH == 1
00861
00862 # define UNR_PREF_KERNEL5 UNROLL1_PREF_KERNEL5
00863 # define UNR_KERNEL5 UNROLL1_KERNEL5
00864 # define UNR_KERNEL5_PREP UNROLL1_KERNEL5_PREPARE
00865 # define UNR_KERNEL5_FIX UNROLL1_KERNEL5_FIXUP
00866
00867 # define UNR_PREF_KERNEL4 UNROLL1_PREF_KERNEL4
00868 # define UNR_KERNEL4 UNROLL1_KERNEL4
00869 # define UNR_KERNEL4_PREP UNROLL1_KERNEL4_PREPARE
00870 # define UNR_KERNEL4_FIX UNROLL1_KERNEL4_FIXUP
00871
00872 # define UNR_PREF_KERNEL3 UNROLL1_PREF_KERNEL3
00873 # define UNR_KERNEL3 UNROLL1_KERNEL3
00874 # define UNR_KERNEL3_PREP UNROLL1_KERNEL3_PREPARE
00875 # define UNR_KERNEL3_FIX UNROLL1_KERNEL3_FIXUP
00876
00877 #elif UNROLL_DEPTH == 2
00878
00879 # define UNR_PREF_KERNEL5 UNROLL2_PREF_KERNEL5
00880 # define UNR_KERNEL5 UNROLL2_KERNEL5
00881 # define UNR_KERNEL5_PREP UNROLL2_KERNEL5_PREPARE
00882 # define UNR_KERNEL5_FIX UNROLL2_KERNEL5_FIXUP
00883
00884 # define UNR_PREF_KERNEL4 UNROLL2_PREF_KERNEL4
00885 # define UNR_KERNEL4 UNROLL2_KERNEL4
00886 # define UNR_KERNEL4_PREP UNROLL2_KERNEL4_PREPARE
00887 # define UNR_KERNEL4_FIX UNROLL2_KERNEL4_FIXUP
00888
00889 # define UNR_PREF_KERNEL3 UNROLL2_PREF_KERNEL3
00890 # define UNR_KERNEL3 UNROLL2_KERNEL3
00891 # define UNR_KERNEL3_PREP UNROLL2_KERNEL3_PREPARE
00892 # define UNR_KERNEL3_FIX UNROLL2_KERNEL3_FIXUP
00893
00894 #elif UNROLL_DEPTH == 4
00895
00896 # define UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5
00897 # define UNR_KERNEL5 UNROLL4_KERNEL5
00898 # define UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE
00899 # define UNR_KERNEL5_FIX UNROLL4_KERNEL5_FIXUP
00900
00901 # define UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4
00902 # define UNR_KERNEL4 UNROLL4_KERNEL4
00903 # define UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE
00904 # define UNR_KERNEL4_FIX UNROLL4_KERNEL4_FIXUP
00905
00906 # define UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3
00907 # define UNR_KERNEL3 UNROLL4_KERNEL3
00908 # define UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE
00909 # define UNR_KERNEL3_FIX UNROLL4_KERNEL3_FIXUP
00910
00911 #elif UNROLL_DEPTH == 8
00912
00913 # define UNR_PREF_KERNEL5 UNROLL8_PREF_KERNEL5
00914 # define UNR_KERNEL5 UNROLL8_KERNEL5
00915 # define UNR_KERNEL5_PREP UNROLL8_KERNEL5_PREPARE
00916 # define UNR_KERNEL5_FIX UNROLL8_KERNEL5_FIXUP
00917
00918 # define UNR_PREF_KERNEL4 UNROLL8_PREF_KERNEL4
00919 # define UNR_KERNEL4 UNROLL8_KERNEL4
00920 # define UNR_KERNEL4_PREP UNROLL8_KERNEL4_PREPARE
00921 # define UNR_KERNEL4_FIX UNROLL8_KERNEL4_FIXUP
00922
00923 # define UNR_PREF_KERNEL3 UNROLL8_PREF_KERNEL3
00924 # define UNR_KERNEL3 UNROLL8_KERNEL3
00925 # define UNR_KERNEL3_PREP UNROLL8_KERNEL3_PREPARE
00926 # define UNR_KERNEL3_FIX UNROLL8_KERNEL3_FIXUP
00927
00928 #else
00929
00930 # error "UNROLL_DEPTH may only be 1, 2, 4, 8"
00931
00932 #endif
00933
00952
00953
00954
00955
/**
 * Prefetching main loops for the three kernel families.
 *
 * With USE_PREFETCH defined, each macro expands to an `if` that runs only
 * when i >= UNROLL_DEPTH + PREF_OFFS(T).  Inside it:
 *   1. res itself is prefetched (second argument 3),
 *   2. PREF_AHEAD<n> issues the up-front burst,
 *   3. the PREP hook runs,
 *   4. a do/while repeats the unrolled prefetching kernel while
 *      i >= UNROLL_DEPTH + PREF_OFFS(T),
 *   5. the FIX hook runs.
 * Elements still left when the loop exits are not handled here; the
 * VKERN_TEMPL_3V* bodies in this file follow the macro with a non-prefetching
 * loop and a remainder loop.
 *
 *   VKERN_TEMPL_3V_PREF(OP3,T)               res, v1, v2; writes use PREFETCH_W
 *   VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) res, v1; PREFETCH_X/CW chosen by caller
 *   VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) res only
 *
 * In the up-front burst the read hint is MAX(1,CACHE_LOC_READ); inside the
 * loop it is CACHE_LOC_READ unmodified.
 *
 * Without USE_PREFETCH all three expand to an empty statement.
 */
00956 #ifdef USE_PREFETCH
00957
00958 # define VKERN_TEMPL_3V_PREF(OP3,T) \
00959 if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
00960 PREFETCH_W (res, 3); \
00961 PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ)); \
00962 UNR_KERNEL5_PREP; \
00963 do { \
00964 UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
00965 } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
00966 UNR_KERNEL5_FIX; \
00967 }
00968
00969 # define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) \
00970 if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
00971 PREFETCH_X (res, 3); \
00972 PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ)); \
00973 UNR_KERNEL4_PREP; \
00974 do { \
00975 UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ); \
00976 } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
00977 UNR_KERNEL4_FIX; \
00978 }
00979
00980 # define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) \
00981 if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
00982 PREFETCH_X (res, 3); \
00983 PREF_AHEAD1(T,PREFETCH_X,CW); \
00984 UNR_KERNEL3_PREP; \
00985 do { \
00986 UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW); \
00987 } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
00988 UNR_KERNEL3_FIX; \
00989 }
00990 #else
00991 # define VKERN_TEMPL_3V_PREF(OP,T) do {} while (0)
00992 # define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
00993 # define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
00994 #endif
00995
00996
00997
00998
00999
01000
01012
/**
 * Generates a function template
 *   void FNAME(unsigned long sz, T* res, const T* v1, const T* v2)
 * that applies OP3 element-wise over sz elements.
 *
 * The body runs three phases on local copies of the pointers and a signed
 * counter i initialised from sz:
 *   1. VKERN_TEMPL_3V_PREF: unrolled loop with prefetching,
 *   2. unrolled loop without prefetching while i >= UNROLL_DEPTH,
 *   3. element-by-element loop for whatever is left.
 * The first elements of _v1 and _v2 are prefetched on entry.
 *
 * The INST(...) part forwards a matching friend declaration; what INST does
 * with it is defined elsewhere.
 *
 * NOTE(review): OP3 is invoked with f1 and f2, but the generated function
 * declares neither.  Presumably the OP3 macros used with this template drop
 * those arguments -- confirm at the instantiation sites.
 */
01013 #define VKERN_TEMPL_3V(FNAME,OP3) \
01014 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01015 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
01016 template <typename T> \
01017 VEC_INLINE void FNAME (const unsigned long sz, \
01018 T* RESTRICT const _res, \
01019 const T* RESTRICT const _v1, \
01020 const T* RESTRICT const _v2) \
01021 { \
01022 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
01023 register const T *v1 = _v1, *v2 = _v2; \
01024 register T *res = _res; \
01025 register long i = sz; \
01026 VKERN_TEMPL_3V_PREF(OP3,T); \
01027 \
01028 if (LIKELY(i >= UNROLL_DEPTH)) { \
01029 UNR_KERNEL5_PREP; \
01030 do { \
01031 UNR_KERNEL5(OP3); \
01032 } while (i >= UNROLL_DEPTH); \
01033 UNR_KERNEL5_FIX; \
01034 } \
01035 \
01036 for (; i; --i) { \
01037 OP3(*res, *v1, *v2, f1, f2); \
01038 ++v1; ++v2; ++res; \
01039 } \
01040 }
01041
/**
 * VKERN_TEMPL_3V_C(FNAME,OP3): generate a function template
 *   void FNAME(sz, _res, _v1, _v2, f2)
 * that walks three arrays of sz elements and invokes OP3 on each position,
 * with one loop-constant operand f2 (type LCTYPE(T)).
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 *   1. prefetch the start of both inputs,
 *   2. run the prefetching unrolled loop (VKERN_TEMPL_3V_PREF),
 *   3. run the plain unrolled loop while at least UNROLL_DEPTH elements remain,
 *   4. finish the leftover elements one at a time.
 *
 * Fix: the declaration of the v1/v2 cursors used '.' instead of ',' between
 * the two declarators, which made every instantiation ill-formed. It now
 * matches VKERN_TEMPL_3V and VKERN_TEMPL_3V_CC.
 *
 * NOTE(review): OP3 is handed the name f1, which this function does not
 * declare; presumably the OP3 macros used with this generator never expand
 * that argument - confirm at the instantiation sites.
 */
01043 #define VKERN_TEMPL_3V_C(FNAME,OP3) \
01044 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01045 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01046 const T* RESTRICT const, LCTYPED(T));) \
01047 template <typename T> \
01048 VEC_INLINE void FNAME (const unsigned long sz, \
01049 T* RESTRICT const _res, \
01050 const T* RESTRICT const _v1, \
01051 const T* RESTRICT const _v2, \
01052 LCTYPE(T) f2) \
01053 { \
01054 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
01055 register const T *v1 = _v1, *v2 = _v2; /* mutable cursors; the kernel macros refer to v1/v2/res/i by name */ \
01056 register T *res = _res; \
01057 register long i = sz; /* remaining element count */ \
01058 VKERN_TEMPL_3V_PREF(OP3,T); /* empty statement unless USE_PREFETCH is defined */ \
01059 \
01060 if (LIKELY(i >= UNROLL_DEPTH)) { \
01061 UNR_KERNEL5_PREP; \
01062 do { \
01063 UNR_KERNEL5(OP3); \
01064 } while (i >= UNROLL_DEPTH); \
01065 UNR_KERNEL5_FIX; \
01066 } \
01067 \
01068 for (; i; --i) { /* leftover elements, one per iteration */ \
01069 OP3(*res, *v1, *v2, f1, f2); \
01070 ++v1; ++v2; ++res; \
01071 } \
01072 }
01073
/**
 * VKERN_TEMPL_3V_CC(FNAME,OP3): generate a function template
 *   void FNAME(sz, _res, _v1, _v2, f1, f2)
 * that walks three arrays of sz elements and invokes OP3 on each position,
 * with two loop-constant operands f1 and f2 (type LCTYPE(T)).
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * prefetch the inputs, run the prefetching unrolled loop, run the plain
 * unrolled loop while at least UNROLL_DEPTH elements remain, and finish the
 * leftover elements one at a time.
 */
01075 #define VKERN_TEMPL_3V_CC(FNAME,OP3) \
01076 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01077 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01078 const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
01079 template <typename T> \
01080 VEC_INLINE void FNAME (const unsigned long sz, \
01081 T* RESTRICT const _res, \
01082 const T* RESTRICT const _v1, \
01083 const T* RESTRICT const _v2, \
01084 LCTYPE(T) f1, \
01085 LCTYPE(T) f2) \
01086 { \
01087 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
01088 register long i = sz; /* remaining element count */ \
01089 register const T *v1 = _v1, *v2 = _v2; /* mutable cursors; the kernel macros refer to v1/v2/res/i by name */ \
01090 register T *res = _res; \
01091 VKERN_TEMPL_3V_PREF(OP3,T); /* empty statement unless USE_PREFETCH is defined */ \
01092 \
01093 if (LIKELY(i >= UNROLL_DEPTH)) { \
01094 UNR_KERNEL5_PREP; \
01095 do { \
01096 UNR_KERNEL5(OP3); \
01097 } while (i >= UNROLL_DEPTH); \
01098 UNR_KERNEL5_FIX; \
01099 } \
01100 \
01101 for (; i; --i) { /* leftover elements, one per iteration */ \
01102 OP3(*res, *v1, *v2, f1, f2); \
01103 ++v1; ++v2; ++res; \
01104 } \
01105 }
01106
/**
 * VKERN_TEMPL_2V(FNAME,OP2): generate a function template
 *   void FNAME(sz, _res, _v1)
 * that walks two arrays of sz elements and invokes OP2 on each position.
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * prefetch the input, run the prefetching unrolled loop, run the plain
 * unrolled loop (UNR_KERNEL4) while at least UNROLL_DEPTH elements remain, and
 * finish the leftover elements one at a time.
 *
 * NOTE(review): OP2 is handed the names f1 and f2, which this function does
 * not declare; presumably the OP2 macros used with this generator never expand
 * those arguments - confirm at the instantiation sites.
 * NOTE(review): `res` is prefetched with PREFETCH_R here, while the _C and _CC
 * variants pass PREFETCH_W - confirm this difference is intended.
 */
01108 #define VKERN_TEMPL_2V(FNAME,OP2) \
01109 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01110 (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
01111 template <typename T> \
01112 VEC_INLINE void FNAME (const unsigned long sz, \
01113 T* RESTRICT const _res, \
01114 const T* RESTRICT const _v1) \
01115 { \
01116 PREFETCH_R(_v1, 3); \
01117 register const T *v1 = _v1; /* mutable cursors; the kernel macros refer to v1/res/i by name */ \
01118 register T* res = _res; \
01119 register long i = sz; /* remaining element count */ \
01120 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE); /* empty statement unless USE_PREFETCH is defined */ \
01121 \
01122 if (LIKELY(i >= UNROLL_DEPTH)) { \
01123 UNR_KERNEL4_PREP; \
01124 do { \
01125 UNR_KERNEL4(OP2); \
01126 } while (i >= UNROLL_DEPTH); \
01127 UNR_KERNEL4_FIX; \
01128 } \
01129 \
01130 for (; i; --i) { /* leftover elements, one per iteration */ \
01131 OP2(*res, *v1, f1, f2); \
01132 ++v1; ++res; \
01133 } \
01134 }
01135
/**
 * VKERN_TEMPL_2V_C(FNAME,OP2): generate a function template
 *   void FNAME(sz, _res, _v1, f2)
 * that walks two arrays of sz elements and invokes OP2 on each position,
 * with one loop-constant operand f2 (type LCTYPE(T)).
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * prefetch the input, run the prefetching unrolled loop, run the plain
 * unrolled loop (UNR_KERNEL4) while at least UNROLL_DEPTH elements remain, and
 * finish the leftover elements one at a time.
 *
 * NOTE(review): OP2 is handed the name f1, which this function does not
 * declare; presumably the OP2 macros used with this generator never expand
 * that argument - confirm at the instantiation sites.
 */
01137 #define VKERN_TEMPL_2V_C(FNAME,OP2) \
01138 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01139 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01140 LCTYPED(T));) \
01141 template <typename T> \
01142 VEC_INLINE void FNAME (const unsigned long sz, \
01143 T* RESTRICT const _res, \
01144 const T* RESTRICT const _v1, \
01145 LCTYPE(T) f2) \
01146 { \
01147 PREFETCH_R(_v1, 3); \
01148 register const T *v1 = _v1; /* mutable cursors; the kernel macros refer to v1/res/i by name */ \
01149 register T* res = _res; \
01150 register long i = sz; /* remaining element count */ \
01151 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); /* empty statement unless USE_PREFETCH is defined */ \
01152 \
01153 if (LIKELY(i >= UNROLL_DEPTH)) { \
01154 UNR_KERNEL4_PREP; \
01155 do { \
01156 UNR_KERNEL4(OP2); \
01157 } while (i >= UNROLL_DEPTH); \
01158 UNR_KERNEL4_FIX; \
01159 } \
01160 \
01161 for (; i; --i) { /* leftover elements, one per iteration */ \
01162 OP2(*res, *v1, f1, f2); \
01163 ++v1; ++res; \
01164 } \
01165 }
01166
/**
 * VKERN_TEMPL_2V_CC(FNAME,OP2): generate a function template
 *   void FNAME(sz, _res, _v1, f1, f2)
 * that walks two arrays of sz elements and invokes OP2 on each position,
 * with two loop-constant operands f1 and f2 (type LCTYPE(T)).
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * prefetch the input, run the prefetching unrolled loop, run the plain
 * unrolled loop (UNR_KERNEL4) while at least UNROLL_DEPTH elements remain, and
 * finish the leftover elements one at a time.
 */
01168 #define VKERN_TEMPL_2V_CC(FNAME,OP2) \
01169 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01170 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01171 LCTYPED(T), LCTYPED(T));) \
01172 template <typename T> \
01173 VEC_INLINE void FNAME (const unsigned long sz, \
01174 T* RESTRICT const _res, \
01175 const T* RESTRICT const _v1, \
01176 LCTYPE(T) f1, \
01177 LCTYPE(T) f2) \
01178 { \
01179 PREFETCH_R(_v1, 3); \
01180 register const T *v1 = _v1; /* mutable cursors; the kernel macros refer to v1/res/i by name */ \
01181 register T* res = _res; \
01182 register long i = sz; /* remaining element count */ \
01183 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); /* empty statement unless USE_PREFETCH is defined */ \
01184 \
01185 if (LIKELY(i >= UNROLL_DEPTH)) { \
01186 UNR_KERNEL4_PREP; \
01187 do { \
01188 UNR_KERNEL4(OP2); \
01189 } while (i >= UNROLL_DEPTH); \
01190 UNR_KERNEL4_FIX; \
01191 } \
01192 \
01193 for (; i; --i) { /* leftover elements, one per iteration */ \
01194 OP2(*res, *v1, f1, f2); \
01195 ++v1; ++res; \
01196 } \
01197 }
01198
/**
 * VKERN_TEMPL_2V_T(FNAME,OP2,TYPE): generate a function template
 *   void FNAME(sz, _res, _v1, TYPE &_f2)
 * that walks two read-only arrays of sz elements and invokes OP2 on each
 * position together with a working value f2.
 *
 * f2 is a local of type tbci_traits<TYPE>::loop_refval_type constructed from
 * _f2; after all loops have run it is assigned back to _f2, so _f2 serves as
 * both input and output of the call.
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * prefetch the input, run the prefetching unrolled loop, run the plain
 * unrolled loop (UNR_KERNEL4) while at least UNROLL_DEPTH elements remain, and
 * finish the leftover elements one at a time.
 *
 * NOTE(review): OP2 is handed the name f1, which this function does not
 * declare; presumably the OP2 macros used with this generator never expand
 * that argument - confirm at the instantiation sites.
 */
01200 #define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
01201 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01202 (const unsigned long, const T* RESTRICT const, \
01203 const T* RESTRICT const, TYPE&);) \
01204 template <typename T> \
01205 VEC_INLINE void FNAME (const unsigned long sz, \
01206 const T* RESTRICT const _res, \
01207 const T* RESTRICT const _v1, \
01208 TYPE &_f2) \
01209 { \
01210 PREFETCH_R(_v1, 3); \
01211 register tbci_traits<TYPE>::loop_refval_type f2(_f2); /* working value, written back to _f2 at the end */ \
01212 register const T *v1 = _v1; /* cursors; the kernel macros refer to v1/res/i by name */ \
01213 register const T* res = _res; \
01214 register long i = sz; /* remaining element count */ \
01215 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); /* empty statement unless USE_PREFETCH is defined */ \
01216 \
01217 if (LIKELY(i >= UNROLL_DEPTH)) { \
01218 UNR_KERNEL4_PREP; \
01219 do { \
01220 UNR_KERNEL4(OP2); \
01221 } while (i >= UNROLL_DEPTH); \
01222 UNR_KERNEL4_FIX; \
01223 } \
01224 \
01225 for (; i; --i) { /* leftover elements, one per iteration */ \
01226 OP2(*res, *v1, f1, f2); \
01227 ++v1; ++res; \
01228 } \
01229 _f2 = f2; /* hand the working value back to the caller */ \
01230 }
01231
/**
 * VKERN_TEMPL_1V(FNAME,OP1): generate a function template
 *   void FNAME(sz, _res)
 * that walks one array of sz elements and invokes OP1 on each position.
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * run the prefetching unrolled loop, run the plain unrolled loop
 * (UNR_KERNEL3) while at least UNROLL_DEPTH elements remain, and finish the
 * leftover elements one at a time.
 *
 * NOTE(review): OP1 is handed the names f1 and f2, which this function does
 * not declare; presumably the OP1 macros used with this generator never expand
 * those arguments - confirm at the instantiation sites.
 */
01233 #define VKERN_TEMPL_1V(FNAME,OP1) \
01234 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01235 (const unsigned long, T* RESTRICT const);) \
01236 template <typename T> \
01237 VEC_INLINE void FNAME (const unsigned long sz, \
01238 T* RESTRICT const _res) \
01239 { \
01240 register long i = sz; /* remaining element count */ \
01241 register T* res = _res; /* mutable cursor; the kernel macros refer to res/i by name */ \
01242 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); /* empty statement unless USE_PREFETCH is defined */ \
01243 \
01244 if (LIKELY(i >= UNROLL_DEPTH)) { \
01245 UNR_KERNEL3_PREP; \
01246 do { \
01247 UNR_KERNEL3(OP1); \
01248 } while (i >= UNROLL_DEPTH); \
01249 UNR_KERNEL3_FIX; \
01250 } \
01251 \
01252 for (; i; --i) { /* leftover elements, one per iteration */ \
01253 OP1(*res, f1, f2); \
01254 ++res; \
01255 } \
01256 }
01257
/**
 * VKERN_TEMPL_1V_C(FNAME,OP1): generate a function template
 *   void FNAME(sz, _res, f2)
 * that walks one array of sz elements and invokes OP1 on each position,
 * with one loop-constant operand f2 (type LCTYPE(T)).
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * run the prefetching unrolled loop, run the plain unrolled loop
 * (UNR_KERNEL3) while at least UNROLL_DEPTH elements remain, and finish the
 * leftover elements one at a time.
 *
 * NOTE(review): OP1 is handed the name f1, which this function does not
 * declare; presumably the OP1 macros used with this generator never expand
 * that argument - confirm at the instantiation sites.
 */
01259 #define VKERN_TEMPL_1V_C(FNAME,OP1) \
01260 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01261 (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
01262 template <typename T> \
01263 VEC_INLINE void FNAME (const unsigned long sz, \
01264 T* RESTRICT const _res, \
01265 LCTYPE(T) f2) \
01266 { \
01267 register long i = sz; /* remaining element count */ \
01268 register T* res = _res; /* mutable cursor; the kernel macros refer to res/i by name */ \
01269 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); /* empty statement unless USE_PREFETCH is defined */ \
01270 \
01271 if (LIKELY(i >= UNROLL_DEPTH)) { \
01272 UNR_KERNEL3_PREP; \
01273 do { \
01274 UNR_KERNEL3(OP1); \
01275 } while (i >= UNROLL_DEPTH); \
01276 UNR_KERNEL3_FIX; \
01277 } \
01278 \
01279 for (; i; --i) { /* leftover elements, one per iteration */ \
01280 OP1(*res, f1, f2); \
01281 ++res; \
01282 } \
01283 }
01284
/**
 * VKERN_TEMPL_1V_CC(FNAME,OP1): generate a function template
 *   void FNAME(sz, _res, f1, f2)
 * that walks one array of sz elements and invokes OP1 on each position,
 * with two loop-constant operands f1 and f2 (type LCTYPE(T)).
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * run the prefetching unrolled loop, run the plain unrolled loop
 * (UNR_KERNEL3) while at least UNROLL_DEPTH elements remain, and finish the
 * leftover elements one at a time.
 */
01286 #define VKERN_TEMPL_1V_CC(FNAME,OP1) \
01287 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01288 (const unsigned long, T* RESTRICT const, LCTYPED(T), \
01289 LCTYPED(T));) \
01290 template <typename T> \
01291 VEC_INLINE void FNAME (const unsigned long sz, \
01292 T* RESTRICT const _res, \
01293 LCTYPE(T) f1, \
01294 LCTYPE(T) f2) \
01295 { \
01296 register long i = sz; /* remaining element count */ \
01297 register T* res = _res; /* mutable cursor; the kernel macros refer to res/i by name */ \
01298 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); /* empty statement unless USE_PREFETCH is defined */ \
01299 \
01300 if (LIKELY(i >= UNROLL_DEPTH)) { \
01301 UNR_KERNEL3_PREP; \
01302 do { \
01303 UNR_KERNEL3(OP1); \
01304 } while (i >= UNROLL_DEPTH); \
01305 UNR_KERNEL3_FIX; \
01306 } \
01307 \
01308 for (; i; --i) { /* leftover elements, one per iteration */ \
01309 OP1(*res, f1, f2); \
01310 ++res; \
01311 } \
01312 }
01313
/**
 * VKERN_TEMPL_1V_T(FNAME,OP1,TYPE): generate a function template
 *   void FNAME(sz, _res, TYPE &_f2)
 * that walks one read-only array of sz elements and invokes OP1 on each
 * position together with a working value f2.
 *
 * f2 is a local of type tbci_traits<TYPE>::loop_refval_type constructed from
 * _f2; after all loops have run it is assigned back to _f2, so _f2 serves as
 * both input and output of the call.
 *
 * The expansion first passes a matching friend declaration for Vector<T> to
 * INST(...) (INST is defined elsewhere), then defines the function:
 * run the prefetching unrolled loop, run the plain unrolled loop
 * (UNR_KERNEL3) while at least UNROLL_DEPTH elements remain, and finish the
 * leftover elements one at a time.
 *
 * NOTE(review): OP1 is handed the name f1, which this function does not
 * declare; presumably the OP1 macros used with this generator never expand
 * that argument - confirm at the instantiation sites.
 */
01315 #define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
01316 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01317 (const unsigned long, const T* const, TYPE&);) \
01318 template <typename T> \
01319 VEC_INLINE void FNAME (const unsigned long sz, \
01320 const T* const _res, \
01321 TYPE &_f2) \
01322 { \
01323 register tbci_traits<TYPE>::loop_refval_type f2(_f2); /* working value, written back to _f2 at the end */ \
01324 register const T* res = _res; /* cursor; the kernel macros refer to res/i by name */ \
01325 register long i = sz; /* remaining element count */ \
01326 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); /* empty statement unless USE_PREFETCH is defined */ \
01327 \
01328 if (LIKELY(i >= UNROLL_DEPTH)) { \
01329 UNR_KERNEL3_PREP; \
01330 do { \
01331 UNR_KERNEL3(OP1); \
01332 } while (i >= UNROLL_DEPTH); \
01333 UNR_KERNEL3_FIX; \
01334 } \
01335 \
01336 for (; i; --i) { /* leftover elements, one per iteration */ \
01337 OP1(*res, f1, f2); \
01338 ++res; \
01339 } \
01340 _f2 = f2; /* hand the working value back to the caller */ \
01341 }
01342
01343 #endif