00001
00008 #ifndef TBCI_UNROLL_PREFETCH_DEF_H
00009 #define TBCI_UNROLL_PREFETCH_DEF_H
00010
00011
00012
/**
 * Type of a loop-invariant operand of element type T as it is handed to a
 * generated kernel: tbci_traits<T>::loop_const_refval_type.
 *
 * LCTYPE spells the dependent name with `typename`; LCTYPED is the same type
 * without it.  The former `register` storage-class specifier was dropped: it
 * never affected the type and is ill-formed from C++17 on.
 */
#define LCTYPE(T)  typename tbci_traits<T>::loop_const_refval_type
#define LCTYPED(T) tbci_traits<T>::loop_const_refval_type
00016
/**
 * Unroll factor used by the UNR_* kernel aliases.  A value supplied before
 * this header is included (e.g. on the compiler command line) wins; otherwise
 * it defaults to 4.  The selection block further down accepts 1, 2, 4 and 8.
 */
#ifndef UNROLL_DEPTH
# define UNROLL_DEPTH 4
#endif
00042
00043
00044
00045
00046
/**
 * Depth-1 prefetching kernel for the five-argument operation
 * OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Works on names taken from the expansion site: the countdown `i`, the
 * cursors `v1`, `v2`, `res`, and `f1`/`f2`, which are only forwarded to OPER.
 * One element is processed; each cursor gets one prefetch PREF_OFFS(T)
 * elements ahead (PREFETCH_R with CA1/CA2 for v1/v2, PREFETCH_W with CA0 for
 * res) and is then advanced by one.
 *
 * The statements are wrapped in do { } while (0) so the expansion is a single
 * statement (safe under an unbraced `if`); as before, the call site supplies
 * the terminating semicolon.
 */
#define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
  do { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    ++v2; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    ++res; \
  } while (0)
00057
/**
 * Depth-1 kernel without prefetching for OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `v2`, `res` (and forwards `f1`/`f2`):
 * decrements `i`, applies OPER to the current element and advances the three
 * cursors by one.  Wrapped in do { } while (0) so it behaves as one statement;
 * the call site still writes the trailing semicolon.
 */
#define UNROLL1_KERNEL5(OPER) \
  do { \
    --i; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    ++v1; \
    ++v2; \
    ++res; \
  } while (0)
00063
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-1 five-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL1_KERNEL5_PREPARE do {} while (0)
#define UNROLL1_KERNEL5_FIXUP   do {} while (0)
00066
00067
/**
 * Depth-2 prefetching kernel for OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `v2`, `res` and forwards `f1`/`f2`.
 * Two elements are processed and `i` and all cursors move on by two.  When
 * EL_PER_CL(T) <= 1 every cursor gets two prefetches (offsets PREF_OFFS(T)
 * and PREF_OFFS(T)+1), otherwise one.  EL_PER_CL is presumably "elements of T
 * per cache line" -- confirm at its definition.
 *
 * The cursors are advanced in the middle of the sequence, so the second OPER
 * addresses v1 with a negative index while v2 and res are still unadvanced.
 *
 * Expands to a bare if/else statement, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.  The stray
 * line continuation after the closing brace has been removed.
 */
#define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
  if (EL_PER_CL(T) <= 1) { \
    i -= 2; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 1, CA1); \
    v1 += 2; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 1, CA2); \
    OPER(res[1], v1[-1], v2[1], f1, f2); /* v1 already advanced */ \
    v2 += 2; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 1, CA0); \
    res += 2; \
  } else { \
    i -= 2; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    v1 += 2; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    OPER(res[1], v1[-1], v2[1], f1, f2); /* v1 already advanced */ \
    v2 += 2; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    res += 2; \
  }
00095
/**
 * Depth-2 kernel without prefetching for OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `v2`, `res` and forwards `f1`/`f2`.
 * v1 is advanced between the two OPER applications, hence the v1[-1] in the
 * second one; v2 and res are advanced at the end.  Wrapped in
 * do { } while (0) so it is a single statement; the call site adds the `;`.
 */
#define UNROLL2_KERNEL5(OPER) \
  do { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    v1 += 2; \
    i -= 2; \
    OPER(res[1], v1[-1], v2[1], f1, f2); \
    v2 += 2; \
    res += 2; \
  } while (0)
00102
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-2 five-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL2_KERNEL5_PREPARE do {} while (0)
#define UNROLL2_KERNEL5_FIXUP   do {} while (0)
00105
00106
/**
 * Depth-4 prefetching kernel for OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `v2`, `res` and forwards `f1`/`f2`.
 * Four elements are processed and `i` and all cursors move on by four.  The
 * number of prefetches per cursor depends on EL_PER_CL(T): four (stride 1)
 * when it is <= 1, two (stride 2) when it is <= 2, one otherwise; the first
 * offset is always PREF_OFFS(T).  EL_PER_CL is presumably "elements of T per
 * cache line" -- confirm at its definition.
 *
 * Cursor updates are interleaved with the operations, so later OPERs index a
 * cursor negatively once it has been advanced.
 *
 * Expands to a bare if/else chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
  if (EL_PER_CL(T) <= 1) { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 4; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 1, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 3, CA1); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 4; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 1, CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 2, CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 3, CA2); \
    OPER(res[2], v1[-2], v2[2], f1, f2); /* v1 already advanced */ \
    v2 += 4; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 1, CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 2, CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 3, CA0); \
    OPER(res[3], v1[-1], v2[-1], f1, f2); /* v1 and v2 advanced */ \
    res += 4; \
  } else if (EL_PER_CL(T) <= 2) { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 4; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 4; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 2, CA2); \
    OPER(res[2], v1[-2], v2[2], f1, f2); \
    v2 += 4; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[3], v1[-1], v2[-1], f1, f2); \
    res += 4; \
  } else { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 4; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 4; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    OPER(res[2], v1[-2], v2[2], f1, f2); \
    v2 += 4; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    OPER(res[3], v1[-1], v2[-1], f1, f2); \
    res += 4; \
  }
00158
/**
 * Depth-4 kernel without prefetching for OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `v2`, `res` and forwards `f1`/`f2`.
 * v1 is advanced after the second OPER and v2 after the third, so the later
 * OPERs use negative indices on the already advanced cursors.  Wrapped in
 * do { } while (0) so it is a single statement; the call site adds the `;`.
 */
#define UNROLL4_KERNEL5(OPER) \
  do { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 4; \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 4; \
    OPER(res[2], v1[-2], v2[2], f1, f2); \
    v2 += 4; \
    OPER(res[3], v1[-1], v2[-1], f1, f2); \
    res += 4; \
  } while (0)
00169
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-4 five-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL4_KERNEL5_PREPARE do {} while (0)
#define UNROLL4_KERNEL5_FIXUP   do {} while (0)
00172
00173
/**
 * Depth-8 prefetching kernel for OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `v2`, `res` and forwards `f1`/`f2`.
 * Eight elements are processed and `i` and all cursors move on by eight.
 * Prefetches per cursor, selected by EL_PER_CL(T): eight (stride 1) when it
 * is <= 1, four (stride 2) when <= 2, two (stride 4) when <= 4, one
 * otherwise; the first offset is always PREF_OFFS(T).  EL_PER_CL is
 * presumably "elements of T per cache line" -- confirm at its definition.
 *
 * Prefetches and cursor updates are interleaved with the operations; every
 * prefetch of a cursor is issued before that cursor is advanced, and OPERs
 * after an advance index the cursor negatively.
 *
 * Expands to a bare if/else chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
  if (EL_PER_CL(T) <= 1) { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 1, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 8; \
    PREFETCH_R(v1 + PREF_OFFS(T) + 3, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 4, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 5, CA1); \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 6, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 7, CA1); \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 8; \
    PREFETCH_R(v2 + PREF_OFFS(T) + 1, CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 2, CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 3, CA2); \
    OPER(res[4], v1[-4], v2[4], f1, f2); /* v1 already advanced */ \
    PREFETCH_R(v2 + PREF_OFFS(T) + 4, CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 5, CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 6, CA2); \
    OPER(res[5], v1[-3], v2[5], f1, f2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 7, CA2); \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 1, CA0); \
    OPER(res[6], v1[-2], v2[6], f1, f2); \
    v2 += 8; \
    PREFETCH_W(res + PREF_OFFS(T) + 2, CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 3, CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 4, CA0); \
    OPER(res[7], v1[-1], v2[-1], f1, f2); /* v1 and v2 advanced */ \
    PREFETCH_W(res + PREF_OFFS(T) + 5, CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 6, CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 7, CA0); \
    res += 8; \
  } else if (EL_PER_CL(T) <= 2) { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    i -= 8; \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 4, CA1); \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 6, CA1); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 8; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 2, CA2); \
    OPER(res[4], v1[-4], v2[4], f1, f2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 4, CA2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 6, CA2); \
    OPER(res[5], v1[-3], v2[5], f1, f2); \
    v2 += 8; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[6], v1[-2], v2[-2], f1, f2); \
    PREFETCH_W(res + PREF_OFFS(T) + 4, CA0); \
    PREFETCH_W(res + PREF_OFFS(T) + 6, CA0); \
    OPER(res[7], v1[-1], v2[-1], f1, f2); \
    res += 8; \
  } else if (EL_PER_CL(T) <= 4) { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 8; \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 4, CA1); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 8; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    OPER(res[4], v1[-4], v2[4], f1, f2); \
    PREFETCH_R(v2 + PREF_OFFS(T) + 4, CA2); \
    OPER(res[5], v1[-3], v2[5], f1, f2); \
    v2 += 8; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    OPER(res[6], v1[-2], v2[-2], f1, f2); \
    PREFETCH_W(res + PREF_OFFS(T) + 4, CA0); \
    OPER(res[7], v1[-1], v2[-1], f1, f2); \
    res += 8; \
  } else { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 8; \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 8; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    OPER(res[4], v1[-4], v2[4], f1, f2); \
    OPER(res[5], v1[-3], v2[5], f1, f2); \
    v2 += 8; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    OPER(res[6], v1[-2], v2[-2], f1, f2); \
    OPER(res[7], v1[-1], v2[-1], f1, f2); \
    res += 8; \
  }
00274
00275
/**
 * Depth-8 kernel without prefetching for OPER(res[k], v1[k], v2[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `v2`, `res` and forwards `f1`/`f2`.
 * v1 is advanced after the fourth OPER and v2 after the sixth, so the later
 * OPERs use negative indices on the already advanced cursors.  Wrapped in
 * do { } while (0) so it is a single statement; the call site adds the `;`.
 */
#define UNROLL8_KERNEL5(OPER) \
  do { \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 8; \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 8; \
    OPER(res[4], v1[-4], v2[4], f1, f2); \
    OPER(res[5], v1[-3], v2[5], f1, f2); \
    v2 += 8; \
    OPER(res[6], v1[-2], v2[-2], f1, f2); \
    OPER(res[7], v1[-1], v2[-1], f1, f2); \
    res += 8; \
  } while (0)
00290
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-8 five-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL8_KERNEL5_PREPARE do {} while (0)
#define UNROLL8_KERNEL5_FIXUP   do {} while (0)
00293
/**
 * Initial prefetch burst for kernels with two read cursors (`v1`, `v2`) and
 * one write cursor (`res`), all taken from the expansion site.
 *
 * For k = 1 .. N it issues PREFETCH_R(v1 + EL_PER_CL(T)*k, CA1),
 * PREFETCH_R(v2 + EL_PER_CL(T)*k, CA2) and PREFETCH_W(res + EL_PER_CL(T)*k,
 * CA0), where N is chosen from PREFETCH_AHEAD:
 *   >= 16 -> 15,   >= 8 -> 7,   >= 4 -> 3,   >= 2 -> 1,   otherwise nothing.
 * Offset 0 is not touched here.  In the >= 16 case the offsets 1..7 of all
 * three cursors are issued before the offsets 8..15.
 *
 * Expands to a bare if/else-if chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define PREF_AHEAD3(T,CA0,CA1,CA2) \
  if (PREFETCH_AHEAD >= 16) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 2, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 3, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 4, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 5, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 6, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 7, CA1); \
    PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 2, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 3, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 4, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 5, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 6, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 7, CA2); \
    PREFETCH_W(res + EL_PER_CL(T), CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 3, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 4, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 5, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 6, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 7, CA0); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 8, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 9, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 10, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 11, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 12, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 13, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 14, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 15, CA1); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 8, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 9, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 10, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 11, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 12, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 13, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 14, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 15, CA2); \
    PREFETCH_W(res + EL_PER_CL(T) * 8, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 9, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 10, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 11, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 12, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 13, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 14, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 15, CA0); \
  } else if (PREFETCH_AHEAD >= 8) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 2, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 3, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 4, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 5, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 6, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 7, CA1); \
    PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 2, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 3, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 4, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 5, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 6, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 7, CA2); \
    PREFETCH_W(res + EL_PER_CL(T), CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 3, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 4, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 5, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 6, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 7, CA0); \
  } else if (PREFETCH_AHEAD >= 4) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 2, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 3, CA1); \
    PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 2, CA2); \
    PREFETCH_R(v2 + EL_PER_CL(T) * 3, CA2); \
    PREFETCH_W(res + EL_PER_CL(T), CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_W(res + EL_PER_CL(T) * 3, CA0); \
  } else if (PREFETCH_AHEAD >= 2) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
    PREFETCH_W(res + EL_PER_CL(T), CA0); \
  }
00379
00380
00381
00382
00383
00384
/**
 * Depth-1 prefetching kernel for the four-argument operation
 * OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`.  One
 * element is processed; v1 is prefetched with PREFETCH_R (hint CA1) and res
 * with the caller-chosen PREFETCH_X macro (hint CA0), each PREF_OFFS(T)
 * elements ahead, before the cursor is advanced by one.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement; as
 * before, the call site supplies the terminating semicolon.
 */
#define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
  do { \
    OPER(res[0], v1[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    ++res; \
  } while (0)
00393
/**
 * Depth-1 kernel without prefetching for OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`:
 * decrements `i`, applies OPER once and advances both cursors by one.
 * Wrapped in do { } while (0) so it is a single statement; the call site
 * adds the `;`.
 */
#define UNROLL1_KERNEL4(OPER) \
  do { \
    --i; \
    OPER(res[0], v1[0], f1, f2); \
    ++v1; \
    ++res; \
  } while (0)
00399
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-1 four-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL1_KERNEL4_PREPARE do {} while (0)
#define UNROLL1_KERNEL4_FIXUP   do {} while (0)
00402
00403
/**
 * Depth-2 prefetching kernel for OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`.  Two
 * elements are processed and `i` and both cursors move on by two.  v1 is
 * prefetched with PREFETCH_R (hint CA1), res with the caller-chosen
 * PREFETCH_X (hint CA0): two prefetches each (offsets PREF_OFFS(T) and
 * PREF_OFFS(T)+1) when EL_PER_CL(T) <= 1, one each otherwise.  EL_PER_CL is
 * presumably "elements of T per cache line" -- confirm at its definition.
 *
 * Expands to a bare if/else statement, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.  The stray
 * line continuation after the closing brace has been removed.
 */
#define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
  if (EL_PER_CL(T) <= 1) { \
    i -= 2; \
    OPER(res[0], v1[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 1, CA1); \
    OPER(res[1], v1[1], f1, f2); \
    v1 += 2; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 1, CA0); \
    res += 2; \
  } else { \
    i -= 2; \
    OPER(res[0], v1[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[1], v1[1], f1, f2); \
    v1 += 2; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    res += 2; \
  }
00426
/**
 * Depth-2 kernel without prefetching for OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`.  v1 is
 * advanced between the two OPER applications, hence v1[-1] in the second.
 * Wrapped in do { } while (0) so it is a single statement; the call site
 * adds the `;`.
 */
#define UNROLL2_KERNEL4(OPER) \
  do { \
    OPER(res[0], v1[0], f1, f2); \
    v1 += 2; \
    i -= 2; \
    OPER(res[1], v1[-1], f1, f2); \
    res += 2; \
  } while (0)
00433
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-2 four-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL2_KERNEL4_PREPARE do {} while (0)
#define UNROLL2_KERNEL4_FIXUP   do {} while (0)
00436
00437
/**
 * Depth-4 prefetching kernel for OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`.  Four
 * elements are processed and `i` and both cursors move on by four.  v1 is
 * prefetched with PREFETCH_R (hint CA1), res with the caller-chosen
 * PREFETCH_X (hint CA0).  Prefetches per cursor, selected by EL_PER_CL(T):
 * four (stride 1) when it is <= 1, two (stride 2) when <= 2, one otherwise;
 * the first offset is always PREF_OFFS(T).  EL_PER_CL is presumably
 * "elements of T per cache line" -- confirm at its definition.
 *
 * v1 is advanced before the last OPER, which therefore reads v1[-1].
 *
 * Expands to a bare if/else chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
  if (EL_PER_CL(T) <= 1) { \
    OPER(res[0], v1[0], f1, f2); \
    i -= 4; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 1, CA1); \
    OPER(res[1], v1[1], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 3, CA1); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[2], v1[2], f1, f2); \
    v1 += 4; \
    PREFETCH_X(res + PREF_OFFS(T) + 1, CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 3, CA0); \
    OPER(res[3], v1[-1], f1, f2); \
    res += 4; \
  } else if (EL_PER_CL(T) <= 2) { \
    OPER(res[0], v1[0], f1, f2); \
    i -= 4; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[1], v1[1], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    OPER(res[2], v1[2], f1, f2); \
    v1 += 4; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[3], v1[-1], f1, f2); \
    res += 4; \
  } else { \
    OPER(res[0], v1[0], f1, f2); \
    i -= 4; \
    OPER(res[1], v1[1], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[2], v1[2], f1, f2); \
    v1 += 4; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[3], v1[-1], f1, f2); \
    res += 4; \
  }
00479
/**
 * Depth-4 kernel without prefetching for OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`.  v1 is
 * advanced after the second OPER, so the last two read v1[-2] and v1[-1].
 * Wrapped in do { } while (0) so it is a single statement; the call site
 * adds the `;`.
 */
#define UNROLL4_KERNEL4(OPER) \
  do { \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[1], v1[1], f1, f2); \
    v1 += 4; \
    i -= 4; \
    OPER(res[2], v1[-2], f1, f2); \
    OPER(res[3], v1[-1], f1, f2); \
    res += 4; \
  } while (0)
00488
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-4 four-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL4_KERNEL4_PREPARE do {} while (0)
#define UNROLL4_KERNEL4_FIXUP   do {} while (0)
00491
00492
/**
 * Depth-8 prefetching kernel for OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`.  Eight
 * elements are processed and `i` and both cursors move on by eight.  v1 is
 * prefetched with PREFETCH_R (hint CA1), res with the caller-chosen
 * PREFETCH_X (hint CA0).  Prefetches per cursor, selected by EL_PER_CL(T):
 * eight (stride 1) when it is <= 1, four (stride 2) when <= 2, two
 * (stride 4) when <= 4, one otherwise; the first offset is always
 * PREF_OFFS(T).  EL_PER_CL is presumably "elements of T per cache line" --
 * confirm at its definition.
 *
 * Every v1 prefetch is issued before v1 is advanced; OPERs after the advance
 * index v1 negatively.
 *
 * Expands to a bare if/else chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
  if (EL_PER_CL(T) <= 1) { \
    OPER(res[0], v1[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 1, CA1); \
    OPER(res[1], v1[1], f1, f2); \
    i -= 8; \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 3, CA1); \
    OPER(res[2], v1[2], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 4, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 5, CA1); \
    OPER(res[3], v1[3], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 6, CA1); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 7, CA1); \
    OPER(res[4], v1[4], f1, f2); \
    v1 += 8; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 1, CA0); \
    OPER(res[5], v1[-3], f1, f2); /* v1 already advanced */ \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 3, CA0); \
    OPER(res[6], v1[-2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 4, CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 5, CA0); \
    OPER(res[7], v1[-1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 6, CA0); \
    PREFETCH_X(res + PREF_OFFS(T) + 7, CA0); \
    res += 8; \
  } else if (EL_PER_CL(T) <= 2) { \
    OPER(res[0], v1[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[1], v1[1], f1, f2); \
    i -= 8; \
    PREFETCH_R(v1 + PREF_OFFS(T) + 2, CA1); \
    OPER(res[2], v1[2], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 4, CA1); \
    OPER(res[3], v1[3], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 6, CA1); \
    OPER(res[4], v1[4], f1, f2); \
    v1 += 8; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[5], v1[-3], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[6], v1[-2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 4, CA0); \
    OPER(res[7], v1[-1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 6, CA0); \
    res += 8; \
  } else if (EL_PER_CL(T) <= 4) { \
    OPER(res[0], v1[0], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[1], v1[1], f1, f2); \
    OPER(res[2], v1[2], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T) + 4, CA1); \
    OPER(res[3], v1[3], f1, f2); \
    OPER(res[4], v1[4], f1, f2); \
    v1 += 8; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[5], v1[-3], f1, f2); \
    i -= 8; \
    OPER(res[6], v1[-2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 4, CA0); \
    OPER(res[7], v1[-1], f1, f2); \
    res += 8; \
  } else { \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[1], v1[1], f1, f2); \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    OPER(res[2], v1[2], f1, f2); \
    OPER(res[3], v1[3], f1, f2); \
    v1 += 8; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[4], v1[-4], f1, f2); \
    OPER(res[5], v1[-3], f1, f2); \
    i -= 8; \
    OPER(res[6], v1[-2], f1, f2); \
    OPER(res[7], v1[-1], f1, f2); \
    res += 8; \
  }
00574
00575
/**
 * Depth-8 kernel without prefetching for OPER(res[k], v1[k], f1, f2).
 *
 * Uses the expansion site's `i`, `v1`, `res` and forwards `f1`/`f2`.  v1 is
 * advanced after the fourth OPER, so the remaining four read v1[-4]..v1[-1].
 * Wrapped in do { } while (0) so it is a single statement; the call site
 * adds the `;`.
 */
#define UNROLL8_KERNEL4(OPER) \
  do { \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[1], v1[1], f1, f2); \
    OPER(res[2], v1[2], f1, f2); \
    OPER(res[3], v1[3], f1, f2); \
    v1 += 8; \
    i -= 8; \
    OPER(res[4], v1[-4], f1, f2); \
    OPER(res[5], v1[-3], f1, f2); \
    OPER(res[6], v1[-2], f1, f2); \
    OPER(res[7], v1[-1], f1, f2); \
    res += 8; \
  } while (0)
00588
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-8 four-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL8_KERNEL4_PREPARE do {} while (0)
#define UNROLL8_KERNEL4_FIXUP   do {} while (0)
00591
00592
/**
 * Initial prefetch burst for kernels with one read cursor (`v1`) and one
 * result cursor (`res`), both taken from the expansion site.
 *
 * For k = 1 .. N it issues PREFETCH_R(v1 + EL_PER_CL(T)*k, CA1) and
 * PREFETCH_X(res + EL_PER_CL(T)*k, CA0), where PREFETCH_X is the prefetch
 * macro named by the caller and N is chosen from PREFETCH_AHEAD:
 *   >= 16 -> 15,   >= 8 -> 7,   >= 4 -> 3,   >= 2 -> 1,   otherwise nothing.
 * Offset 0 is not touched here.  In the >= 16 case the offsets 1..7 of both
 * cursors are issued before the offsets 8..15.
 *
 * Expands to a bare if/else-if chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1) \
  if (PREFETCH_AHEAD >= 16) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 2, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 3, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 4, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 5, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 6, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 7, CA1); \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 3, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 4, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 5, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 6, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 7, CA0); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 8, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 9, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 10, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 11, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 12, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 13, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 14, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 15, CA1); \
    PREFETCH_X(res + EL_PER_CL(T) * 8, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 9, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 10, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 11, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 12, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 13, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 14, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 15, CA0); \
  } else if (PREFETCH_AHEAD >= 8) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 2, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 3, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 4, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 5, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 6, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 7, CA1); \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 3, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 4, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 5, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 6, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 7, CA0); \
  } else if (PREFETCH_AHEAD >= 4) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 2, CA1); \
    PREFETCH_R(v1 + EL_PER_CL(T) * 3, CA1); \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 3, CA0); \
  } else if (PREFETCH_AHEAD >= 2) { \
    PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
  }
00652
00653
00654
00655
00656
00657
/**
 * Depth-1 prefetching kernel for the three-argument operation
 * OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`.  One
 * element is processed; res is prefetched PREF_OFFS(T) elements ahead with
 * the caller-chosen PREFETCH_X macro (hint CA0) and then advanced by one.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement; as
 * before, the call site supplies the terminating semicolon.
 */
#define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
  do { \
    OPER(res[0], f1, f2); \
    --i; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    ++res; \
  } while (0)
00664
/**
 * Depth-1 kernel without prefetching for OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`: decrements
 * `i`, applies OPER once and advances res by one.  Wrapped in
 * do { } while (0) so it is a single statement; the call site adds the `;`.
 */
#define UNROLL1_KERNEL3(OPER) \
  do { \
    --i; \
    OPER(res[0], f1, f2); \
    ++res; \
  } while (0)
00670
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-1 three-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL1_KERNEL3_PREPARE do {} while (0)
#define UNROLL1_KERNEL3_FIXUP   do {} while (0)
00673
00674
/**
 * Depth-2 prefetching kernel for OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`.  Two
 * elements are processed and `i` and res move on by two.  res is prefetched
 * with the caller-chosen PREFETCH_X (hint CA0): twice (offsets PREF_OFFS(T)
 * and PREF_OFFS(T)+1) when EL_PER_CL(T) <= 1, once otherwise.  EL_PER_CL is
 * presumably "elements of T per cache line" -- confirm at its definition.
 *
 * Expands to a bare if/else statement, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.  The stray
 * line continuation after the closing brace has been removed.
 */
#define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
  if (EL_PER_CL(T) <= 1) { \
    OPER(res[0], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    i -= 2; \
    OPER(res[1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 1, CA0); \
    res += 2; \
  } else { \
    OPER(res[0], f1, f2); \
    i -= 2; \
    OPER(res[1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    res += 2; \
  }
00692
/**
 * Depth-2 kernel without prefetching for OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`: applies
 * OPER to res[0] and res[1], lowers `i` by two and advances res by two.
 * Wrapped in do { } while (0) so it is a single statement; the call site
 * adds the `;`.
 */
#define UNROLL2_KERNEL3(OPER) \
  do { \
    OPER(res[0], f1, f2); \
    i -= 2; \
    OPER(res[1], f1, f2); \
    res += 2; \
  } while (0)
00699
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-2 three-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL2_KERNEL3_PREPARE do {} while (0)
#define UNROLL2_KERNEL3_FIXUP   do {} while (0)
00702
00703
/**
 * Depth-4 prefetching kernel for OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`.  Four
 * elements are processed and `i` and res move on by four.  res is prefetched
 * with the caller-chosen PREFETCH_X (hint CA0); the number of prefetches is
 * selected by EL_PER_CL(T): four (stride 1) when it is <= 1, two (stride 2)
 * when <= 2, one otherwise, the first offset always being PREF_OFFS(T).
 * EL_PER_CL is presumably "elements of T per cache line" -- confirm at its
 * definition.
 *
 * Expands to a bare if/else chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
  if (EL_PER_CL(T) <= 1) { \
    OPER(res[0], f1, f2); \
    i -= 4; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 1, CA0); \
    OPER(res[2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[3], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 3, CA0); \
    res += 4; \
  } else if (EL_PER_CL(T) <= 2) { \
    OPER(res[0], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[1], f1, f2); \
    i -= 4; \
    OPER(res[2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[3], f1, f2); \
    res += 4; \
  } else { \
    OPER(res[0], f1, f2); \
    i -= 4; \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[3], f1, f2); \
    res += 4; \
  }
00735
/**
 * Depth-4 kernel without prefetching for OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`: applies
 * OPER to res[0..3], lowers `i` by four and advances res by four.  Wrapped
 * in do { } while (0) so it is a single statement; the call site adds the
 * `;`.
 */
#define UNROLL4_KERNEL3(OPER) \
  do { \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    i -= 4; \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    res += 4; \
  } while (0)
00744
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-4 three-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL4_KERNEL3_PREPARE do {} while (0)
#define UNROLL4_KERNEL3_FIXUP   do {} while (0)
00747
00748
/**
 * Depth-8 prefetching kernel for OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`.  Eight
 * elements are processed and `i` and res move on by eight.  res is
 * prefetched with the caller-chosen PREFETCH_X (hint CA0); the number of
 * prefetches is selected by EL_PER_CL(T): eight (stride 1) when it is <= 1,
 * four (stride 2) when <= 2, two (stride 4) when <= 4, one otherwise, the
 * first offset always being PREF_OFFS(T).  EL_PER_CL is presumably "elements
 * of T per cache line" -- confirm at its definition.
 *
 * Expands to a bare if/else chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
  if (EL_PER_CL(T) <= 1) { \
    OPER(res[0], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 1, CA0); \
    OPER(res[2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[3], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 3, CA0); \
    OPER(res[4], f1, f2); \
    i -= 8; \
    PREFETCH_X(res + PREF_OFFS(T) + 4, CA0); \
    OPER(res[5], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 5, CA0); \
    OPER(res[6], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 6, CA0); \
    OPER(res[7], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 7, CA0); \
    res += 8; \
  } else if (EL_PER_CL(T) <= 2) { \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 2, CA0); \
    OPER(res[4], f1, f2); \
    i -= 8; \
    OPER(res[5], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 4, CA0); \
    OPER(res[6], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 6, CA0); \
    OPER(res[7], f1, f2); \
    res += 8; \
  } else if (EL_PER_CL(T) <= 4) { \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    i -= 8; \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T) + 4, CA0); \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8; \
  } else { \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    OPER(res[3], f1, f2); \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    i -= 8; \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8; \
  }
00811
00812
/**
 * Depth-8 kernel without prefetching for OPER(res[k], f1, f2).
 *
 * Uses the expansion site's `i` and `res` and forwards `f1`/`f2`: applies
 * OPER to res[0..7], lowers `i` by eight and advances res by eight.  Wrapped
 * in do { } while (0) so it is a single statement; the call site adds the
 * `;`.
 */
#define UNROLL8_KERNEL3(OPER) \
  do { \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    i -= 8; \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8; \
  } while (0)
00825
/**
 * Hooks placed before (PREPARE) and after (FIXUP) the depth-8 three-argument
 * kernel loop.  Both expand to an empty statement; the call site adds the `;`.
 */
#define UNROLL8_KERNEL3_PREPARE do {} while (0)
#define UNROLL8_KERNEL3_FIXUP   do {} while (0)
00828
00829
/**
 * Initial prefetch burst for kernels that touch only the result cursor
 * `res` (taken from the expansion site).
 *
 * For k = 1 .. N it issues PREFETCH_X(res + EL_PER_CL(T)*k, CA0), where
 * PREFETCH_X is the prefetch macro named by the caller and N is chosen from
 * PREFETCH_AHEAD:
 *   >= 16 -> 15,   >= 8 -> 7,   >= 4 -> 3,   >= 2 -> 1,   otherwise nothing.
 * Offset 0 is not touched here.
 *
 * Expands to a bare if/else-if chain, as before: follow it with `;` and put
 * braces around it when it forms the body of an outer if/else.
 */
#define PREF_AHEAD1(T,PREFETCH_X,CA0) \
  if (PREFETCH_AHEAD >= 16) { \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 3, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 4, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 5, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 6, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 7, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 8, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 9, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 10, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 11, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 12, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 13, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 14, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 15, CA0); \
  } else if (PREFETCH_AHEAD >= 8) { \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 3, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 4, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 5, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 6, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 7, CA0); \
  } else if (PREFETCH_AHEAD >= 4) { \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 2, CA0); \
    PREFETCH_X(res + EL_PER_CL(T) * 3, CA0); \
  } else if (PREFETCH_AHEAD >= 2) { \
    PREFETCH_X(res + EL_PER_CL(T), CA0); \
  }
00863
00864
00865
00866
/*
 * Bind the depth-independent UNR_* names to the UNROLL<n>_* macros that match
 * UNROLL_DEPTH.  For each arity (5, 4, 3 OPER arguments) four aliases exist:
 *   UNR_PREF_KERNELk  - kernel body with prefetching
 *   UNR_KERNELk       - kernel body without prefetching
 *   UNR_KERNELk_PREP  - hook in front of the kernel loop
 *   UNR_KERNELk_FIX   - hook behind the kernel loop
 * Any depth other than 1, 2, 4 or 8 stops the build.
 */

/* Defensive fallback, same value as the default at the top of this header:
 * keeps this selection well-defined if it is reached with UNROLL_DEPTH unset. */
#ifndef UNROLL_DEPTH
# define UNROLL_DEPTH 4
#endif

#if UNROLL_DEPTH == 1

# define UNR_PREF_KERNEL5 UNROLL1_PREF_KERNEL5
# define UNR_KERNEL5      UNROLL1_KERNEL5
# define UNR_KERNEL5_PREP UNROLL1_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX  UNROLL1_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL1_PREF_KERNEL4
# define UNR_KERNEL4      UNROLL1_KERNEL4
# define UNR_KERNEL4_PREP UNROLL1_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX  UNROLL1_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL1_PREF_KERNEL3
# define UNR_KERNEL3      UNROLL1_KERNEL3
# define UNR_KERNEL3_PREP UNROLL1_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX  UNROLL1_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 2

# define UNR_PREF_KERNEL5 UNROLL2_PREF_KERNEL5
# define UNR_KERNEL5      UNROLL2_KERNEL5
# define UNR_KERNEL5_PREP UNROLL2_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX  UNROLL2_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL2_PREF_KERNEL4
# define UNR_KERNEL4      UNROLL2_KERNEL4
# define UNR_KERNEL4_PREP UNROLL2_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX  UNROLL2_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL2_PREF_KERNEL3
# define UNR_KERNEL3      UNROLL2_KERNEL3
# define UNR_KERNEL3_PREP UNROLL2_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX  UNROLL2_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 4

# define UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5
# define UNR_KERNEL5      UNROLL4_KERNEL5
# define UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX  UNROLL4_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4
# define UNR_KERNEL4      UNROLL4_KERNEL4
# define UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX  UNROLL4_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3
# define UNR_KERNEL3      UNROLL4_KERNEL3
# define UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX  UNROLL4_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 8

# define UNR_PREF_KERNEL5 UNROLL8_PREF_KERNEL5
# define UNR_KERNEL5      UNROLL8_KERNEL5
# define UNR_KERNEL5_PREP UNROLL8_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX  UNROLL8_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL8_PREF_KERNEL4
# define UNR_KERNEL4      UNROLL8_KERNEL4
# define UNR_KERNEL4_PREP UNROLL8_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX  UNROLL8_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL8_PREF_KERNEL3
# define UNR_KERNEL3      UNROLL8_KERNEL3
# define UNR_KERNEL3_PREP UNROLL8_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX  UNROLL8_KERNEL3_FIXUP

#else

# error "UNROLL_DEPTH may only be 1, 2, 4, 8"

#endif
00940
00959
00960
00961
00962
/*
 * Prefetching front loops used at the start of the generated vector kernels.
 *
 * With USE_PREFETCH defined, each macro runs the prefetching kernel
 * (UNR_PREF_KERNELk) for as long as at least UNROLL_DEPTH + PREF_OFFS(T)
 * elements remain in `i`; whatever is left over is handled by the code that
 * follows the macro at the expansion site.  Before the loop it prefetches
 * `res` itself and issues the matching PREF_AHEADn burst.  The read-side
 * hint handed to PREF_AHEADn is MAX(1, CACHE_LOC_READ), whereas the loop
 * body passes CACHE_LOC_READ unchanged.
 *
 *   VKERN_TEMPL_3V_PREF(OP3,T)               res, v1, v2; write hint CACHE_LOC_WRITE
 *   VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) res, v1; res prefetched via PREFETCH_X, hint CW
 *   VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) res only
 *
 * Without USE_PREFETCH all three are empty statements.  Both variants now
 * have the do { } while (0) shape, so an invocation is a single statement in
 * either configuration; the call site supplies the `;` (which the empty
 * variant has always required).
 */
#ifdef USE_PREFETCH

# define VKERN_TEMPL_3V_PREF(OP3,T) \
  do { \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
      PREFETCH_W (res, 3); \
      PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ)); \
      UNR_KERNEL5_PREP; \
      do { \
        UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
      } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
      UNR_KERNEL5_FIX; \
    } \
  } while (0)

# define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) \
  do { \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
      PREFETCH_X (res, 3); \
      PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ)); \
      UNR_KERNEL4_PREP; \
      do { \
        UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ); \
      } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
      UNR_KERNEL4_FIX; \
    } \
  } while (0)

# define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) \
  do { \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
      PREFETCH_X (res, 3); \
      PREF_AHEAD1(T,PREFETCH_X,CW); \
      UNR_KERNEL3_PREP; \
      do { \
        UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW); \
      } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
      UNR_KERNEL3_FIX; \
    } \
  } while (0)

#else

# define VKERN_TEMPL_3V_PREF(OP,T)               do {} while (0)
# define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
# define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)

#endif
01002
01003
01004
01005
01006
01007
01019
/**
 * Generates the function template
 *   FNAME(sz, _res, _v1, _v2)
 * which applies OP3 element-wise to a result array and two source arrays.
 *
 * The generated body first prefetches the start of both sources, then works
 * through the element counter `i` (initialised from sz) in three stages:
 *   1. VKERN_TEMPL_3V_PREF - prefetching unrolled loop (an empty statement
 *      when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL5 between UNR_KERNEL5_PREP / UNR_KERNEL5_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP3   macro invoked as OP3(res_elem, v1_elem, v2_elem, f1, f2).
 *              This variant declares neither `f1` nor `f2`, so OP3 is
 *              presumably expected to ignore both arguments.
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01020 #define VKERN_TEMPL_3V(FNAME,OP3) \
01021 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01022 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
01023 template <typename T> \
01024 VEC_INLINE void FNAME (const unsigned long sz, \
01025 T* RESTRICT const _res, \
01026 const T* RESTRICT const _v1, \
01027 const T* RESTRICT const _v2) \
01028 { \
01029 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
01030 register const T *v1 = _v1, *v2 = _v2; \
01031 register T *res = _res; \
01032 register long i = sz; \
01033 VKERN_TEMPL_3V_PREF(OP3,T); \
01034 \
01035 if (LIKELY(i >= UNROLL_DEPTH)) { \
01036 UNR_KERNEL5_PREP; \
01037 do { \
01038 UNR_KERNEL5(OP3); \
01039 } while (i >= UNROLL_DEPTH); \
01040 UNR_KERNEL5_FIX; \
01041 } \
01042 \
01043 for (; i; --i) { \
01044 OP3(*res, *v1, *v2, f1, f2); \
01045 ++v1; ++v2; ++res; \
01046 } \
01047 }
01048
/**
 * Generates the function template
 *   FNAME(sz, _res, _v1, _v2, f2)
 * which applies OP3 element-wise to a result array and two source arrays,
 * with one additional loop-constant argument `f2` of type LCTYPE(T).
 *
 * The generated body prefetches the start of both sources and then consumes
 * the element counter `i` (initialised from sz) in three stages:
 *   1. VKERN_TEMPL_3V_PREF - prefetching unrolled loop (an empty statement
 *      when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL5 between UNR_KERNEL5_PREP / UNR_KERNEL5_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP3   macro invoked as OP3(res_elem, v1_elem, v2_elem, f1, f2).
 *              Only `f2` is declared in this variant, so OP3 is presumably
 *              expected to ignore `f1`.
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01050 #define VKERN_TEMPL_3V_C(FNAME,OP3) \
01051 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01052 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01053 const T* RESTRICT const, LCTYPED(T));) \
01054 template <typename T> \
01055 VEC_INLINE void FNAME (const unsigned long sz, \
01056 T* RESTRICT const _res, \
01057 const T* RESTRICT const _v1, \
01058 const T* RESTRICT const _v2, \
01059 LCTYPE(T) f2) \
01060 { \
01061 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
01062 register const T *v1 = _v1, *v2 = _v2; \
01063 register T *res = _res; \
01064 register long i = sz; \
01065 VKERN_TEMPL_3V_PREF(OP3,T); \
01066 \
01067 if (LIKELY(i >= UNROLL_DEPTH)) { \
01068 UNR_KERNEL5_PREP; \
01069 do { \
01070 UNR_KERNEL5(OP3); \
01071 } while (i >= UNROLL_DEPTH); \
01072 UNR_KERNEL5_FIX; \
01073 } \
01074 \
01075 for (; i; --i) { \
01076 OP3(*res, *v1, *v2, f1, f2); \
01077 ++v1; ++v2; ++res; \
01078 } \
01079 }
01080
/**
 * Generates the function template
 *   FNAME(sz, _res, _v1, _v2, f1, f2)
 * which applies OP3 element-wise to a result array and two source arrays,
 * with two additional loop-constant arguments `f1` and `f2` of type
 * LCTYPE(T).
 *
 * The generated body prefetches the start of both sources and then consumes
 * the element counter `i` (initialised from sz) in three stages:
 *   1. VKERN_TEMPL_3V_PREF - prefetching unrolled loop (an empty statement
 *      when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL5 between UNR_KERNEL5_PREP / UNR_KERNEL5_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP3   macro invoked as OP3(res_elem, v1_elem, v2_elem, f1, f2)
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01082 #define VKERN_TEMPL_3V_CC(FNAME,OP3) \
01083 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01084 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01085 const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
01086 template <typename T> \
01087 VEC_INLINE void FNAME (const unsigned long sz, \
01088 T* RESTRICT const _res, \
01089 const T* RESTRICT const _v1, \
01090 const T* RESTRICT const _v2, \
01091 LCTYPE(T) f1, \
01092 LCTYPE(T) f2) \
01093 { \
01094 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
01095 register long i = sz; \
01096 register const T *v1 = _v1, *v2 = _v2; \
01097 register T *res = _res; \
01098 VKERN_TEMPL_3V_PREF(OP3,T); \
01099 \
01100 if (LIKELY(i >= UNROLL_DEPTH)) { \
01101 UNR_KERNEL5_PREP; \
01102 do { \
01103 UNR_KERNEL5(OP3); \
01104 } while (i >= UNROLL_DEPTH); \
01105 UNR_KERNEL5_FIX; \
01106 } \
01107 \
01108 for (; i; --i) { \
01109 OP3(*res, *v1, *v2, f1, f2); \
01110 ++v1; ++v2; ++res; \
01111 } \
01112 }
01113
/**
 * Generates the function template
 *   FNAME(sz, _res, _v1)
 * which applies OP2 element-wise to a result array and one source array.
 *
 * The generated body prefetches the start of the source and then consumes
 * the element counter `i` (initialised from sz) in three stages:
 *   1. VKERN_TEMPL_2V_PREF - prefetching unrolled loop (an empty statement
 *      when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL4 between UNR_KERNEL4_PREP / UNR_KERNEL4_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP2   macro invoked as OP2(res_elem, v1_elem, f1, f2).
 *              This variant declares neither `f1` nor `f2`, so OP2 is
 *              presumably expected to ignore both arguments.
 *
 * NOTE(review): `res` is a writable T*, yet PREFETCH_R is passed as the
 * prefetch macro, whereas VKERN_TEMPL_2V_C and VKERN_TEMPL_2V_CC pass
 * PREFETCH_W. Confirm whether the difference is intentional.
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01115 #define VKERN_TEMPL_2V(FNAME,OP2) \
01116 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01117 (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
01118 template <typename T> \
01119 VEC_INLINE void FNAME (const unsigned long sz, \
01120 T* RESTRICT const _res, \
01121 const T* RESTRICT const _v1) \
01122 { \
01123 PREFETCH_R(_v1, 3); \
01124 register const T *v1 = _v1; \
01125 register T* res = _res; \
01126 register long i = sz; \
01127 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE); \
01128 \
01129 if (LIKELY(i >= UNROLL_DEPTH)) { \
01130 UNR_KERNEL4_PREP; \
01131 do { \
01132 UNR_KERNEL4(OP2); \
01133 } while (i >= UNROLL_DEPTH); \
01134 UNR_KERNEL4_FIX; \
01135 } \
01136 \
01137 for (; i; --i) { \
01138 OP2(*res, *v1, f1, f2); \
01139 ++v1; ++res; \
01140 } \
01141 }
01142
/**
 * Generates the function template
 *   FNAME(sz, _res, _v1, f2)
 * which applies OP2 element-wise to a result array and one source array,
 * with one additional loop-constant argument `f2` of type LCTYPE(T).
 *
 * The generated body prefetches the start of the source and then consumes
 * the element counter `i` (initialised from sz) in three stages:
 *   1. VKERN_TEMPL_2V_PREF with PREFETCH_W / CACHE_LOC_WRITE - prefetching
 *      unrolled loop (an empty statement when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL4 between UNR_KERNEL4_PREP / UNR_KERNEL4_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP2   macro invoked as OP2(res_elem, v1_elem, f1, f2).
 *              Only `f2` is declared in this variant, so OP2 is presumably
 *              expected to ignore `f1`.
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01144 #define VKERN_TEMPL_2V_C(FNAME,OP2) \
01145 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01146 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01147 LCTYPED(T));) \
01148 template <typename T> \
01149 VEC_INLINE void FNAME (const unsigned long sz, \
01150 T* RESTRICT const _res, \
01151 const T* RESTRICT const _v1, \
01152 LCTYPE(T) f2) \
01153 { \
01154 PREFETCH_R(_v1, 3); \
01155 register const T *v1 = _v1; \
01156 register T* res = _res; \
01157 register long i = sz; \
01158 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
01159 \
01160 if (LIKELY(i >= UNROLL_DEPTH)) { \
01161 UNR_KERNEL4_PREP; \
01162 do { \
01163 UNR_KERNEL4(OP2); \
01164 } while (i >= UNROLL_DEPTH); \
01165 UNR_KERNEL4_FIX; \
01166 } \
01167 \
01168 for (; i; --i) { \
01169 OP2(*res, *v1, f1, f2); \
01170 ++v1; ++res; \
01171 } \
01172 }
01173
/**
 * Generates the function template
 *   FNAME(sz, _res, _v1, f1, f2)
 * which applies OP2 element-wise to a result array and one source array,
 * with two additional loop-constant arguments `f1` and `f2` of type
 * LCTYPE(T).
 *
 * The generated body prefetches the start of the source and then consumes
 * the element counter `i` (initialised from sz) in three stages:
 *   1. VKERN_TEMPL_2V_PREF with PREFETCH_W / CACHE_LOC_WRITE - prefetching
 *      unrolled loop (an empty statement when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL4 between UNR_KERNEL4_PREP / UNR_KERNEL4_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP2   macro invoked as OP2(res_elem, v1_elem, f1, f2)
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01175 #define VKERN_TEMPL_2V_CC(FNAME,OP2) \
01176 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01177 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
01178 LCTYPED(T), LCTYPED(T));) \
01179 template <typename T> \
01180 VEC_INLINE void FNAME (const unsigned long sz, \
01181 T* RESTRICT const _res, \
01182 const T* RESTRICT const _v1, \
01183 LCTYPE(T) f1, \
01184 LCTYPE(T) f2) \
01185 { \
01186 PREFETCH_R(_v1, 3); \
01187 register const T *v1 = _v1; \
01188 register T* res = _res; \
01189 register long i = sz; \
01190 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
01191 \
01192 if (LIKELY(i >= UNROLL_DEPTH)) { \
01193 UNR_KERNEL4_PREP; \
01194 do { \
01195 UNR_KERNEL4(OP2); \
01196 } while (i >= UNROLL_DEPTH); \
01197 UNR_KERNEL4_FIX; \
01198 } \
01199 \
01200 for (; i; --i) { \
01201 OP2(*res, *v1, f1, f2); \
01202 ++v1; ++res; \
01203 } \
01204 }
01205
/**
 * Generates the function template
 *   FNAME(sz, _res, _v1, _f2)
 * which walks two read-only arrays and hands each element pair, together
 * with a working value `f2`, to OP2.
 *
 * Both arrays are const here. `_f2` is taken by reference, copied into a
 * local `f2` of type tbci_traits<TYPE>::loop_refval_type before the loops,
 * and assigned back from that local after the last loop - presumably so that
 * OP2 can accumulate a result into `f2`.
 *
 * The element counter `i` (initialised from sz) is consumed in three stages:
 *   1. VKERN_TEMPL_2V_PREF with PREFETCH_R / CACHE_LOC_READ - prefetching
 *      unrolled loop (an empty statement when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL4 between UNR_KERNEL4_PREP / UNR_KERNEL4_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP2   macro invoked as OP2(res_elem, v1_elem, f1, f2).
 *              `f1` is not declared in this variant, so OP2 is presumably
 *              expected to ignore it.
 * @param TYPE  type of the by-reference argument `_f2`
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01207 #define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
01208 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01209 (const unsigned long, const T* RESTRICT const, \
01210 const T* RESTRICT const, TYPE&);) \
01211 template <typename T> \
01212 VEC_INLINE void FNAME (const unsigned long sz, \
01213 const T* RESTRICT const _res, \
01214 const T* RESTRICT const _v1, \
01215 TYPE &_f2) \
01216 { \
01217 PREFETCH_R(_v1, 3); \
01218 register const T *v1 = _v1; \
01219 register const T* res = _res; \
01220 register typename tbci_traits<TYPE>::loop_refval_type f2(_f2); \
01221 register long i = sz; \
01222 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \
01223 \
01224 if (LIKELY(i >= UNROLL_DEPTH)) { \
01225 UNR_KERNEL4_PREP; \
01226 do { \
01227 UNR_KERNEL4(OP2); \
01228 } while (i >= UNROLL_DEPTH); \
01229 UNR_KERNEL4_FIX; \
01230 } \
01231 \
01232 for (; i; --i) { \
01233 OP2(*res, *v1, f1, f2); \
01234 ++v1; ++res; \
01235 } \
01236 _f2 = f2; \
01237 }
01238
/**
 * Generates the function template
 *   FNAME(sz, _res)
 * which applies OP1 to every element of a single writable array.
 *
 * The element counter `i` (initialised from sz) is consumed in three stages:
 *   1. VKERN_TEMPL_1V_PREF with PREFETCH_R / CACHE_LOC_WRITE - prefetching
 *      unrolled loop (an empty statement when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL3 between UNR_KERNEL3_PREP / UNR_KERNEL3_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP1   macro invoked as OP1(res_elem, f1, f2).
 *              This variant declares neither `f1` nor `f2`, so OP1 is
 *              presumably expected to ignore both arguments.
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01240 #define VKERN_TEMPL_1V(FNAME,OP1) \
01241 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01242 (const unsigned long, T* RESTRICT const);) \
01243 template <typename T> \
01244 VEC_INLINE void FNAME (const unsigned long sz, \
01245 T* RESTRICT const _res) \
01246 { \
01247 register long i = sz; \
01248 register T* res = _res; \
01249 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
01250 \
01251 if (LIKELY(i >= UNROLL_DEPTH)) { \
01252 UNR_KERNEL3_PREP; \
01253 do { \
01254 UNR_KERNEL3(OP1); \
01255 } while (i >= UNROLL_DEPTH); \
01256 UNR_KERNEL3_FIX; \
01257 } \
01258 \
01259 for (; i; --i) { \
01260 OP1(*res, f1, f2); \
01261 ++res; \
01262 } \
01263 }
01264
/**
 * Generates the function template
 *   FNAME(sz, _res, f2)
 * which applies OP1 to every element of a single writable array, with one
 * additional loop-constant argument `f2` of type LCTYPE(T).
 *
 * The element counter `i` (initialised from sz) is consumed in three stages:
 *   1. VKERN_TEMPL_1V_PREF with PREFETCH_R / CACHE_LOC_WRITE - prefetching
 *      unrolled loop (an empty statement when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL3 between UNR_KERNEL3_PREP / UNR_KERNEL3_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP1   macro invoked as OP1(res_elem, f1, f2).
 *              Only `f2` is declared in this variant, so OP1 is presumably
 *              expected to ignore `f1`.
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01266 #define VKERN_TEMPL_1V_C(FNAME,OP1) \
01267 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01268 (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
01269 template <typename T> \
01270 VEC_INLINE void FNAME (const unsigned long sz, \
01271 T* RESTRICT const _res, \
01272 LCTYPE(T) f2) \
01273 { \
01274 register long i = sz; \
01275 register T* res = _res; \
01276 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
01277 \
01278 if (LIKELY(i >= UNROLL_DEPTH)) { \
01279 UNR_KERNEL3_PREP; \
01280 do { \
01281 UNR_KERNEL3(OP1); \
01282 } while (i >= UNROLL_DEPTH); \
01283 UNR_KERNEL3_FIX; \
01284 } \
01285 \
01286 for (; i; --i) { \
01287 OP1(*res, f1, f2); \
01288 ++res; \
01289 } \
01290 }
01291
/**
 * Generates the function template
 *   FNAME(sz, _res, f1, f2)
 * which applies OP1 to every element of a single writable array, with two
 * additional loop-constant arguments `f1` and `f2` of type LCTYPE(T).
 *
 * The element counter `i` (initialised from sz) is consumed in three stages:
 *   1. VKERN_TEMPL_1V_PREF with PREFETCH_R / CACHE_LOC_WRITE - prefetching
 *      unrolled loop (an empty statement when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL3 between UNR_KERNEL3_PREP / UNR_KERNEL3_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP1   macro invoked as OP1(res_elem, f1, f2)
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01293 #define VKERN_TEMPL_1V_CC(FNAME,OP1) \
01294 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01295 (const unsigned long, T* RESTRICT const, \
01296 LCTYPED(T), LCTYPED(T));) \
01297 template <typename T> \
01298 VEC_INLINE void FNAME (const unsigned long sz, \
01299 T* RESTRICT const _res, \
01300 LCTYPE(T) f1, \
01301 LCTYPE(T) f2) \
01302 { \
01303 register long i = sz; \
01304 register T* res = _res; \
01305 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
01306 \
01307 if (LIKELY(i >= UNROLL_DEPTH)) { \
01308 UNR_KERNEL3_PREP; \
01309 do { \
01310 UNR_KERNEL3(OP1); \
01311 } while (i >= UNROLL_DEPTH); \
01312 UNR_KERNEL3_FIX; \
01313 } \
01314 \
01315 for (; i; --i) { \
01316 OP1(*res, f1, f2); \
01317 ++res; \
01318 } \
01319 }
01320
/**
 * Generates the function template
 *   FNAME(sz, _res, _f2)
 * which walks a single read-only array and hands each element, together
 * with a working value `f2`, to OP1.
 *
 * `_f2` is taken by reference, copied into a local `f2` of type
 * tbci_traits<TYPE>::loop_refval_type before the loops, and assigned back
 * from that local after the last loop - presumably so that OP1 can
 * accumulate a result into `f2`.
 *
 * The element counter `i` (initialised from sz) is consumed in three stages:
 *   1. VKERN_TEMPL_1V_PREF with PREFETCH_R / CACHE_LOC_READ - prefetching
 *      unrolled loop (an empty statement when USE_PREFETCH is not defined);
 *   2. UNR_KERNEL3 between UNR_KERNEL3_PREP / UNR_KERNEL3_FIX, repeated
 *      while i >= UNROLL_DEPTH;
 *   3. a one-element-per-iteration loop until `i` reaches zero.
 *
 * @param FNAME name of the generated function
 * @param OP1   macro invoked as OP1(res_elem, f1, f2).
 *              `f1` is not declared in this variant, so OP1 is presumably
 *              expected to ignore it.
 * @param TYPE  type of the by-reference argument `_f2`
 *
 * Unlike the other generators here, the array pointer parameter is not
 * RESTRICT-qualified.
 *
 * INST(...) wraps the matching friend declaration for Vector<T>; INST is
 * defined outside this part of the file.
 */
01322 #define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
01323 INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
01324 (const unsigned long, const T* const, TYPE&);) \
01325 template <typename T> \
01326 VEC_INLINE void FNAME (const unsigned long sz, \
01327 const T* const _res, \
01328 TYPE &_f2) \
01329 { \
01330 register typename tbci_traits<TYPE>::loop_refval_type f2(_f2); \
01331 register const T* res = _res; \
01332 register long i = sz; \
01333 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
01334 \
01335 if (LIKELY(i >= UNROLL_DEPTH)) { \
01336 UNR_KERNEL3_PREP; \
01337 do { \
01338 UNR_KERNEL3(OP1); \
01339 } while (i >= UNROLL_DEPTH); \
01340 UNR_KERNEL3_FIX; \
01341 } \
01342 \
01343 for (; i; --i) { \
01344 OP1(*res, f1, f2); \
01345 ++res; \
01346 } \
01347 _f2 = f2; \
01348 }
01349
01350 #endif