Go to the source code of this file.
Defines | |
| #define | LCTYPE(T) register typename tbci_traits<T>::loop_const_refval_type |
| Shortcut for loop const ref type. | |
| #define | LCTYPED(T) register tbci_traits<T>::loop_const_refval_type |
| #define | UNROLL_DEPTH 4 |
When unrolling the loops, I had the following architectural details in mind:
| |
| #define | UNROLL1_PREF_KERNEL5(OPER, T, CA0, CA1, CA2) |
| Non-unrolled kernel for 5 args with prefetching. | |
| #define | UNROLL1_KERNEL5(OPER) |
| Non-unrolled kernel for 5 args without prefetching. | |
| #define | UNROLL1_KERNEL5_PREPARE do {} while(0) |
| #define | UNROLL1_KERNEL5_FIXUP do {} while(0) |
| #define | UNROLL2_PREF_KERNEL5(OPER, T, CA0, CA1, CA2) |
| Twice unrolled kernel for 5 args with prefetching. | |
| #define | UNROLL2_KERNEL5(OPER) |
| Twice unrolled kernel for 5 args without prefetching. | |
| #define | UNROLL2_KERNEL5_PREPARE do {} while(0) |
| #define | UNROLL2_KERNEL5_FIXUP do {} while(0) |
| #define | UNROLL4_PREF_KERNEL5(OPER, T, CA0, CA1, CA2) |
| Four times unrolled kernel for 5 args with prefetching. | |
| #define | UNROLL4_KERNEL5(OPER) |
| Four times unrolled kernel for 5 args without prefetching. | |
| #define | UNROLL4_KERNEL5_PREPARE do {} while(0) |
| #define | UNROLL4_KERNEL5_FIXUP do {} while(0) |
| #define | UNROLL8_PREF_KERNEL5(OPER, T, CA0, CA1, CA2) |
| Eight times unrolled kernel for 5 args with prefetching. | |
| #define | UNROLL8_KERNEL5(OPER) |
| Eight times unrolled kernel for 5 args without prefetching. | |
| #define | UNROLL8_KERNEL5_PREPARE do {} while(0) |
| #define | UNROLL8_KERNEL5_FIXUP do {} while(0) |
| #define | PREF_AHEAD3(T, CA0, CA1, CA2) |
| Initial prefetch ahead (3 pointers). | |
| #define | UNROLL1_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1) |
| Non-unrolled kernel for 4 args with prefetching. | |
| #define | UNROLL1_KERNEL4(OPER) |
| Non-unrolled kernel for 4 args without prefetching. | |
| #define | UNROLL1_KERNEL4_PREPARE do {} while(0) |
| #define | UNROLL1_KERNEL4_FIXUP do {} while(0) |
| #define | UNROLL2_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1) |
| Twice unrolled kernel for 4 args with prefetching. | |
| #define | UNROLL2_KERNEL4(OPER) |
| Twice unrolled kernel for 4 args without prefetching. | |
| #define | UNROLL2_KERNEL4_PREPARE do {} while(0) |
| #define | UNROLL2_KERNEL4_FIXUP do {} while(0) |
| #define | UNROLL4_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1) |
| Four times unrolled kernel for 4 args with prefetching. | |
| #define | UNROLL4_KERNEL4(OPER) |
| Four times unrolled kernel for 4 args without prefetching. | |
| #define | UNROLL4_KERNEL4_PREPARE do {} while(0) |
| #define | UNROLL4_KERNEL4_FIXUP do {} while(0) |
| #define | UNROLL8_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1) |
| Eight times unrolled kernel for 4 args with prefetching. | |
| #define | UNROLL8_KERNEL4(OPER) |
| Eight times unrolled kernel for 4 args without prefetching. | |
| #define | UNROLL8_KERNEL4_PREPARE do {} while(0) |
| #define | UNROLL8_KERNEL4_FIXUP do {} while(0) |
| #define | PREF_AHEAD2(T, PREFETCH_X, CA0, CA1) |
| Initial prefetch ahead (2 pointers). | |
| #define | UNROLL1_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0) |
| Non-unrolled kernel for 3 args with prefetching. | |
| #define | UNROLL1_KERNEL3(OPER) |
| Non-unrolled kernel for 3 args without prefetching. | |
| #define | UNROLL1_KERNEL3_PREPARE do {} while(0) |
| #define | UNROLL1_KERNEL3_FIXUP do {} while(0) |
| #define | UNROLL2_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0) |
| Twice unrolled kernel for 3 args with prefetching. | |
| #define | UNROLL2_KERNEL3(OPER) |
| Twice unrolled kernel for 3 args without prefetching. | |
| #define | UNROLL2_KERNEL3_PREPARE do {} while(0) |
| #define | UNROLL2_KERNEL3_FIXUP do {} while(0) |
| #define | UNROLL4_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0) |
| Four times unrolled kernel for 3 args with prefetching. | |
| #define | UNROLL4_KERNEL3(OPER) |
| Four times unrolled kernel for 3 args without prefetching. | |
| #define | UNROLL4_KERNEL3_PREPARE do {} while(0) |
| #define | UNROLL4_KERNEL3_FIXUP do {} while(0) |
| #define | UNROLL8_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0) |
| Eight times unrolled kernel for 3 args with prefetching. | |
| #define | UNROLL8_KERNEL3(OPER) |
| Eight times unrolled kernel for 3 args without prefetching. | |
| #define | UNROLL8_KERNEL3_PREPARE do {} while(0) |
| #define | UNROLL8_KERNEL3_FIXUP do {} while(0) |
| #define | PREF_AHEAD1(T, PREFETCH_X, CA0) |
| Initial prefetch ahead (1 pointer). | |
| #define | UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5 |
| #define | UNR_KERNEL5 UNROLL4_KERNEL5 |
| #define | UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE |
| #define | UNR_KERNEL5_FIX UNROLL4_KERNEL5_FIXUP |
| #define | UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4 |
| #define | UNR_KERNEL4 UNROLL4_KERNEL4 |
| #define | UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE |
| #define | UNR_KERNEL4_FIX UNROLL4_KERNEL4_FIXUP |
| #define | UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3 |
| #define | UNR_KERNEL3 UNROLL4_KERNEL3 |
| #define | UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE |
| #define | UNR_KERNEL3_FIX UNROLL4_KERNEL3_FIXUP |
| #define | VKERN_TEMPL_3V_PREF(OP, T) do {} while (0) |
| Fragments to be combined for different cases 1,2,3 vector fields 0,1,2 scalars to multiply with variable number of data elements per cacheline 1,2,4,8,16 cachelines ahead prefetch 1,2,4,8 fold unrolling. | |
| #define | VKERN_TEMPL_2V_PREF(OP, T, PREFETCH_X, CW) do {} while (0) |
| #define | VKERN_TEMPL_1V_PREF(OP, T, PREFETCH_X, CW) do {} while (0) |
| #define | VKERN_TEMPL_3V(FNAME, OP3) |
| gcc-2.95.x seems to fail caching a const double& in a register. | |
| #define | VKERN_TEMPL_3V_C(FNAME, OP3) |
| Operations of type vec = vec OP val * vec. | |
| #define | VKERN_TEMPL_3V_CC(FNAME, OP3) |
| Operations of type vec = val * vec OP val * vec. | |
| #define | VKERN_TEMPL_2V(FNAME, OP2) |
| Operations of type vec OP= vec. | |
| #define | VKERN_TEMPL_2V_C(FNAME, OP2) |
| Operations of type VEC = VEC OP VAL or VAL OP VEC. | |
| #define | VKERN_TEMPL_2V_CC(FNAME, OP2) |
| Operations of type VEC = VEC OP VAL or VAL OP VEC. | |
| #define | VKERN_TEMPL_2V_T(FNAME, OP2, TYPE) |
| Operations of type TYPE = VEC OP VEC. | |
| #define | VKERN_TEMPL_1V(FNAME, OP1) |
| Operations of type VEC = OP self. | |
| #define | VKERN_TEMPL_1V_C(FNAME, OP1) |
| Operations of type VEC OP= VAL. | |
| #define | VKERN_TEMPL_1V_CC(FNAME, OP1) |
| Operations of type VEC *= S OP= VAL. | |
| #define | VKERN_TEMPL_1V_T(FNAME, OP1, TYPE) |
| Operations of type TYPE = OP VEC. | |
(c) Kurt Garloff, <kurt@garloff.de>, 7/2002, GNU LGPL v2
Definition in file unroll_prefetch_def.h.
| #define LCTYPE | ( | T | ) | register typename tbci_traits<T>::loop_const_refval_type |
| #define LCTYPED | ( | T | ) | register tbci_traits<T>::loop_const_refval_type |
Definition at line 15 of file unroll_prefetch_def.h.
| #define PREF_AHEAD1 | ( | T, | |||
| PREFETCH_X, | |||||
| CA0 | ) |
| #define PREF_AHEAD2 | ( | T, | |||
| PREFETCH_X, | |||||
| CA0, | |||||
| CA1 | ) |
| #define PREF_AHEAD3 | ( | T, | |||
| CA0, | |||||
| CA1, | |||||
| CA2 | ) |
| #define UNR_KERNEL3 UNROLL4_KERNEL3 |
Definition at line 914 of file unroll_prefetch_def.h.
| #define UNR_KERNEL3_FIX UNROLL4_KERNEL3_FIXUP |
Definition at line 916 of file unroll_prefetch_def.h.
| #define UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE |
Definition at line 915 of file unroll_prefetch_def.h.
| #define UNR_KERNEL4 UNROLL4_KERNEL4 |
Definition at line 909 of file unroll_prefetch_def.h.
| #define UNR_KERNEL4_FIX UNROLL4_KERNEL4_FIXUP |
Definition at line 911 of file unroll_prefetch_def.h.
| #define UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE |
Definition at line 910 of file unroll_prefetch_def.h.
| #define UNR_KERNEL5 UNROLL4_KERNEL5 |
Definition at line 904 of file unroll_prefetch_def.h.
| #define UNR_KERNEL5_FIX UNROLL4_KERNEL5_FIXUP |
Definition at line 906 of file unroll_prefetch_def.h.
| #define UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE |
Definition at line 905 of file unroll_prefetch_def.h.
| #define UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3 |
Definition at line 913 of file unroll_prefetch_def.h.
| #define UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4 |
Definition at line 908 of file unroll_prefetch_def.h.
| #define UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5 |
Definition at line 903 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL3 | ( | OPER | ) |
Value:
--i; \
OPER(res[0], f1, f2); \
++res
Definition at line 666 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL3_FIXUP do {} while(0) |
Definition at line 672 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL3_PREPARE do {} while(0) |
Definition at line 671 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL4 | ( | OPER | ) |
Value:
--i; \
OPER(res[0], v1[0], f1, f2); \
++v1; ++res
Definition at line 395 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL4_FIXUP do {} while(0) |
Definition at line 401 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL4_PREPARE do {} while(0) |
Definition at line 400 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL5 | ( | OPER | ) |
Value:
--i; \
OPER(res[0], v1[0], v2[0], f1, f2); \
++v1; ++v2; ++res
Definition at line 59 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL5_FIXUP do {} while(0) |
Definition at line 65 of file unroll_prefetch_def.h.
| #define UNROLL1_KERNEL5_PREPARE do {} while(0) |
Definition at line 64 of file unroll_prefetch_def.h.
| #define UNROLL1_PREF_KERNEL3 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0 | ) |
Value:
Non-unrolled kernel for 3 args with prefetching.
Definition at line 659 of file unroll_prefetch_def.h.
| #define UNROLL1_PREF_KERNEL4 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0, | |||||
| CA1 | ) |
Value:
OPER(res[0], v1[0], f1, f2); \
--i; \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
++v1; \
PREFETCH_X(res+PREF_OFFS(T), CA0); \
++res
Definition at line 386 of file unroll_prefetch_def.h.
| #define UNROLL1_PREF_KERNEL5 | ( | OPER, | |||
| T, | |||||
| CA0, | |||||
| CA1, | |||||
| CA2 | ) |
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
--i; \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
++v1; \
PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
++v2; \
PREFETCH_W(res+PREF_OFFS(T), CA0); \
++res
Definition at line 48 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL3 | ( | OPER | ) |
Value:
OPER(res[0], f1, f2); \
i -= 2; \
OPER(res[1], f1, f2); \
res += 2
Definition at line 694 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL3_FIXUP do {} while(0) |
Definition at line 701 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL3_PREPARE do {} while(0) |
Definition at line 700 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL4 | ( | OPER | ) |
Value:
OPER(res[0], v1[0], f1, f2); \
v1 += 2; i -= 2; \
OPER(res[1], v1[-1],f1, f2); \
res += 2
Definition at line 428 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL4_FIXUP do {} while(0) |
Definition at line 435 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL4_PREPARE do {} while(0) |
Definition at line 434 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL5 | ( | OPER | ) |
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
v1 += 2; i -= 2; \
OPER(res[1], v1[-1], v2[1], f1, f2); \
v2 += 2; res += 2
Definition at line 97 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL5_FIXUP do {} while(0) |
Definition at line 104 of file unroll_prefetch_def.h.
| #define UNROLL2_KERNEL5_PREPARE do {} while(0) |
Definition at line 103 of file unroll_prefetch_def.h.
| #define UNROLL2_PREF_KERNEL3 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0 | ) |
Value:
if (EL_PER_CL(T) <= 1) { \ OPER(res[0], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T), CA0); \ i -= 2; \ OPER(res[1], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \ res += 2; \ } else { \ OPER(res[0], f1, f2); \ i -= 2; \ OPER(res[1], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T), CA0); \ res += 2; \ } \
Definition at line 676 of file unroll_prefetch_def.h.
| #define UNROLL2_PREF_KERNEL4 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0, | |||||
| CA1 | ) |
Value:
if (EL_PER_CL(T) <= 1) { \ i -= 2; \ OPER(res[0], v1[0], f1, f2); \ PREFETCH_R(v1 +PREF_OFFS(T), CA1); \ PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \ OPER(res[1], v1[1], f1, f2); \ v1 += 2; \ PREFETCH_X(res+PREF_OFFS(T), CA0); \ PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \ res += 2; \ } else { \ i -= 2; \ OPER(res[0], v1[0], f1, f2); \ PREFETCH_R(v1 +PREF_OFFS(T), CA1); \ OPER(res[1], v1[1], f1, f2); \ v1 += 2; \ PREFETCH_X(res+PREF_OFFS(T), CA0); \ res += 2; \ } \
Definition at line 405 of file unroll_prefetch_def.h.
| #define UNROLL2_PREF_KERNEL5 | ( | OPER, | |||
| T, | |||||
| CA0, | |||||
| CA1, | |||||
| CA2 | ) |
Value:
if (EL_PER_CL(T) <= 1) { \ i -= 2; \ OPER(res[0], v1[0], v2[0], f1, f2); \ PREFETCH_R(v1 +PREF_OFFS(T), CA1); \ PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \ v1 += 2; \ PREFETCH_R(v2 +PREF_OFFS(T), CA2); \ PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \ OPER(res[1], v1[-1], v2[1], f1, f2); \ v2 += 2; \ PREFETCH_W(res+PREF_OFFS(T), CA0); \ PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \ res += 2; \ } else { \ i -= 2; \ OPER(res[0], v1[0], v2[0], f1, f2); \ PREFETCH_R(v1 +PREF_OFFS(T), CA1); \ v1 += 2; \ PREFETCH_R(v2 +PREF_OFFS(T), CA2); \ OPER(res[1], v1[-1], v2[1], f1, f2); \ v2 += 2; \ PREFETCH_W(res+PREF_OFFS(T), CA0); \ res += 2; \ } \
Definition at line 69 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL3 | ( | OPER | ) |
Value:
OPER(res[0], f1, f2); \
OPER(res[1], f1, f2); \
i -= 4; \
OPER(res[2], f1, f2); \
OPER(res[3], f1, f2); \
res += 4
Definition at line 737 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL3_FIXUP do {} while(0) |
Definition at line 746 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL3_PREPARE do {} while(0) |
Definition at line 745 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL4 | ( | OPER | ) |
Value:
OPER(res[0], v1[0], f1, f2); \
OPER(res[1], v1[1], f1, f2); \
v1 += 4; i -= 4; \
OPER(res[2], v1[-2], f1, f2); \
OPER(res[3], v1[-1], f1, f2); \
res += 4
Definition at line 481 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL4_FIXUP do {} while(0) |
Definition at line 490 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL4_PREPARE do {} while(0) |
Definition at line 489 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL5 | ( | OPER | ) |
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
i -= 4; \
OPER(res[1], v1[1], v2[1], f1, f2); \
v1 += 4; \
OPER(res[2], v1[-2], v2[2], f1, f2); \
v2 += 4; \
OPER(res[3], v1[-1], v2[-1], f1, f2); \
res += 4
Definition at line 160 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL5_FIXUP do {} while(0) |
Definition at line 171 of file unroll_prefetch_def.h.
| #define UNROLL4_KERNEL5_PREPARE do {} while(0) |
Definition at line 170 of file unroll_prefetch_def.h.
| #define UNROLL4_PREF_KERNEL3 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0 | ) |
Value:
if (EL_PER_CL(T) <= 1) { \ OPER(res[0], f1, f2); \ i -= 4; \ PREFETCH_X(res+PREF_OFFS(T), CA0); \ OPER(res[1], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \ OPER(res[2], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \ OPER(res[3], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \ res += 4; \ } else if (EL_PER_CL(T) <= 2) { \ OPER(res[0], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T), CA0); \ OPER(res[1], f1, f2); \ i -= 4; \ OPER(res[2], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \ OPER(res[3], f1, f2); \ res += 4; \ } else { \ OPER(res[0], f1, f2); \ i -= 4; \ OPER(res[1], f1, f2); \ OPER(res[2], f1, f2); \ PREFETCH_X(res+PREF_OFFS(T), CA0); \ OPER(res[3], f1, f2); \ res += 4; \ }
Definition at line 705 of file unroll_prefetch_def.h.
| #define UNROLL4_PREF_KERNEL4 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0, | |||||
| CA1 | ) |
Four times unrolled kernel for 4 args with prefetching.
Definition at line 439 of file unroll_prefetch_def.h.
| #define UNROLL4_PREF_KERNEL5 | ( | OPER, | |||
| T, | |||||
| CA0, | |||||
| CA1, | |||||
| CA2 | ) |
Four times unrolled kernel for 5 args with prefetching.
Definition at line 108 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL3 | ( | OPER | ) |
Value:
OPER(res[0], f1, f2); \
OPER(res[1], f1, f2); \
OPER(res[2], f1, f2); \
OPER(res[3], f1, f2); \
i -= 8; \
OPER(res[4], f1, f2); \
OPER(res[5], f1, f2); \
OPER(res[6], f1, f2); \
OPER(res[7], f1, f2); \
res += 8
Definition at line 814 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL3_FIXUP do {} while(0) |
Definition at line 827 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL3_PREPARE do {} while(0) |
Definition at line 826 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL4 | ( | OPER | ) |
Value:
OPER(res[0], v1[0], f1, f2); \
OPER(res[1], v1[1], f1, f2); \
OPER(res[2], v1[2], f1, f2); \
OPER(res[3], v1[3], f1, f2); \
v1 += 8; i -= 8; \
OPER(res[4], v1[-4], f1, f2); \
OPER(res[5], v1[-3], f1, f2); \
OPER(res[6], v1[-2], f1, f2); \
OPER(res[7], v1[-1], f1, f2); \
res += 8
Definition at line 577 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL4_FIXUP do {} while(0) |
Definition at line 590 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL4_PREPARE do {} while(0) |
Definition at line 589 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL5 | ( | OPER | ) |
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
OPER(res[1], v1[1], v2[1], f1, f2); \
i -= 8; \
OPER(res[2], v1[2], v2[2], f1, f2); \
OPER(res[3], v1[3], v2[3], f1, f2); \
v1 += 8; \
OPER(res[4], v1[-4], v2[4], f1, f2); \
OPER(res[5], v1[-3], v2[5], f1, f2); \
v2 += 8; \
OPER(res[6], v1[-2], v2[-2], f1, f2); \
OPER(res[7], v1[-1], v2[-1], f1, f2); \
res += 8
Definition at line 277 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL5_FIXUP do {} while(0) |
Definition at line 292 of file unroll_prefetch_def.h.
| #define UNROLL8_KERNEL5_PREPARE do {} while(0) |
Definition at line 291 of file unroll_prefetch_def.h.
| #define UNROLL8_PREF_KERNEL3 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0 | ) |
Eight times unrolled kernel for 3 args with prefetching.
Definition at line 750 of file unroll_prefetch_def.h.
| #define UNROLL8_PREF_KERNEL4 | ( | OPER, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CA0, | |||||
| CA1 | ) |
Eight times unrolled kernel for 4 args with prefetching.
Definition at line 494 of file unroll_prefetch_def.h.
| #define UNROLL8_PREF_KERNEL5 | ( | OPER, | |||
| T, | |||||
| CA0, | |||||
| CA1, | |||||
| CA2 | ) |
Eight times unrolled kernel for 5 args with prefetching.
Definition at line 175 of file unroll_prefetch_def.h.
| #define UNROLL_DEPTH 4 |
When unrolling the loops, I had the following architectural details in mind:
Which means: * We can execute more than one instruction in parallel per cycle. This was the reason to mix FP and integer insns. * We should have some delay between doing a computation and using the result, as the computation has to go through the pipeline before the result becomes available.
Funny enough, with this little knowledge, we do better than any compiler I found. Compaq cxx on alpha comes close, though. KG.
Definition at line 40 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_1V | ( | FNAME, | |||
| OP1 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const);) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res) \ { \ register long i = sz; \ register T* res = _res; \ VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL3_PREP; \ do { \ UNR_KERNEL3(OP1); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL3_FIX; \ } \ \ for (; i; --i) { \ OP1(*res, f1, f2); \ ++res; \ } \ }
Definition at line 1240 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_1V_C | ( | FNAME, | |||
| OP1 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, LCTYPED(T));) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ LCTYPE(T) f2) \ { \ register long i = sz; \ register T* res = _res; \ VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL3_PREP; \ do { \ UNR_KERNEL3(OP1); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL3_FIX; \ } \ \ for (; i; --i) { \ OP1(*res, f1, f2); \ ++res; \ } \ }
Definition at line 1266 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_1V_CC | ( | FNAME, | |||
| OP1 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, \ LCTYPED(T), LCTYPED(T));) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ LCTYPE(T) f1, \ LCTYPE(T) f2) \ { \ register long i = sz; \ register T* res = _res; \ VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL3_PREP; \ do { \ UNR_KERNEL3(OP1); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL3_FIX; \ } \ \ for (; i; --i) { \ OP1(*res, f1, f2); \ ++res; \ } \ }
Definition at line 1293 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_1V_PREF | ( | OP, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CW | ) | do {} while (0) |
Definition at line 1000 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_1V_T | ( | FNAME, | |||
| OP1, | |||||
| TYPE | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, const T* const, TYPE&);) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ const T* const _res, \ TYPE &_f2) \ { \ register typename tbci_traits<TYPE>::loop_refval_type f2(_f2); \ register const T* res = _res; \ register long i = sz; \ VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL3_PREP; \ do { \ UNR_KERNEL3(OP1); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL3_FIX; \ } \ \ for (; i; --i) { \ OP1(*res, f1, f2); \ ++res; \ } \ _f2 = f2; \ }
Definition at line 1322 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_2V | ( | FNAME, | |||
| OP2 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ const T* RESTRICT const _v1) \ { \ PREFETCH_R(_v1, 3); \ register const T *v1 = _v1; \ register T* res = _res; \ register long i = sz; \ VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL4_PREP; \ do { \ UNR_KERNEL4(OP2); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL4_FIX; \ } \ \ for (; i; --i) { \ OP2(*res, *v1, f1, f2); \ ++v1; ++res; \ } \ }
Definition at line 1115 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_2V_C | ( | FNAME, | |||
| OP2 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \ LCTYPED(T));) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ const T* RESTRICT const _v1, \ LCTYPE(T) f2) \ { \ PREFETCH_R(_v1, 3); \ register const T *v1 = _v1; \ register T* res = _res; \ register long i = sz; \ VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL4_PREP; \ do { \ UNR_KERNEL4(OP2); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL4_FIX; \ } \ \ for (; i; --i) { \ OP2(*res, *v1, f1, f2); \ ++v1; ++res; \ } \ }
Definition at line 1144 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_2V_CC | ( | FNAME, | |||
| OP2 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \ LCTYPED(T), LCTYPED(T));) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ const T* RESTRICT const _v1, \ LCTYPE(T) f1, \ LCTYPE(T) f2) \ { \ PREFETCH_R(_v1, 3); \ register const T *v1 = _v1; \ register T* res = _res; \ register long i = sz; \ VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL4_PREP; \ do { \ UNR_KERNEL4(OP2); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL4_FIX; \ } \ \ for (; i; --i) { \ OP2(*res, *v1, f1, f2); \ ++v1; ++res; \ } \ }
Definition at line 1175 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_2V_PREF | ( | OP, | |||
| T, | |||||
| PREFETCH_X, | |||||
| CW | ) | do {} while (0) |
Definition at line 999 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_2V_T | ( | FNAME, | |||
| OP2, | |||||
| TYPE | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, const T* RESTRICT const, \ const T* RESTRICT const, TYPE&);) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ const T* RESTRICT const _res, \ const T* RESTRICT const _v1, \ TYPE &_f2) \ { \ PREFETCH_R(_v1, 3); \ register const T *v1 = _v1; \ register const T* res = _res; \ register typename tbci_traits<TYPE>::loop_refval_type f2(_f2); \ register long i = sz; \ VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL4_PREP; \ do { \ UNR_KERNEL4(OP2); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL4_FIX; \ } \ \ for (; i; --i) { \ OP2(*res, *v1, f1, f2); \ ++v1; ++res; \ } \ _f2 = f2; \ }
Definition at line 1207 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_3V | ( | FNAME, | |||
| OP3 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ const T* RESTRICT const _v1, \ const T* RESTRICT const _v2) \ { \ PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \ register const T *v1 = _v1, *v2 = _v2; \ register T *res = _res; \ register long i = sz; \ VKERN_TEMPL_3V_PREF(OP3,T); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL5_PREP; \ do { \ UNR_KERNEL5(OP3); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL5_FIX; \ } \ \ for (; i; --i) { \ OP3(*res, *v1, *v2, f1, f2); \ ++v1; ++v2; ++res; \ } \ }
So we have to use a local register variable to force it to do so, for maximum performance. However, this is only beneficial when we have an elementary type that fits into a register. It would be nice to have macros that do this automatically when needed. However, sizeof(T) can't be evaluated by the preprocessor, so we can't know. Instead we use explicit specialization of our templates. Operations of type vec = vec OP vec.
Definition at line 1020 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_3V_C | ( | FNAME, | |||
| OP3 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \ const T* RESTRICT const, LCTYPED(T));) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ const T* RESTRICT const _v1, \ const T* RESTRICT const _v2, \ LCTYPE(T) f2) \ { \ PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \ register const T *v1 = _v1, *v2 = _v2; \ register T *res = _res; \ register long i = sz; \ VKERN_TEMPL_3V_PREF(OP3,T); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL5_PREP; \ do { \ UNR_KERNEL5(OP3); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL5_FIX; \ } \ \ for (; i; --i) { \ OP3(*res, *v1, *v2, f1, f2); \ ++v1; ++v2; ++res; \ } \ }
Definition at line 1050 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_3V_CC | ( | FNAME, | |||
| OP3 | ) |
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \ (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \ const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \ template <typename T> \ VEC_INLINE void FNAME (const unsigned long sz, \ T* RESTRICT const _res, \ const T* RESTRICT const _v1, \ const T* RESTRICT const _v2, \ LCTYPE(T) f1, \ LCTYPE(T) f2) \ { \ PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \ register long i = sz; \ register const T *v1 = _v1, *v2 = _v2; \ register T *res = _res; \ VKERN_TEMPL_3V_PREF(OP3,T); \ \ if (LIKELY(i >= UNROLL_DEPTH)) { \ UNR_KERNEL5_PREP; \ do { \ UNR_KERNEL5(OP3); \ } while (i >= UNROLL_DEPTH); \ UNR_KERNEL5_FIX; \ } \ \ for (; i; --i) { \ OP3(*res, *v1, *v2, f1, f2); \ ++v1; ++v2; ++res; \ } \ }
Definition at line 1082 of file unroll_prefetch_def.h.
| #define VKERN_TEMPL_3V_PREF | ( | OP, | |||
| T | ) | do {} while (0) |
Fragments to be combined for different cases 1,2,3 vector fields 0,1,2 scalars to multiply with variable number of data elements per cacheline 1,2,4,8,16 cachelines ahead prefetch 1,2,4,8 fold unrolling.
The structure is always the same: (1) Before anything else, start read prefetching. (2) Unrolled loop with prefetching (both read and write). (3) Unrolled loop (for the elements where prefetching would reach beyond the array, which could be a performance problem and, for write prefetching, maybe a real problem). (4) Non-unrolled loop for the remaining elements.
Definition at line 998 of file unroll_prefetch_def.h.
1.5.6