00001
00007 #ifndef TBCI_MATRIX_KERNELS_H
00008 #define TBCI_MATRIX_KERNELS_H
00009
00010 template <typename T> class Matrix;
00011 template <typename T> class TMatrix;
00012 template <typename T> class TSMatrix;
00013 template <typename T> class Vector;
00014 template <typename T> class TVector;
00015 template <typename T> class TSVector;
00016
00017
00018
00019 #ifdef OLD_MAT_MAT_MULT
00020 template <typename T>
00021 void do_mat_mat_mult (const unsigned start, const unsigned end,
00022 TMatrix<T> *res, const Matrix<T> *a,
00023 const Matrix<T> *b)
00024 {
00025 const unsigned bc = b->columns();
00026 const unsigned ac = a->columns();
00027 for (unsigned i=start; i<end; ++i) {
00028 for (unsigned j=0; j<bc; ++j) {
00029 PREFETCH_R(a->getcolptr(i), 2);
00030 PREFETCH_W(&res->setval(i,j), 2);
00031 ALIGN2 (register T tmp (a->get(i,0) * b->get(0,j)), MIN_ALIGN2);
00032 for (register unsigned l=1; l<ac; ++l)
00033 tmp += a->get(i,l) * b->get(l,j);
00034 res->set(tmp,i,j);
00035 }
00036 }
00037 }
00038 #else
00039
00041
00042
00043
00044
00045
00046
00047
00048 template <typename T>
00049 void do_mat_mat_mult (const unsigned start, const unsigned end,
00050 TMatrix<T> *res,
00051 const Matrix<T> *a, const Matrix<T> *b)
00052 {
00053 const unsigned bc = b->columns();
00054 const unsigned ac = a->columns();
00055 for (unsigned i=start; i<end; ++i) {
00056 const T* RESTRICT aptr = a->getcolptr(i);
00057 PREFETCH_R(aptr,2);
00058 for (unsigned l=0; l<ac; ++l) {
00059 T* RESTRICT rptr = res->getcolptr(i);
00060 const T* RESTRICT bptr = b->getcolptr(l);
00061 int n = bc - 8;
00062 PREFETCH_R(rptr,2); PREFETCH_R(bptr,1);
00063 ALIGN3(const register T tmp, *aptr, MIN_ALIGN2);
00064 if (LIKELY(n >= 0)) {
00065 if (sizeof(T)*2 >= CACHELINE_SZ) {
00066 PREFETCH_R(rptr+2,2); PREFETCH_R(bptr+2,1);
00067 }
00068 PREFETCH_R(rptr+4,2); PREFETCH_R(bptr+4,1);
00069 if (sizeof(T)*2 >= CACHELINE_SZ) {
00070 PREFETCH_R(rptr+6,2); PREFETCH_R(bptr+6,1);
00071 }
00072 for (; n > 3; n -= 4) {
00073 *rptr += tmp * *bptr;
00074 PREFETCH_R(rptr+ 8,2);
00075 *(rptr+1) += tmp * *(bptr+1);
00076 PREFETCH_R(bptr+ 8,1);
00077 *(rptr+2) += tmp * *(bptr+2);
00078 if (sizeof(T)*2 >= CACHELINE_SZ)
00079 PREFETCH_R(rptr+10,2);
00080 rptr += 4;
00081 if (sizeof(T)*2 >= CACHELINE_SZ)
00082 PREFETCH_R(bptr+10,1);
00083 *(rptr-1) += tmp * *(bptr+3);
00084 bptr += 4;
00085 }
00086 }
00087 if (LIKELY(l+4 < ac-CACHELINE_SZ/sizeof(T)))
00088 PREFETCH_R(aptr+4, 2);
00089 for (n += 8; n; --n)
00090 *rptr++ += tmp * *bptr++;
00091 aptr++;
00092 }
00093 }
00094 }
00095 #endif
00096
00097
00098 #define COST_MATVEC(r,c) (r*(COST_UNIT_STORE+COST_LOOP \
00099 +c*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP)))
00100
00101 template <typename T>
00102 void do_mat_vec_mult (const unsigned start, const unsigned end,
00103 TVector<T> *res,
00104 const Matrix<T> *mat, const Vector<T> *vec)
00105 {
00106 #ifdef DEBUG_THREAD
00107 fprintf (stderr, "do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
00108 #endif
00109 PREFETCH_R(vec->vec,3);
00110 const unsigned mc = mat->col;
00111 #ifdef TBCI_SIMD_SUM
00112 if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
00113 for (unsigned rw = start; rw < end; ++rw) {
00114 res->vec[rw] = T(0.0);
00115 #if 0
00116 if (!((unsigned long)mat->mat[rw] % TBCI_SIMD_ALIGN))
00117 #endif
00118 do_vec_mult<T>(mc, mat->mat[rw], vec->vec, res->vec[rw]);
00119 #if 0
00120 else
00121 do_vec_mult_unaligned<T>(mc, mat->mat[rw], vec->vec, res->vec[rw]);
00122 #endif
00123 }
00124 } else
00125 #endif
00126 for (unsigned rw = start; rw < end; ++rw) {
00127 #if 1
00128
00129 res->vec[rw] = T(0.0);
00130 do_vec_mult<T> (mc, mat->mat[rw], vec->vec, res->vec[rw]);
00131 #else
00132 register const T* matptr = mat->mat[rw];
00133 PREFETCH_R(matptr, 1);
00134 register const T* vecptr = vec->vec;
00135
00136 register int i = mc - 9;
00137 ALIGN2(T el (*matptr++ * *vecptr++), MIN_ALIGN2);
00138 if (i >= 0) {
00139 if (sizeof(T)*2 >= CACHELINE_SZ)
00140 PREFETCH_R(matptr+2, 1);
00141 PREFETCH_R(matptr+4, 1);
00142 if (sizeof(T)*2 >= CACHELINE_SZ)
00143 PREFETCH_R(matptr+6, 1);
00144
00145 for (; i > 3; i -= 4) {
00146 el += *matptr * *vecptr;
00147 PREFETCH_R(matptr+ 8,1);
00148 el += *(matptr+1) * *(vecptr+1);
00149
00150 el += *(matptr+2) * *(vecptr+2);
00151 if (sizeof(T)*2 >= CACHELINE_SZ)
00152 PREFETCH_R(vecptr+10,1);
00153 el += *(matptr+3) * *(vecptr+3);
00154
00155 matptr += 4; vecptr += 4;
00156 }
00157 }
00158 if (LIKELY(rw < end-CACHELINE_SZ/sizeof(T))) {
00159 PREFETCH_W(res->vec+rw, 2);
00160 }
00161 for (i += 8; i; --i)
00162 el += *matptr++ * *vecptr++;
00163
00164 res->vec[rw] = el;
00165 #endif
00166 }
00167 }
00168
00169 template <typename T>
00170 void do_mat_tsv_mult (const unsigned start, const unsigned end,
00171 TVector<T> *res,
00172 const Matrix<T> *mat, const TSVector<T> *tsv)
00173 {
00174 PREFETCH_R(tsv->vec,3);
00175 const unsigned mc = mat->col;
00176 const T fact = tsv->getfac();
00177 #ifdef TBCI_SIMD_SUM
00178 if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
00179 for (unsigned rw = start; rw < end; ++rw) {
00180 res->vec[rw] = T(0.0);
00181 #if 0
00182 if (!((unsigned long)mat->mat[rw] % TBCI_SIMD_ALIGN))
00183 #endif
00184 do_vec_mult<T>(mc, mat->mat[rw], tsv->vec, res->vec[rw]);
00185 #if 0
00186 else
00187 do_vec_mult_unaligned<T>(mc, mat->mat[rw], tsv->vec, res->vec[rw]);
00188 #endif
00189 res->vec[rw] *= fact;
00190 }
00191 } else
00192 #endif
00193 for (unsigned rw = start; rw < end; ++rw) {
00194
00195 res->vec[rw] = T(0.0);
00196
00197 do_vec_mult<T> (mc, mat->mat[rw], tsv->vec, res->vec[rw]);
00198 res->vec[rw] *= fact;
00199 }
00200 }
00201
00202 template <typename T>
00203 void do_mat_vec_transmult (const unsigned start, const unsigned end,
00204 TVector<T> *res,
00205 const Matrix<T> *mat, const Vector<T> *vec)
00206 {
00207 PREFETCH_R(vec->vec,3);
00208 const unsigned mr = mat->row;
00209 for (unsigned cl = start; cl < end; cl++)
00210 {
00211 PREFETCH_R(mat->mat[0]+cl,1);
00212 register T* vecptr = vec->vec;
00213 register int i = mr - 9;
00214 int r = 1;
00215 ALIGN3(T el, mat->mat[0][cl] * *vecptr++, MIN_ALIGN2);
00216 if (LIKELY(i >= 0)) {
00217 PREFETCH_R(mat->mat[1]+cl,1);
00218 PREFETCH_R(mat->mat[2]+cl,1);
00219 PREFETCH_R(mat->mat[3]+cl,1);
00220 PREFETCH_R(mat->mat[4]+cl,1);
00221 PREFETCH_R(mat->mat[5]+cl,1);
00222 PREFETCH_R(mat->mat[6]+cl,1);
00223 PREFETCH_R(mat->mat[7]+cl,1);
00224 for (; i > 3; i -= 4) {
00225 el += mat->mat[r][cl] * *vecptr;
00226 PREFETCH_R(mat->mat[r+ 8]+cl, 1);
00227 el += mat->mat[r+1][cl] * *(vecptr+1);
00228 PREFETCH_R(mat->mat[r+ 9]+cl, 1);
00229 el += mat->mat[r+2][cl] * *(vecptr+2);
00230 PREFETCH_R(mat->mat[r+10]+cl, 1);
00231 el += mat->mat[r+3][cl] * *(vecptr+3);
00232 PREFETCH_R(mat->mat[r+11]+cl, 1);
00233 r += 4; vecptr += 4;
00234 }
00235 }
00236 if (LIKELY(cl < end-CACHELINE_SZ/sizeof(T))) {
00237 PREFETCH_W(res->vec + cl, 2);
00238 }
00239 for (i += 8; i; --i, r++)
00240 el += mat->mat[r][cl] * *vecptr++;
00241
00242 res->vec[cl] = el;
00243 }
00244 }
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273 #endif