Stxxl
1.4.0
|
00001 /*************************************************************************** 00002 * include/stxxl/bits/containers/matrix_arithmetic.h 00003 * 00004 * Part of the STXXL. See http://stxxl.sourceforge.net 00005 * 00006 * Copyright (C) 2010-2011 Raoul Steffen <R-Steffen@gmx.de> 00007 * 00008 * Distributed under the Boost Software License, Version 1.0. 00009 * (See accompanying file LICENSE_1_0.txt or copy at 00010 * http://www.boost.org/LICENSE_1_0.txt) 00011 **************************************************************************/ 00012 00013 #ifndef STXXL_MATRIX_ARITHMETIC_HEADER 00014 #define STXXL_MATRIX_ARITHMETIC_HEADER 00015 00016 #include <stxxl/bits/mng/mng.h> 00017 #include <stxxl/bits/containers/matrix_low_level.h> 00018 00019 __STXXL_BEGIN_NAMESPACE 00020 00021 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS 00022 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS 3 00023 #endif 00024 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE 00025 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE 2 00026 #endif 00027 00028 template <typename ValueType> 00029 class column_vector; 00030 00031 template <typename ValueType> 00032 class row_vector; 00033 00034 template <typename ValueType, unsigned BlockSideLength> 00035 class swappable_block_matrix; 00036 00037 struct matrix_operation_statistic_dataset 00038 { 00039 int_type block_multiplication_calls, 00040 block_multiplications_saved_through_zero, 00041 block_addition_calls, 00042 block_additions_saved_through_zero; 00043 00044 matrix_operation_statistic_dataset() 00045 : block_multiplication_calls(0), 00046 block_multiplications_saved_through_zero(0), 00047 block_addition_calls(0), 00048 block_additions_saved_through_zero(0) {} 00049 00050 matrix_operation_statistic_dataset operator + (const matrix_operation_statistic_dataset & stat) 00051 { 00052 matrix_operation_statistic_dataset res(*this); 00053 res.block_multiplication_calls += stat.block_multiplication_calls; 00054 res.block_multiplications_saved_through_zero += stat.block_multiplications_saved_through_zero; 00055 res.block_addition_calls += stat.block_addition_calls; 00056 res.block_additions_saved_through_zero += stat.block_additions_saved_through_zero; 00057 return res; 00058 } 00059 00060 matrix_operation_statistic_dataset operator - (const matrix_operation_statistic_dataset & stat) 00061 { 00062 matrix_operation_statistic_dataset res(*this); 00063 res.block_multiplication_calls -= stat.block_multiplication_calls; 00064 res.block_multiplications_saved_through_zero -= stat.block_multiplications_saved_through_zero; 00065 res.block_addition_calls -= stat.block_addition_calls; 00066 res.block_additions_saved_through_zero -= stat.block_additions_saved_through_zero; 00067 return res; 00068 } 00069 }; 00070 00071 struct matrix_operation_statistic 00072 : public singleton<matrix_operation_statistic>, public matrix_operation_statistic_dataset 00073 {}; 00074 00075 struct matrix_operation_statistic_data : public matrix_operation_statistic_dataset 00076 { 00077 matrix_operation_statistic_data(const matrix_operation_statistic & stat = * matrix_operation_statistic::get_instance()) 00078 : matrix_operation_statistic_dataset(stat) {} 00079 00080 matrix_operation_statistic_data(const matrix_operation_statistic_dataset & stat) 00081 : matrix_operation_statistic_dataset(stat) {} 00082 00083 matrix_operation_statistic_data & operator = (const matrix_operation_statistic & stat) 00084 { 00085 return *this = matrix_operation_statistic_data(stat); 00086 } 00087 00088 void set() 00089 { operator = (* matrix_operation_statistic::get_instance()); } 00090 00091 matrix_operation_statistic_data operator + (const matrix_operation_statistic_data & stat) 00092 { return matrix_operation_statistic_data(matrix_operation_statistic_dataset(*this) + matrix_operation_statistic_dataset(stat)); } 00093 00094 matrix_operation_statistic_data operator - (const matrix_operation_statistic_data & stat) 00095 { return matrix_operation_statistic_data(matrix_operation_statistic_dataset(*this) - matrix_operation_statistic_dataset(stat)); } 00096 }; 00097 00098 std::ostream & operator << (std::ostream & o, const matrix_operation_statistic_data & statsd) 00099 { 00100 o << "matrix operation statistics" << std::endl; 00101 o << "block multiplication calls : " 00102 << statsd.block_multiplication_calls << std::endl; 00103 o << "block multiplications saved through zero blocks: " 00104 << statsd.block_multiplications_saved_through_zero << std::endl; 00105 o << "block multiplications performed : " 00106 << statsd.block_multiplication_calls - statsd.block_multiplications_saved_through_zero << std::endl; 00107 o << "block addition calls : " 00108 << statsd.block_addition_calls << std::endl; 00109 o << "block additions saved through zero blocks : " 00110 << statsd.block_additions_saved_through_zero << std::endl; 00111 o << "block additions performed : " 00112 << statsd.block_addition_calls - statsd.block_additions_saved_through_zero << std::endl; 00113 return o; 00114 } 00115 00116 //! \brief A static_quadtree holds 4^Level elements arranged in a quad tree. 00117 //! 00118 //! Static quad trees are useful for recursive algorithms with fixed depth 00119 //! that partition the in- and output and perform pre- and postcalculations on the partitions. 00120 //! The four children of one node are denoted as ul (up left), ur (up right), dl (down left), and dr (down right). 00121 template <typename ValueType, unsigned Level> 00122 struct static_quadtree 00123 { 00124 typedef static_quadtree<ValueType, Level - 1> smaller_static_quadtree; 00125 00126 smaller_static_quadtree ul, ur, dl, dr; 00127 00128 static_quadtree(smaller_static_quadtree ul, smaller_static_quadtree ur, 00129 smaller_static_quadtree dl, smaller_static_quadtree dr) 00130 : ul(ul), ur(ur), dl(dl), dr(dr) {} 00131 00132 static_quadtree() {} 00133 00134 static_quadtree & operator &= (const static_quadtree & right) 00135 { 00136 ul &= right.ul; ur &= right.ur; dl &= right.dl; dr &= right.dr; 00137 return *this; 00138 } 00139 00140 static_quadtree & operator += (const static_quadtree & right) 00141 { 00142 ul += right.ul; ur += right.ur; dl += right.dl; dr += right.dr; 00143 return *this; 00144 } 00145 00146 static_quadtree & operator -= (const static_quadtree & right) 00147 { 00148 ul -= right.ul; ur -= right.ur; dl -= right.dl; dr -= right.dr; 00149 return *this; 00150 } 00151 00152 static_quadtree operator & (const static_quadtree & right) const 00153 { return static_quadtree(ul & right.ul, ur & right.ur, dl & right.dl, dr & right.dr); } 00154 00155 static_quadtree operator + (const static_quadtree & right) const 00156 { return static_quadtree(ul + right.ul, ur + right.ur, dl + right.dl, dr + right.dr); } 00157 00158 static_quadtree operator - (const static_quadtree & right) const 00159 { return static_quadtree(ul - right.ul, ur - right.ur, dl - right.dl, dr - right.dr); } 00160 }; 00161 00162 template <typename ValueType> 00163 struct static_quadtree<ValueType, 0> 00164 { 00165 ValueType val; 00166 00167 static_quadtree(const ValueType & v) 00168 : val(v) {} 00169 00170 static_quadtree() {} 00171 00172 operator const ValueType & () const 00173 { return val; } 00174 00175 operator ValueType & () 00176 { return val; } 00177 00178 static_quadtree & operator &= (const static_quadtree & right) 00179 { 00180 val &= right.val; 00181 return *this; 00182 } 00183 00184 static_quadtree & operator += (const static_quadtree & right) 00185 { 00186 val += right.val; 00187 return *this; 00188 } 00189 00190 static_quadtree & operator -= (const static_quadtree & right) 00191 { 00192 val -= right.val; 00193 return *this; 00194 } 00195 00196 static_quadtree operator ! () const 00197 { return static_quadtree(! val); } 00198 00199 static_quadtree operator & (const static_quadtree & right) const 00200 { return val & right.val; } 00201 00202 static_quadtree operator + (const static_quadtree & right) const 00203 { return val + right.val; } 00204 00205 static_quadtree operator - (const static_quadtree & right) const 00206 { return val - right.val; } 00207 }; 00208 00209 template <typename ValueType, unsigned BlockSideLength, unsigned Level, bool AExists, bool BExists> 00210 struct feedable_strassen_winograd 00211 { 00212 typedef static_quadtree<bool, Level> zbt; // true <=> is a zero-block 00213 typedef static_quadtree<ValueType, Level> vt; 00214 00215 typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, AExists, BExists> smaller_feedable_strassen_winograd_ab; 00216 typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, AExists, false> smaller_feedable_strassen_winograd_a; 00217 typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, false, BExists> smaller_feedable_strassen_winograd_b; 00218 typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, false, false> smaller_feedable_strassen_winograd_n; 00219 00220 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00221 typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type; 00222 typedef typename block_scheduler_type::internal_block_type internal_block_type; 00223 typedef typename swappable_block_matrix_type::size_type size_type; 00224 00225 const size_type n, m, l; 00226 smaller_feedable_strassen_winograd_ab p1, p2; 00227 smaller_feedable_strassen_winograd_n p3, p4, p5; 00228 smaller_feedable_strassen_winograd_b p6; 00229 smaller_feedable_strassen_winograd_a p7; 00230 00231 feedable_strassen_winograd( 00232 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00233 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00234 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00235 : n(n), m(m), l(l), 00236 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col), 00237 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col), 00238 p3( bs_c, n/2, m/2, l/2), 00239 p4( bs_c, n/2, m/2, l/2), 00240 p5( bs_c, n/2, m/2, l/2), 00241 p6( bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2), 00242 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {} 00243 00244 feedable_strassen_winograd( 00245 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00246 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00247 : n(n), m(m), l(l), 00248 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2), 00249 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2), 00250 p3( bs_c, n/2, m/2, l/2), 00251 p4( bs_c, n/2, m/2, l/2), 00252 p5( bs_c, n/2, m/2, l/2), 00253 p6( bs_c, n/2, m/2, l/2), 00254 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {} 00255 00256 feedable_strassen_winograd( 00257 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00258 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00259 : n(n), m(m), l(l), 00260 p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col), 00261 p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col), 00262 p3(bs_c, n/2, m/2, l/2), 00263 p4(bs_c, n/2, m/2, l/2), 00264 p5(bs_c, n/2, m/2, l/2), 00265 p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2), 00266 p7(bs_c, n/2, m/2, l/2) {} 00267 00268 feedable_strassen_winograd( 00269 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00270 : n(n), m(m), l(l), 00271 p1(bs_c, n/2, m/2, l/2), 00272 p2(bs_c, n/2, m/2, l/2), 00273 p3(bs_c, n/2, m/2, l/2), 00274 p4(bs_c, n/2, m/2, l/2), 00275 p5(bs_c, n/2, m/2, l/2), 00276 p6(bs_c, n/2, m/2, l/2), 00277 p7(bs_c, n/2, m/2, l/2) {} 00278 00279 void begin_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00280 { 00281 typename zbt::smaller_static_quadtree s1 = zb.dl & zb.dr, 00282 s2 = s1 & zb.ul, 00283 s3 = zb.ul & zb.dl, 00284 s4 = zb.ur & s2; 00285 p1.begin_feeding_a_block(block_row, block_col, zb.ul); 00286 p2.begin_feeding_a_block(block_row, block_col, zb.ur); 00287 p3.begin_feeding_a_block(block_row, block_col, s1); 00288 p4.begin_feeding_a_block(block_row, block_col, s2); 00289 p5.begin_feeding_a_block(block_row, block_col, s3); 00290 p6.begin_feeding_a_block(block_row, block_col, s4); 00291 p7.begin_feeding_a_block(block_row, block_col, zb.dr); 00292 } 00293 00294 void feed_a_element(const int_type element_num, const vt v) 00295 { 00296 typename vt::smaller_static_quadtree s1 = v.dl + v.dr, 00297 s2 = s1 - v.ul, 00298 s3 = v.ul - v.dl, 00299 s4 = v.ur - s2; 00300 p1.feed_a_element(element_num, v.ul); 00301 p2.feed_a_element(element_num, v.ur); 00302 p3.feed_a_element(element_num, s1); 00303 p4.feed_a_element(element_num, s2); 00304 p5.feed_a_element(element_num, s3); 00305 p6.feed_a_element(element_num, s4); 00306 p7.feed_a_element(element_num, v.dr); 00307 } 00308 00309 void end_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00310 { 00311 typename zbt::smaller_static_quadtree s1 = zb.dl & zb.dr, 00312 s2 = s1 & zb.ul, 00313 s3 = zb.ul & zb.dl, 00314 s4 = zb.ur & s2; 00315 p1.end_feeding_a_block(block_row, block_col, zb.ul); 00316 p2.end_feeding_a_block(block_row, block_col, zb.ur); 00317 p3.end_feeding_a_block(block_row, block_col, s1); 00318 p4.end_feeding_a_block(block_row, block_col, s2); 00319 p5.end_feeding_a_block(block_row, block_col, s3); 00320 p6.end_feeding_a_block(block_row, block_col, s4); 00321 p7.end_feeding_a_block(block_row, block_col, zb.dr); 00322 } 00323 00324 void begin_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00325 { 00326 typename zbt::smaller_static_quadtree t1 = zb.ur & zb.ul, 00327 t2 = zb.dr & t1, 00328 t3 = zb.dr & zb.ur, 00329 t4 = zb.dl & t2; 00330 p1.begin_feeding_b_block(block_row, block_col, zb.ul); 00331 p2.begin_feeding_b_block(block_row, block_col, zb.dl); 00332 p3.begin_feeding_b_block(block_row, block_col, t1); 00333 p4.begin_feeding_b_block(block_row, block_col, t2); 00334 p5.begin_feeding_b_block(block_row, block_col, t3); 00335 p6.begin_feeding_b_block(block_row, block_col, zb.dr); 00336 p7.begin_feeding_b_block(block_row, block_col, t4); 00337 } 00338 00339 void feed_b_element(const int_type element_num, const vt v) 00340 { 00341 typename vt::smaller_static_quadtree t1 = v.ur - v.ul, 00342 t2 = v.dr - t1, 00343 t3 = v.dr - v.ur, 00344 t4 = v.dl - t2; 00345 p1.feed_b_element(element_num, v.ul); 00346 p2.feed_b_element(element_num, v.dl); 00347 p3.feed_b_element(element_num, t1); 00348 p4.feed_b_element(element_num, t2); 00349 p5.feed_b_element(element_num, t3); 00350 p6.feed_b_element(element_num, v.dr); 00351 p7.feed_b_element(element_num, t4); 00352 } 00353 00354 void end_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00355 { 00356 typename zbt::smaller_static_quadtree t1 = zb.ur & zb.ul, 00357 t2 = zb.dr & t1, 00358 t3 = zb.dr & zb.ur, 00359 t4 = zb.dl & t2; 00360 p1.end_feeding_b_block(block_row, block_col, zb.ul); 00361 p2.end_feeding_b_block(block_row, block_col, zb.dl); 00362 p3.end_feeding_b_block(block_row, block_col, t1); 00363 p4.end_feeding_b_block(block_row, block_col, t2); 00364 p5.end_feeding_b_block(block_row, block_col, t3); 00365 p6.end_feeding_b_block(block_row, block_col, zb.dr); 00366 p7.end_feeding_b_block(block_row, block_col, t4); 00367 } 00368 00369 void multiply() 00370 { 00371 p1.multiply(); 00372 p2.multiply(); 00373 p3.multiply(); 00374 p4.multiply(); 00375 p5.multiply(); 00376 p6.multiply(); 00377 p7.multiply(); 00378 } 00379 00380 zbt begin_reading_block(const size_type & block_row, const size_type & block_col) 00381 { 00382 zbt r; 00383 r.ur = r.ul = p1.begin_reading_block(block_row, block_col); 00384 r.ul &= p2.begin_reading_block(block_row, block_col); 00385 r.ur &= p4.begin_reading_block(block_row, block_col); 00386 r.dr = r.dl = p5.begin_reading_block(block_row, block_col); 00387 r.dl &= r.ur; 00388 r.dl &= p7.begin_reading_block(block_row, block_col); 00389 r.ur &= p3.begin_reading_block(block_row, block_col); 00390 r.dr &= r.ur; 00391 r.ur &= p6.begin_reading_block(block_row, block_col); 00392 return r; 00393 } 00394 00395 vt read_element(int_type element_num) 00396 { 00397 vt r; 00398 r.ur = r.ul = p1.read_element(element_num); 00399 r.ul += p2.read_element(element_num); 00400 r.ur += p4.read_element(element_num); 00401 r.dr = r.dl = p5.read_element(element_num); 00402 r.dl += r.ur; 00403 r.dl += p7.read_element(element_num); 00404 r.ur += p3.read_element(element_num); 00405 r.dr += r.ur; 00406 r.ur += p6.read_element(element_num); 00407 return r; 00408 } 00409 00410 zbt end_reading_block(const size_type & block_row, const size_type & block_col) 00411 { 00412 zbt r; 00413 r.ur = r.ul = p1.end_reading_block(block_row, block_col); 00414 r.ul &= p2.end_reading_block(block_row, block_col); 00415 r.ur &= p4.end_reading_block(block_row, block_col); 00416 r.dr = r.dl = p5.end_reading_block(block_row, block_col); 00417 r.dl &= r.ur; 00418 r.dl &= p7.end_reading_block(block_row, block_col); 00419 r.ur &= p3.end_reading_block(block_row, block_col); 00420 r.dr &= r.ur; 00421 r.ur &= p6.end_reading_block(block_row, block_col); 00422 return r; 00423 } 00424 }; 00425 00426 template <typename ValueType, unsigned BlockSideLength, bool AExists, bool BExists> 00427 struct feedable_strassen_winograd<ValueType, BlockSideLength, 0, AExists, BExists> 00428 { 00429 typedef static_quadtree<bool, 0> zbt; // true <=> is a zero-block 00430 typedef static_quadtree<ValueType, 0> vt; 00431 00432 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00433 typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type; 00434 typedef typename block_scheduler_type::internal_block_type internal_block_type; 00435 typedef typename swappable_block_matrix_type::size_type size_type; 00436 00437 swappable_block_matrix_type a, b, c; 00438 const size_type n, m, l; 00439 internal_block_type * iblock; 00440 00441 feedable_strassen_winograd( 00442 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00443 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00444 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00445 : a(existing_a, n, l, a_from_row, a_from_col), 00446 b(existing_b, n, l, b_from_row, b_from_col), 00447 c(bs_c, n, m), 00448 n(n), m(m), l(l), 00449 iblock(0) {} 00450 00451 feedable_strassen_winograd( 00452 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00453 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00454 : a(existing_a, n, l, a_from_row, a_from_col), 00455 b(bs_c, n, l), 00456 c(bs_c, n, m), 00457 n(n), m(m), l(l), 00458 iblock(0) {} 00459 00460 feedable_strassen_winograd( 00461 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00462 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00463 : a(bs_c, n, l), 00464 b(existing_b, n, l, b_from_row, b_from_col), 00465 c(bs_c, n, m), 00466 n(n), m(m), l(l), 00467 iblock(0) {} 00468 00469 feedable_strassen_winograd( 00470 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00471 : a(bs_c, n, l), 00472 b(bs_c, n, l), 00473 c(bs_c, n, m), 00474 n(n), m(m), l(l), 00475 iblock(0) {} 00476 00477 void begin_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt) 00478 { 00479 if (! AExists) 00480 iblock = & a.bs.acquire(a(block_row, block_col), true); 00481 } 00482 00483 void feed_a_element(const int_type element_num, const vt v) 00484 { 00485 if (! AExists) 00486 (*iblock)[element_num] = v; 00487 } 00488 00489 void end_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00490 { 00491 if (! AExists) 00492 { 00493 a.bs.release(a(block_row, block_col), ! zb); 00494 iblock = 0; 00495 } 00496 } 00497 00498 void begin_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt) 00499 { 00500 if (! BExists) 00501 iblock = & b.bs.acquire(b(block_row, block_col), true); 00502 } 00503 00504 void feed_b_element(const int_type element_num, const vt v) 00505 { 00506 if (! BExists) 00507 (*iblock)[element_num] = v; 00508 } 00509 00510 void end_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00511 { 00512 if (! BExists) 00513 { 00514 b.bs.release(b(block_row, block_col), ! zb); 00515 iblock = 0; 00516 } 00517 } 00518 00519 void multiply() 00520 { matrix_operations<ValueType, BlockSideLength>::choose_level_for_feedable_sw(a, b, c); } 00521 00522 zbt begin_reading_block(const size_type & block_row, const size_type & block_col) 00523 { 00524 bool zb = ! c.bs.is_initialized(c(block_row, block_col)); 00525 iblock = & c.bs.acquire(c(block_row, block_col)); 00526 return zb; 00527 } 00528 00529 vt read_element(const int_type element_num) 00530 { return (*iblock)[element_num]; } 00531 00532 zbt end_reading_block(const size_type & block_row, const size_type & block_col) 00533 { 00534 c.bs.release(c(block_row, block_col), false); 00535 iblock = 0; 00536 return ! c.bs.is_initialized(c(block_row, block_col)); 00537 } 00538 }; 00539 00540 template <typename ValueType, unsigned BlockSideLength, unsigned Level> 00541 struct matrix_to_quadtree 00542 { 00543 typedef static_quadtree<bool, Level> zbt; // true <=> is a zero-block 00544 typedef static_quadtree<ValueType, Level> vt; 00545 00546 typedef matrix_to_quadtree<ValueType, BlockSideLength, Level - 1> smaller_matrix_to_quadtree; 00547 00548 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00549 typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type; 00550 typedef typename block_scheduler_type::internal_block_type internal_block_type; 00551 typedef typename swappable_block_matrix_type::size_type size_type; 00552 00553 smaller_matrix_to_quadtree ul, ur, dl, dr; 00554 00555 matrix_to_quadtree(const swappable_block_matrix_type & matrix) 00556 : ul(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, 0), 00557 ur(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, matrix.get_width()/2), 00558 dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, 0), 00559 dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2) 00560 { assert(! (matrix.get_height() % 2 | matrix.get_width() % 2)); } 00561 00562 matrix_to_quadtree(const swappable_block_matrix_type & matrix, 00563 const size_type height, const size_type width, const size_type from_row, const size_type from_col) 00564 : ul(matrix, height/2, width/2, from_row, from_col), 00565 ur(matrix, height/2, width/2, from_row, from_col + width/2), 00566 dl(matrix, height/2, width/2, from_row + height/2, from_col), 00567 dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2) 00568 { assert(! (height % 2 | width % 2)); } 00569 00570 void begin_feeding_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00571 { 00572 ul.begin_feeding_block(block_row, block_col, zb.ul); 00573 ur.begin_feeding_block(block_row, block_col, zb.ur); 00574 dl.begin_feeding_block(block_row, block_col, zb.dl); 00575 dr.begin_feeding_block(block_row, block_col, zb.dr); 00576 } 00577 00578 void feed_element(const int_type element_num, const vt v) 00579 { 00580 ul.feed_element(element_num, v.ul); 00581 ur.feed_element(element_num, v.ur); 00582 dl.feed_element(element_num, v.dl); 00583 dr.feed_element(element_num, v.dr); 00584 } 00585 00586 void feed_and_add_element(const int_type element_num, const vt v) 00587 { 00588 ul.feed_and_add_element(element_num, v.ul); 00589 ur.feed_and_add_element(element_num, v.ur); 00590 dl.feed_and_add_element(element_num, v.dl); 00591 dr.feed_and_add_element(element_num, v.dr); 00592 } 00593 00594 void end_feeding_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00595 { 00596 ul.end_feeding_block(block_row, block_col, zb.ul); 00597 ur.end_feeding_block(block_row, block_col, zb.ur); 00598 dl.end_feeding_block(block_row, block_col, zb.dl); 00599 dr.end_feeding_block(block_row, block_col, zb.dr); 00600 } 00601 00602 zbt begin_reading_block(const size_type & block_row, const size_type & block_col) 00603 { 00604 zbt zb; 00605 zb.ul = ul.begin_reading_block(block_row, block_col); 00606 zb.ur = ur.begin_reading_block(block_row, block_col); 00607 zb.dl = dl.begin_reading_block(block_row, block_col); 00608 zb.dr = dr.begin_reading_block(block_row, block_col); 00609 return zb; 00610 } 00611 00612 vt read_element(const int_type element_num) 00613 { 00614 vt v; 00615 v.ul = ul.read_element(element_num); 00616 v.ur = ur.read_element(element_num); 00617 v.dl = dl.read_element(element_num); 00618 v.dr = dr.read_element(element_num); 00619 return v; 00620 } 00621 00622 zbt end_reading_block(const size_type & block_row, const size_type & block_col) 00623 { 00624 zbt zb; 00625 zb.ul = ul.end_reading_block(block_row, block_col); 00626 zb.ur = ur.end_reading_block(block_row, block_col); 00627 zb.dl = dl.end_reading_block(block_row, block_col); 00628 zb.dr = dr.end_reading_block(block_row, block_col); 00629 return zb; 00630 } 00631 00632 const size_type & get_height_in_blocks() 00633 { return ul.get_height_in_blocks(); } 00634 00635 const size_type & get_width_in_blocks() 00636 { return ul.get_width_in_blocks(); } 00637 }; 00638 00639 template <typename ValueType, unsigned BlockSideLength> 00640 struct matrix_to_quadtree<ValueType, BlockSideLength, 0> 00641 { 00642 typedef static_quadtree<bool, 0> zbt; // true <=> is a zero-block 00643 typedef static_quadtree<ValueType, 0> vt; 00644 00645 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00646 typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type; 00647 typedef typename block_scheduler_type::internal_block_type internal_block_type; 00648 typedef typename swappable_block_matrix_type::size_type size_type; 00649 00650 swappable_block_matrix_type m; 00651 internal_block_type * iblock; 00652 00653 matrix_to_quadtree(const swappable_block_matrix_type & matrix) 00654 : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0), 00655 iblock(0) {} 00656 00657 matrix_to_quadtree(const swappable_block_matrix_type & matrix, 00658 const size_type height, const size_type width, const size_type from_row, const size_type from_col) 00659 : m(matrix, height, width, from_row, from_col), 00660 iblock(0) {} 00661 00662 void begin_feeding_block(const size_type & block_row, const size_type & block_col, const zbt) 00663 { iblock = & m.bs.acquire(m(block_row, block_col)); } 00664 00665 void feed_element(const int_type element_num, const vt v) 00666 { (*iblock)[element_num] = v; } 00667 00668 void feed_and_add_element(const int_type element_num, const vt v) 00669 { (*iblock)[element_num] += v; } 00670 00671 void end_feeding_block(const size_type & block_row, const size_type & block_col, const zbt zb) 00672 { 00673 m.bs.release(m(block_row, block_col), ! zb); 00674 iblock = 0; 00675 } 00676 00677 zbt begin_reading_block(const size_type & block_row, const size_type & block_col) 00678 { 00679 zbt zb = ! m.bs.is_initialized(m(block_row, block_col)); 00680 iblock = & m.bs.acquire(m(block_row, block_col)); 00681 return zb; 00682 } 00683 00684 vt read_element(const int_type element_num) 00685 { return (*iblock)[element_num]; } 00686 00687 zbt end_reading_block(const size_type & block_row, const size_type & block_col) 00688 { 00689 m.bs.release(m(block_row, block_col), false); 00690 iblock = 0; 00691 return ! m.bs.is_initialized(m(block_row, block_col)); 00692 } 00693 00694 const size_type & get_height_in_blocks() 00695 { return m.get_height(); } 00696 00697 const size_type & get_width_in_blocks() 00698 { return m.get_width(); } 00699 }; 00700 00701 template <typename ValueType, unsigned BlockSideLength, unsigned Level, bool AExists, bool BExists> 00702 struct feedable_strassen_winograd_block_grained 00703 { 00704 typedef static_quadtree<bool, Level> zbt; // true <=> is a zero-block 00705 typedef static_quadtree<ValueType, Level> vt; 00706 00707 typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, AExists, BExists> smaller_feedable_strassen_winograd_ab; 00708 typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, AExists, false> smaller_feedable_strassen_winograd_a; 00709 typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, false, BExists> smaller_feedable_strassen_winograd_b; 00710 typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, false, false> smaller_feedable_strassen_winograd_n; 00711 00712 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00713 typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type; 00714 typedef typename block_scheduler_type::internal_block_type internal_block_type; 00715 typedef typename swappable_block_matrix_type::size_type size_type; 00716 typedef matrix_operations<ValueType, BlockSideLength> Ops; 00717 00718 const size_type n, m, l; 00719 smaller_feedable_strassen_winograd_ab p1, p2; 00720 smaller_feedable_strassen_winograd_n p3, p4, p5; 00721 smaller_feedable_strassen_winograd_b p6; 00722 smaller_feedable_strassen_winograd_a p7; 00723 00724 inline feedable_strassen_winograd_block_grained( 00725 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00726 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00727 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00728 : n(n), m(m), l(l), 00729 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col), 00730 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col), 00731 p3( bs_c, n/2, m/2, l/2), 00732 p4( bs_c, n/2, m/2, l/2), 00733 p5( bs_c, n/2, m/2, l/2), 00734 p6( bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2), 00735 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {} 00736 00737 inline feedable_strassen_winograd_block_grained( 00738 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00739 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00740 : n(n), m(m), l(l), 00741 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2), 00742 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2), 00743 p3( bs_c, n/2, m/2, l/2), 00744 p4( bs_c, n/2, m/2, l/2), 00745 p5( bs_c, n/2, m/2, l/2), 00746 p6( bs_c, n/2, m/2, l/2), 00747 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {} 00748 00749 inline feedable_strassen_winograd_block_grained( 00750 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00751 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00752 : n(n), m(m), l(l), 00753 p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col), 00754 p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col), 00755 p3(bs_c, n/2, m/2, l/2), 00756 p4(bs_c, n/2, m/2, l/2), 00757 p5(bs_c, n/2, m/2, l/2), 00758 p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2), 00759 p7(bs_c, n/2, m/2, l/2) {} 00760 00761 inline feedable_strassen_winograd_block_grained( 00762 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00763 : n(n), m(m), l(l), 00764 p1(bs_c, n/2, m/2, l/2), 00765 p2(bs_c, n/2, m/2, l/2), 00766 p3(bs_c, n/2, m/2, l/2), 00767 p4(bs_c, n/2, m/2, l/2), 00768 p5(bs_c, n/2, m/2, l/2), 00769 p6(bs_c, n/2, m/2, l/2), 00770 p7(bs_c, n/2, m/2, l/2) {} 00771 00772 inline void feed_a(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl) 00773 { 00774 // partition bl 00775 typename Ops::swappable_block_matrix_quarterer qbl(bl); 00776 // preadditions 00777 swappable_block_matrix_type s1(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()), 00778 s2(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()), 00779 s3(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()), 00780 s4(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()); 00781 Ops::strassen_winograd_preaddition_a(qbl.ul, qbl.ur, qbl.dl, qbl.dr, s1, s2, s3, s4); 00782 // feed recursive 00783 p1.feed_a(row, col, qbl.ul); 00784 p2.feed_a(row, col, qbl.ur); 00785 p3.feed_a(row, col, s1); 00786 p4.feed_a(row, col, s2); 00787 p5.feed_a(row, col, s3); 00788 p6.feed_a(row, col, s4); 00789 p7.feed_a(row, col, qbl.dr); 00790 } 00791 00792 inline void feed_b(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl) 00793 { 00794 // partition bl 00795 typename Ops::swappable_block_matrix_quarterer qbl(bl); 00796 // preadditions 00797 swappable_block_matrix_type t1(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()), 00798 t2(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()), 00799 t3(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()), 00800 t4(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()); 00801 Ops::strassen_winograd_preaddition_b(qbl.ul, qbl.ur, qbl.dl, qbl.dr, t1, t2, t3, t4); 00802 // feed recursive 00803 p1.feed_b(row, col, qbl.ul); 00804 p2.feed_b(row, col, qbl.dl); 00805 p3.feed_b(row, col, t1); 00806 p4.feed_b(row, col, t2); 00807 p5.feed_b(row, col, t3); 00808 p6.feed_b(row, col, qbl.dr); 00809 p7.feed_b(row, col, t4); 00810 } 00811 00812 inline void multiply() 00813 { 00814 p1.multiply(); 00815 p2.multiply(); 00816 p3.multiply(); 00817 p4.multiply(); 00818 p5.multiply(); 00819 p6.multiply(); 00820 p7.multiply(); 00821 } 00822 00823 inline void read_and_add(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl) 00824 { 00825 // partition bl 00826 typename Ops::swappable_block_matrix_quarterer qbl(bl); 00827 // postadditions 00828 swappable_block_matrix_type px(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()); 00829 p2.read_and_add(row, col, qbl.ul); 00830 p1.read_and_add(row, col, px); 00831 Ops::element_op(qbl.ul, px, typename Ops::addition()); 00832 p4.read_and_add(row, col, px); 00833 Ops::element_op(qbl.ur, px, typename Ops::addition()); 00834 p5.read_and_add(row, col, px); 00835 Ops::element_op_twice_nontransposed(qbl.dl, qbl.dr, px, typename Ops::addition()); 00836 px.set_zero(); 00837 p7.read_and_add(row, col, qbl.dl); 00838 p3.read_and_add(row, col, px); 00839 Ops::element_op_twice_nontransposed(qbl.dr, qbl.ur, px, typename Ops::addition()); 00840 p6.read_and_add(row, col, qbl.ur); 00841 } 00842 00843 inline static unsigned_type get_num_temp_grains() 00844 { return smaller_feedable_strassen_winograd_ab::get_num_temp_grains() + (4 ^ Level) * 2; } 00845 }; 00846 00847 template <typename ValueType, unsigned BlockSideLength, bool AExists, bool BExists> 00848 struct feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, 0, AExists, BExists> 00849 { 00850 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00851 typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type; 00852 typedef typename swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type; 00853 typedef typename swappable_block_matrix_type::size_type size_type; 00854 typedef matrix_operations<ValueType, BlockSideLength> Ops; 00855 00856 typedef static_quadtree<swappable_block_identifier_type, 0> bt; 00857 00858 swappable_block_matrix_type a, b, c; 00859 00860 inline feedable_strassen_winograd_block_grained( 00861 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00862 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00863 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00864 : a(existing_a, n, l, a_from_row, a_from_col), 00865 b(existing_b, n, l, b_from_row, b_from_col), 00866 c(bs_c, n, m) {} 00867 00868 inline feedable_strassen_winograd_block_grained( 00869 const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col, 00870 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00871 : a(existing_a, n, l, a_from_row, a_from_col), 00872 b(bs_c, n, l), 00873 c(bs_c, n, m) {} 00874 00875 inline feedable_strassen_winograd_block_grained( 00876 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l, 00877 const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col) 00878 : a(bs_c, n, l), 00879 b(existing_b, n, l, b_from_row, b_from_col), 00880 c(bs_c, n, m) {} 00881 00882 inline feedable_strassen_winograd_block_grained( 00883 block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l) 00884 : a(bs_c, n, l), 00885 b(bs_c, n, l), 00886 c(bs_c, n, m) {} 00887 00888 inline void feed_a(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl) 00889 { 00890 if (! AExists) 00891 { 00892 // copy bl to a from (row, col) (assuming a from (row, col) == 0) 00893 swappable_block_matrix_type at(a, bl.get_height(), bl.get_width(), row, col); 00894 Ops::element_op(at, bl, typename Ops::addition()); 00895 } 00896 } 00897 00898 inline void feed_b(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl) 00899 { 00900 if (! BExists) 00901 { 00902 // copy bl(0,0) to b(row, col) (assuming b from (row, col) == 0) 00903 swappable_block_matrix_type bt(b, bl.get_height(), bl.get_width(), row, col); 00904 Ops::element_op(bt, bl, typename Ops::addition()); 00905 } 00906 } 00907 00908 inline void multiply() 00909 { 00910 matrix_operations<ValueType, BlockSideLength>:: 00911 multi_level_strassen_winograd_multiply_and_add_block_grained(a, b, c); 00912 if (! AExists) 00913 a.set_zero(); 00914 if (! BExists) 00915 b.set_zero(); 00916 } 00917 00918 00919 inline void read_and_add(const size_type & row, const size_type & col, swappable_block_matrix_type & bl) 00920 { 00921 // add c from (row, col) to bl 00922 swappable_block_matrix_type ct(c, bl.get_height(), bl.get_width(), row, col); 00923 Ops::element_op(bl, ct, typename Ops::addition()); 00924 ct.set_zero(); 00925 } 00926 00927 inline static unsigned_type get_num_temp_grains() 00928 { return 0; } 00929 }; 00930 00931 template <typename ValueType, unsigned BlockSideLength, unsigned Level, unsigned Granularity> 00932 struct matrix_to_quadtree_block_grained 00933 { 00934 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00935 typedef typename swappable_block_matrix_type::size_type size_type; 00936 00937 typedef matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level - 1, Granularity> smaller_matrix_to_quadtree_block_grained; 00938 00939 smaller_matrix_to_quadtree_block_grained ul, ur, dl, dr; 00940 00941 inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix) 00942 : ul(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, 0), 00943 ur(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, matrix.get_width()/2), 00944 dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, 0), 00945 dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2) 00946 { assert(! (matrix.get_height() % 2 | matrix.get_width() % 2)); } 00947 00948 inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix, 00949 const size_type height, const size_type width, const size_type from_row, const size_type from_col) 00950 : ul(matrix, height/2, width/2, from_row, from_col), 00951 ur(matrix, height/2, width/2, from_row, from_col + width/2), 00952 dl(matrix, height/2, width/2, from_row + height/2, from_col), 00953 dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2) 00954 { assert(! (height % 2 | width % 2)); } 00955 00956 inline swappable_block_matrix_type operator () (const size_type & row, const size_type & col) 00957 { 00958 return swappable_block_matrix_type(ul(row, col), ur(row, col), dl(row, col), dr(row, col)); 00959 } 00960 00961 inline const size_type get_height() 00962 { return ul.get_height(); } 00963 00964 inline const size_type get_width() 00965 { return ul.get_width(); } 00966 }; 00967 00968 template <typename ValueType, unsigned BlockSideLength, unsigned Granularity> 00969 struct matrix_to_quadtree_block_grained<ValueType, BlockSideLength, 0, Granularity> 00970 { 00971 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 00972 typedef typename swappable_block_matrix_type::size_type size_type; 00973 00974 swappable_block_matrix_type m; 00975 00976 inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix) 00977 : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0) 00978 { assert(! (matrix.get_height() % Granularity | matrix.get_width() % Granularity)); } 00979 00980 inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix, 00981 const size_type height, const size_type width, const size_type from_row, const size_type from_col) 00982 : m(matrix, height, width, from_row, from_col) 00983 { assert(! (matrix.get_height() % Granularity | matrix.get_width() % Granularity)); } 00984 00985 inline swappable_block_matrix_type operator () (const size_type & row, const size_type & col) 00986 { 00987 return swappable_block_matrix_type(m, Granularity, Granularity, row * Granularity, col * Granularity); 00988 } 00989 00990 inline const size_type get_height() 00991 { return m.get_height() / Granularity; } 00992 00993 inline const size_type get_width() 00994 { return m.get_width() / Granularity; } 00995 }; 00996 00997 template <typename ValueType, unsigned BlockSideLength> 00998 struct matrix_operations 00999 { 01000 // tuning-parameter: Only matrices larger than this (in blocks) are processed by Strassen-Winograd. 01001 // you have to adapt choose_level_for_feedable_sw, too 01002 static const int_type strassen_winograd_base_case_size; 01003 01004 typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type; 01005 typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type; 01006 typedef typename swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type; 01007 typedef typename block_scheduler_type::internal_block_type internal_block_type; 01008 typedef typename swappable_block_matrix_type::size_type size_type; 01009 typedef column_vector<ValueType> column_vector_type; 01010 typedef row_vector<ValueType> row_vector_type; 01011 typedef typename column_vector_type::size_type vector_size_type; 01012 01013 // +-+-+-+ addition +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 01014 01015 struct addition 01016 { 01017 /* op(c,a,b) means c = a <op> b e.g. assign sum 01018 * op(c,a) means c <op>= a e.g. add up 01019 * op(a) means <op>a e.g. sign 01020 * 01021 * it should hold: 01022 * op(c,0,0) equivalent c = 0 01023 * op(c=0,a) equivalent c = op(a) 01024 * op(c,0) equivalent {} 01025 */ 01026 01027 inline ValueType & operator () (ValueType & c, const ValueType & a, const ValueType & b) { return c = a + b; } 01028 inline ValueType & operator () (ValueType & c, const ValueType & a) { return c += a; } 01029 inline ValueType operator () (const ValueType & a) { return +a; } 01030 }; 01031 01032 struct subtraction 01033 { 01034 inline ValueType & operator () (ValueType & c, const ValueType & a, const ValueType & b) { return c = a - b; } 01035 inline ValueType & operator () (ValueType & c, const ValueType & a) { return c -= a; } 01036 inline ValueType operator () (const ValueType & a) { return -a; } 01037 }; 01038 01039 struct scalar_multiplication 01040 { 01041 inline scalar_multiplication(const ValueType scalar = 1) : s(scalar) {} 01042 inline ValueType & operator () (ValueType & c, const ValueType & a) { return c = a * s; } 01043 inline ValueType operator () (const ValueType & a) { return a * s; } 01044 inline operator const ValueType & () { return s; } 01045 const ValueType s; 01046 }; 01047 01048 // element_op<Op>(C,A,B) calculates C = A <Op> B 01049 template <class Op> static swappable_block_matrix_type & 01050 element_op(swappable_block_matrix_type & C, 01051 const swappable_block_matrix_type & A, 01052 const swappable_block_matrix_type & B, Op op = Op()) 01053 { 01054 for (size_type row = 0; row < C.get_height(); ++row) 01055 for (size_type col = 0; col < C.get_width(); ++col) 01056 element_op_swappable_block( 01057 C(row, col), C.is_transposed(), C.bs, 01058 A(row, col), A.is_transposed(), A.bs, 01059 B(row, col), B.is_transposed(), B.bs, op); 01060 return C; 01061 } 01062 01063 // element_op<Op>(C,A) calculates C <Op>= A 01064 template <class Op> static swappable_block_matrix_type & 01065 element_op(swappable_block_matrix_type & C, 01066 const swappable_block_matrix_type & A, Op op = Op()) 01067 { 01068 for (size_type row = 0; row < C.get_height(); ++row) 01069 for (size_type col = 0; col < C.get_width(); ++col) 01070 element_op_swappable_block( 01071 C(row, col), C.is_transposed(), C.bs, 01072 A(row, col), A.is_transposed(), A.bs, op); 01073 return C; 01074 } 01075 01076 // element_op<Op>(C) calculates C = <Op>C 01077 template <class Op> static swappable_block_matrix_type & 01078 element_op(swappable_block_matrix_type & C, Op op = Op()) 01079 { 01080 for (size_type row = 0; row < C.get_height(); ++row) 01081 for (size_type col = 0; col < C.get_width(); ++col) 01082 element_op_swappable_block( 01083 C(row, col), C.bs, op); 01084 return C; 01085 } 01086 01087 // calculates c = a <Op> b 01088 template <class Op> static void 01089 element_op_swappable_block( 01090 const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type & bs_c, 01091 const swappable_block_identifier_type a, bool a_is_transposed, block_scheduler_type & bs_a, 01092 const swappable_block_identifier_type b, bool b_is_transposed, block_scheduler_type & bs_b, Op op = Op()) 01093 { 01094 if (! bs_c.is_simulating()) 01095 ++ matrix_operation_statistic::get_instance()->block_addition_calls; 01096 // check if zero-block (== ! initialized) 01097 if (! bs_a.is_initialized(a) && ! bs_b.is_initialized(b)) 01098 { 01099 // => a and b are zero -> set c zero 01100 bs_c.deinitialize(c); 01101 if (! bs_c.is_simulating()) 01102 ++ matrix_operation_statistic::get_instance()->block_additions_saved_through_zero; 01103 return; 01104 } 01105 a_is_transposed = a_is_transposed != c_is_transposed; 01106 b_is_transposed = b_is_transposed != c_is_transposed; 01107 if (! bs_a.is_initialized(a)) 01108 { 01109 // a is zero -> copy b 01110 internal_block_type & ic = bs_c.acquire(c, true), 01111 & ib = bs_b.acquire(b); 01112 if (! bs_c.is_simulating()) 01113 { 01114 if (b_is_transposed) 01115 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, true, Op>(& ic[0], 0, & ib[0], op); 01116 else 01117 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, false, Op>(& ic[0], 0, & ib[0], op); 01118 } 01119 bs_b.release(b, false); 01120 bs_c.release(c, true); 01121 } 01122 else if (! bs_b.is_initialized(b)) 01123 { 01124 // b is zero -> copy a 01125 internal_block_type & ic = bs_c.acquire(c, true), 01126 & ia = bs_a.acquire(a); 01127 if (! bs_c.is_simulating()) 01128 { 01129 if (a_is_transposed) 01130 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, true, false, Op>(& ic[0], & ia[0], 0, op); 01131 else 01132 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, false, Op>(& ic[0], & ia[0], 0, op); 01133 } 01134 bs_a.release(a, false); 01135 bs_c.release(c, true); 01136 } 01137 else 01138 { 01139 internal_block_type & ic = bs_c.acquire(c, true), 01140 & ia = bs_a.acquire(a), 01141 & ib = bs_b.acquire(b); 01142 if (! bs_c.is_simulating()) 01143 { 01144 if (a_is_transposed) 01145 { 01146 if (b_is_transposed) 01147 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, true, true, Op>(& ic[0], & ia[0], & ib[0], op); 01148 else 01149 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, true, false, Op>(& ic[0], & ia[0], & ib[0], op); 01150 } 01151 else 01152 { 01153 if (b_is_transposed) 01154 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, true, Op>(& ic[0], & ia[0], & ib[0], op); 01155 else 01156 low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, false, Op>(& ic[0], & ia[0], & ib[0], op); 01157 } 01158 } 01159 bs_a.release(a, false); 01160 bs_b.release(b, false); 01161 bs_c.release(c, true); 01162 } 01163 } 01164 01165 // calculates c <op>= a 01166 template <class Op> static void 01167 element_op_swappable_block( 01168 const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type & bs_c, 01169 const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a, Op op = Op()) 01170 { 01171 if (! bs_c.is_simulating()) 01172 ++ matrix_operation_statistic::get_instance()->block_addition_calls; 01173 // check if zero-block (== ! initialized) 01174 if (! bs_a.is_initialized(a)) 01175 { 01176 // => b is zero => nothing to do 01177 if (! bs_c.is_simulating()) 01178 ++ matrix_operation_statistic::get_instance()->block_additions_saved_through_zero; 01179 return; 01180 } 01181 const bool c_is_zero = ! bs_c.is_initialized(c); 01182 // acquire 01183 internal_block_type & ic = bs_c.acquire(c, c_is_zero), 01184 & ia = bs_a.acquire(a); 01185 // add 01186 if (! bs_c.is_simulating()) 01187 { 01188 if (c_is_zero) 01189 if (c_is_transposed == a_is_transposed) 01190 low_level_matrix_unary_op<ValueType, BlockSideLength, false, Op>(& ic[0], & ia[0], op); 01191 else 01192 low_level_matrix_unary_op<ValueType, BlockSideLength, true, Op>(& ic[0], & ia[0], op); 01193 else 01194 if (c_is_transposed == a_is_transposed) 01195 low_level_matrix_unary_ass_op<ValueType, BlockSideLength, false, Op>(& ic[0], & ia[0], op); 01196 else 01197 low_level_matrix_unary_ass_op<ValueType, BlockSideLength, true, Op>(& ic[0], & ia[0], op); 01198 } 01199 // release 01200 bs_c.release(c, true); 01201 bs_a.release(a, false); 01202 } 01203 01204 // calculates c = <op>c 01205 template <class Op> static void 01206 element_op_swappable_block( 01207 const swappable_block_identifier_type c, block_scheduler_type & bs_c, Op op = Op()) 01208 { 01209 if (! bs_c.is_simulating()) 01210 ++ matrix_operation_statistic::get_instance()->block_addition_calls; 01211 // check if zero-block (== ! initialized) 01212 if (! bs_c.is_initialized(c)) 01213 { 01214 // => c is zero => nothing to do 01215 if (! bs_c.is_simulating()) 01216 ++ matrix_operation_statistic::get_instance()->block_additions_saved_through_zero; 01217 return; 01218 } 01219 // acquire 01220 internal_block_type & ic = bs_c.acquire(c); 01221 // add 01222 if (! bs_c.is_simulating()) 01223 low_level_matrix_unary_op<ValueType, BlockSideLength, false, Op>(& ic[0], & ic[0], op); 01224 // release 01225 bs_c.release(c, true); 01226 } 01227 01228 // additions for strassen-winograd 01229 01230 inline static void 01231 strassen_winograd_preaddition_a(swappable_block_matrix_type & a11, 01232 swappable_block_matrix_type & a12, 01233 swappable_block_matrix_type & a21, 01234 swappable_block_matrix_type & a22, 01235 swappable_block_matrix_type & s1, 01236 swappable_block_matrix_type & s2, 01237 swappable_block_matrix_type & s3, 01238 swappable_block_matrix_type & s4) 01239 { 01240 for (size_type row = 0; row < a11.get_height(); ++row) 01241 for (size_type col = 0; col < a11.get_width(); ++col) 01242 { 01243 op_swappable_block_nontransposed(s3, a11, subtraction(), a21, row, col); 01244 op_swappable_block_nontransposed(s1, a21, addition(), a22, row, col); 01245 op_swappable_block_nontransposed(s2, s1, subtraction(), a11, row, col); 01246 op_swappable_block_nontransposed(s4, a12, subtraction(), s2, row, col); 01247 } 01248 } 01249 01250 inline static void 01251 strassen_winograd_preaddition_b(swappable_block_matrix_type & b11, 01252 swappable_block_matrix_type & b12, 01253 swappable_block_matrix_type & b21, 01254 swappable_block_matrix_type & b22, 01255 swappable_block_matrix_type & t1, 01256 swappable_block_matrix_type & t2, 01257 swappable_block_matrix_type & t3, 01258 swappable_block_matrix_type & t4) 01259 { 01260 for (size_type row = 0; row < b11.get_height(); ++row) 01261 for (size_type col = 0; col < b11.get_width(); ++col) 01262 { 01263 op_swappable_block_nontransposed(t3, b22, subtraction(), b12, row, col); 01264 op_swappable_block_nontransposed(t1, b12, subtraction(), b11, row, col); 01265 op_swappable_block_nontransposed(t2, b22, subtraction(), t1, row, col); 01266 op_swappable_block_nontransposed(t4, b21, subtraction(), t2, row, col); 01267 } 01268 } 01269 01270 inline static void 01271 strassen_winograd_postaddition(swappable_block_matrix_type & c11, // = p2 01272 swappable_block_matrix_type & c12, // = p6 01273 swappable_block_matrix_type & c21, // = p7 01274 swappable_block_matrix_type & c22, // = p4 01275 swappable_block_matrix_type & p1, 01276 swappable_block_matrix_type & p3, 01277 swappable_block_matrix_type & p5) 01278 { 01279 for (size_type row = 0; row < c11.get_height(); ++row) 01280 for (size_type col = 0; col < c11.get_width(); ++col) 01281 { 01282 op_swappable_block_nontransposed(c11, addition(), p1, row, col); // (u1) 01283 op_swappable_block_nontransposed( p1, addition(), c22, row, col); // (u2) 01284 op_swappable_block_nontransposed( p5, addition(), p1, row, col); // (u3) 01285 op_swappable_block_nontransposed(c21, addition(), p5, row, col); // (u4) 01286 op_swappable_block_nontransposed(c22, p5, addition(), p3, row, col); // (u5) 01287 op_swappable_block_nontransposed( p1, addition(), p3, row, col); // (u6) 01288 op_swappable_block_nontransposed(c12, addition(), p1, row, col); // (u7) 01289 } 01290 } 01291 01292 // calculates c1 += a; c2 += a 01293 template <class Op> inline static void 01294 element_op_twice_nontransposed(swappable_block_matrix_type & c1, 01295 swappable_block_matrix_type & c2, 01296 const swappable_block_matrix_type & a, Op op = Op()) 01297 { 01298 for (size_type row = 0; row < a.get_height(); ++row) 01299 for (size_type col = 0; col < a.get_width(); ++col) 01300 { 01301 element_op_swappable_block( 01302 c1(row, col), false, c1.bs, 01303 a(row, col), false, a.bs, op); 01304 element_op_swappable_block( 01305 c2(row, col), false, c2.bs, 01306 a(row, col), false, a.bs, op); 01307 } 01308 } 01309 01310 template <class Op> inline static void 01311 op_swappable_block_nontransposed(swappable_block_matrix_type & c, 01312 swappable_block_matrix_type & a, Op op, swappable_block_matrix_type & b, 01313 size_type & row, size_type & col) 01314 { 01315 element_op_swappable_block( 01316 c(row, col), false, c.bs, 01317 a(row, col), false, a.bs, 01318 b(row, col), false, b.bs, op); 01319 } 01320 01321 template <class Op> inline static void 01322 op_swappable_block_nontransposed(swappable_block_matrix_type & c, Op op, swappable_block_matrix_type & a, 01323 size_type & row, size_type & col) 01324 { 01325 element_op_swappable_block( 01326 c(row, col), false, c.bs, 01327 a(row, col), false, a.bs, op); 01328 } 01329 01330 // +-+ end addition +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 01331 01332 // +-+-+-+ matrix multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 01333 01334 /* n, m and l denote the three dimensions of a matrix multiplication, according to the following ascii-art diagram: 01335 * 01336 * +--m--+ 01337 * +----l-----+ | | +--m--+ 01338 * | | | | | | 01339 * n A | • l B | = n C | 01340 * | | | | | | 01341 * +----------+ | | +-----+ 01342 * +-----+ 01343 * 01344 * The index-variables are called i, j, k for dimension 01345 * n, m, l . 01346 */ 01347 01348 // requires height and width divisible by 2 01349 struct swappable_block_matrix_quarterer 01350 { 01351 swappable_block_matrix_type upleft, upright, 01352 downleft, downright, 01353 & ul, & ur, & dl, & dr; 01354 01355 swappable_block_matrix_quarterer(const swappable_block_matrix_type & whole) 01356 : upleft (whole, whole.get_height()/2, whole.get_width()/2, 0, 0), 01357 upright (whole, whole.get_height()/2, whole.get_width()/2, 0, whole.get_width()/2), 01358 downleft (whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, 0), 01359 downright(whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, whole.get_width()/2), 01360 ul(upleft), ur(upright), dl(downleft), dr(downright) 01361 { assert(! (whole.get_height() % 2 | whole.get_width() % 2)); } 01362 }; 01363 01364 struct swappable_block_matrix_padding_quarterer 01365 { 01366 swappable_block_matrix_type upleft, upright, 01367 downleft, downright, 01368 & ul, & ur, & dl, & dr; 01369 01370 swappable_block_matrix_padding_quarterer(const swappable_block_matrix_type & whole) 01371 : upleft (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), 0, 0), 01372 upright (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), 0, div_ceil(whole.get_width(),2)), 01373 downleft (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), div_ceil(whole.get_height(),2), 0), 01374 downright(whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2)), 01375 ul(upleft), ur(upright), dl(downleft), dr(downright) {} 01376 }; 01377 01378 struct swappable_block_matrix_approximative_quarterer 01379 { 01380 swappable_block_matrix_type upleft, upright, 01381 downleft, downright, 01382 & ul, & ur, & dl, & dr; 01383 01384 swappable_block_matrix_approximative_quarterer(const swappable_block_matrix_type & whole) 01385 : upleft (whole, whole.get_height()/2, whole.get_width()/2, 0, 0), 01386 upright (whole, whole.get_height()/2, whole.get_width() - whole.get_width()/2, 0, whole.get_width()/2), 01387 downleft (whole, whole.get_height() - whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, 0), 01388 downright(whole, whole.get_height() - whole.get_height()/2, whole.get_width() - whole.get_width()/2, whole.get_height()/2, whole.get_width()/2), 01389 ul(upleft), ur(upright), dl(downleft), dr(downright) {} 01390 }; 01391 01392 //! \brief calculates C = A * B + C 01393 // requires fitting dimensions 01394 static swappable_block_matrix_type & 01395 multi_level_strassen_winograd_multiply_and_add_block_grained(const swappable_block_matrix_type & A, 01396 const swappable_block_matrix_type & B, 01397 swappable_block_matrix_type & C) 01398 { 01399 int_type num_levels = log2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height()))); 01400 if (num_levels > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 01401 { 01402 if (num_levels > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS) 01403 num_levels = STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS; 01404 swappable_block_matrix_type padded_a(A, round_up_to_power_of_two(A.get_height(), num_levels), 01405 round_up_to_power_of_two(A.get_width(), num_levels), 0, 0), 01406 padded_b(B, round_up_to_power_of_two(B.get_height(), num_levels), 01407 round_up_to_power_of_two(B.get_width(), num_levels), 0, 0), 01408 padded_c(C, round_up_to_power_of_two(C.get_height(), num_levels), 01409 round_up_to_power_of_two(C.get_width(), num_levels), 0, 0); 01410 switch (num_levels) 01411 { 01412 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 5 && 5 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 01413 case 5: 01414 use_feedable_sw_block_grained<5>(padded_a, padded_a, padded_c); 01415 break; 01416 #endif 01417 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 4 && 4 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 01418 case 4: 01419 use_feedable_sw_block_grained<4>(padded_a, padded_a, padded_c); 01420 break; 01421 #endif 01422 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 3 && 3 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 01423 case 3: 01424 use_feedable_sw_block_grained<3>(padded_a, padded_a, padded_c); 01425 break; 01426 #endif 01427 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 2 && 2 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 01428 case 2: 01429 use_feedable_sw_block_grained<2>(padded_a, padded_a, padded_c); 01430 break; 01431 #endif 01432 default: // only here in case of wrong bounds 01433 strassen_winograd_multiply_and_add_interleaved(A, B, C); 01434 break; 01435 } 01436 } 01437 else 01438 // base case 01439 strassen_winograd_multiply_and_add_interleaved(A, B, C); 01440 return C; 01441 } 01442 01443 // input matrices have to be padded 01444 template <unsigned Level> 01445 static void use_feedable_sw_block_grained(const swappable_block_matrix_type & A, 01446 const swappable_block_matrix_type & B, 01447 swappable_block_matrix_type & C) 01448 { 01449 const unsigned granularity = 1; 01450 01451 feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level, true, true> 01452 fsw(A, 0, 0, C.bs, C.get_height(), C.get_width(), A.get_width(), B, 0, 0); 01453 // preadditions for A 01454 { 01455 matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level, granularity> 01456 mtq_a (A); 01457 for (size_type row = 0; row < mtq_a.get_height(); ++row) 01458 for (size_type col = 0; col < mtq_a.get_width(); ++col) 01459 fsw.feed_a(row, col, mtq_a(row, col)); 01460 } 01461 // preadditions for B 01462 { 01463 matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level, granularity> 01464 mtq_b (B); 01465 for (size_type row = 0; row < mtq_b.get_height(); ++row) 01466 for (size_type col = 0; col < mtq_b.get_width(); ++col) 01467 fsw.feed_b(row, col, mtq_b(row, col)); 01468 } 01469 // recursive multiplications 01470 fsw.multiply(); 01471 // postadditions 01472 { 01473 matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level, granularity> 01474 mtq_c (C); 01475 for (size_type row = 0; row < mtq_c.get_height(); ++row) 01476 for (size_type col = 0; col < mtq_c.get_width(); ++col) 01477 fsw.read_and_add(row, col, mtq_c(row, col)); 01478 } 01479 } 01480 01481 //! \brief calculates C = A * B + C 01482 // requires fitting dimensions 01483 static swappable_block_matrix_type & 01484 multi_level_strassen_winograd_multiply_and_add(const swappable_block_matrix_type & A, 01485 const swappable_block_matrix_type & B, 01486 swappable_block_matrix_type & C) 01487 { 01488 int_type p = log2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height()))); 01489 01490 swappable_block_matrix_type padded_a(A, round_up_to_power_of_two(A.get_height(), p), 01491 round_up_to_power_of_two(A.get_width(), p), 0, 0), 01492 padded_b(B, round_up_to_power_of_two(B.get_height(), p), 01493 round_up_to_power_of_two(B.get_width(), p), 0, 0), 01494 padded_c(C, round_up_to_power_of_two(C.get_height(), p), 01495 round_up_to_power_of_two(C.get_width(), p), 0, 0); 01496 choose_level_for_feedable_sw(padded_a, padded_b, padded_c); 01497 return C; 01498 } 01499 01500 // input matrices have to be padded 01501 static void choose_level_for_feedable_sw(const swappable_block_matrix_type & A, 01502 const swappable_block_matrix_type & B, 01503 swappable_block_matrix_type & C) 01504 { 01505 switch (log2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height())))) 01506 { 01507 default: 01508 /* 01509 use_feedable_sw<4>(A, B, C); 01510 break; 01511 case 3: 01512 use_feedable_sw<3>(A, B, C); 01513 break; 01514 case 2:*/ 01515 use_feedable_sw<2>(A, B, C); 01516 break; 01517 case 1: 01518 /*use_feedable_sw<1>(A, B, C); 01519 break;*/ 01520 case 0: 01521 // base case 01522 recursive_multiply_and_add(A, B, C); 01523 break; 01524 } 01525 } 01526 01527 // input matrices have to be padded 01528 template <unsigned Level> 01529 static void use_feedable_sw(const swappable_block_matrix_type & A, 01530 const swappable_block_matrix_type & B, 01531 swappable_block_matrix_type & C) 01532 { 01533 feedable_strassen_winograd<ValueType, BlockSideLength, Level, true, true> 01534 fsw(A, 0, 0, C.bs, C.get_height(), C.get_width(), A.get_width(), B, 0, 0); 01535 // preadditions for A 01536 matrix_to_quadtree<ValueType, BlockSideLength, Level> 01537 mtq_a (A); 01538 for (size_type block_row = 0; block_row < mtq_a.get_height_in_blocks(); ++block_row) 01539 for (size_type block_col = 0; block_col < mtq_a.get_width_in_blocks(); ++block_col) 01540 { 01541 fsw.begin_feeding_a_block(block_row, block_col, 01542 mtq_a.begin_reading_block(block_row, block_col)); 01543 #if STXXL_PARALLEL 01544 #pragma omp parallel for 01545 #endif 01546 for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block) 01547 for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block) 01548 fsw.feed_a_element(element_row_in_block * BlockSideLength + element_col_in_block, 01549 mtq_a.read_element(element_row_in_block * BlockSideLength + element_col_in_block)); 01550 fsw.end_feeding_a_block(block_row, block_col, 01551 mtq_a.end_reading_block(block_row, block_col)); 01552 } 01553 // preadditions for B 01554 matrix_to_quadtree<ValueType, BlockSideLength, Level> 01555 mtq_b (B); 01556 for (size_type block_row = 0; block_row < mtq_b.get_height_in_blocks(); ++block_row) 01557 for (size_type block_col = 0; block_col < mtq_b.get_width_in_blocks(); ++block_col) 01558 { 01559 fsw.begin_feeding_b_block(block_row, block_col, 01560 mtq_b.begin_reading_block(block_row, block_col)); 01561 #if STXXL_PARALLEL 01562 #pragma omp parallel for 01563 #endif 01564 for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block) 01565 for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block) 01566 fsw.feed_b_element(element_row_in_block * BlockSideLength + element_col_in_block, 01567 mtq_b.read_element(element_row_in_block * BlockSideLength + element_col_in_block)); 01568 fsw.end_feeding_b_block(block_row, block_col, 01569 mtq_b.end_reading_block(block_row, block_col)); 01570 } 01571 // recursive multiplications 01572 fsw.multiply(); 01573 // postadditions 01574 matrix_to_quadtree<ValueType, BlockSideLength, Level> 01575 mtq_c (C); 01576 for (size_type block_row = 0; block_row < mtq_c.get_height_in_blocks(); ++block_row) 01577 for (size_type block_col = 0; block_col < mtq_c.get_width_in_blocks(); ++block_col) 01578 { 01579 mtq_c.begin_feeding_block(block_row, block_col, 01580 fsw.begin_reading_block(block_row, block_col)); 01581 #if STXXL_PARALLEL 01582 #pragma omp parallel for 01583 #endif 01584 for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block) 01585 for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block) 01586 mtq_c.feed_and_add_element(element_row_in_block * BlockSideLength + element_col_in_block, 01587 fsw.read_element(element_row_in_block * BlockSideLength + element_col_in_block)); 01588 mtq_c.end_feeding_block(block_row, block_col, 01589 fsw.end_reading_block(block_row, block_col)); 01590 } 01591 } 01592 01593 //! \brief calculates C = A * B 01594 // assumes fitting dimensions 01595 static swappable_block_matrix_type & 01596 strassen_winograd_multiply(const swappable_block_matrix_type & A, 01597 const swappable_block_matrix_type & B, 01598 swappable_block_matrix_type & C) 01599 { 01600 // base case 01601 if (C.get_height() <= strassen_winograd_base_case_size 01602 || C.get_width() <= strassen_winograd_base_case_size 01603 || A.get_width() <= strassen_winograd_base_case_size) 01604 { 01605 C.set_zero(); 01606 return recursive_multiply_and_add(A, B, C); 01607 } 01608 01609 // partition matrix 01610 swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C); 01611 // preadditions 01612 swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01613 s2(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01614 s3(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01615 s4(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01616 t1(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()), 01617 t2(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()), 01618 t3(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()), 01619 t4(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()); 01620 strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4); 01621 strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4); 01622 // recursive multiplications 01623 swappable_block_matrix_type p1(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed()), 01624 // p2 stored in qc.ul 01625 p3(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed()), 01626 // p4 stored in qc.dr 01627 p5(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed()); 01628 // p6 stored in qc.ur 01629 // p7 stored in qc.dl 01630 strassen_winograd_multiply(qa.ul, qb.ul, p1); 01631 strassen_winograd_multiply(qa.ur, qb.dl, qc.ul); 01632 strassen_winograd_multiply( s1, t1, p3); 01633 strassen_winograd_multiply( s2, t2, qc.dr); 01634 strassen_winograd_multiply( s3, t3, p5); 01635 strassen_winograd_multiply( s4, qb.dr, qc.ur); 01636 strassen_winograd_multiply(qa.dr, t4, qc.dl); 01637 // postadditions 01638 strassen_winograd_postaddition(qc.ul, qc.ur, qc.dl, qc.dr, p1, p3, p5); 01639 return C; 01640 } 01641 01642 //! \brief calculates C = A * B + C 01643 // assumes fitting dimensions 01644 static swappable_block_matrix_type & 01645 strassen_winograd_multiply_and_add_interleaved(const swappable_block_matrix_type & A, 01646 const swappable_block_matrix_type & B, 01647 swappable_block_matrix_type & C) 01648 { 01649 // base case 01650 if (C.get_height() <= strassen_winograd_base_case_size 01651 || C.get_width() <= strassen_winograd_base_case_size 01652 || A.get_width() <= strassen_winograd_base_case_size) 01653 return recursive_multiply_and_add(A, B, C); 01654 01655 // partition matrix 01656 swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C); 01657 // preadditions 01658 swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01659 s2(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01660 s3(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01661 s4(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()), 01662 t1(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()), 01663 t2(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()), 01664 t3(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()), 01665 t4(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()); 01666 strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4); 01667 strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4); 01668 // recursive multiplications and postadditions 01669 swappable_block_matrix_type px(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed()); 01670 strassen_winograd_multiply_and_add_interleaved(qa.ur, qb.dl, qc.ul); // p2 01671 strassen_winograd_multiply_and_add_interleaved(qa.ul, qb.ul, px); // p1 01672 element_op<addition>(qc.ul, px); 01673 strassen_winograd_multiply_and_add_interleaved(s2, t2, px); // p4 01674 s2.set_zero(); 01675 t2.set_zero(); 01676 element_op<addition>(qc.ur, px); 01677 strassen_winograd_multiply_and_add_interleaved(s3, t3, px); // p5 01678 s3.set_zero(); 01679 t3.set_zero(); 01680 element_op_twice_nontransposed<addition>(qc.dl, qc.dr, px); 01681 px.set_zero(); 01682 strassen_winograd_multiply_and_add_interleaved(qa.dr, t4, qc.dl); // p7 01683 t4.set_zero(); 01684 strassen_winograd_multiply_and_add_interleaved(s1, t1, px); // p3 01685 s1.set_zero(); 01686 t1.set_zero(); 01687 element_op_twice_nontransposed<addition>(qc.dr, qc.ur, px); 01688 px.set_zero(); 01689 strassen_winograd_multiply_and_add_interleaved(s4, qb.dr, qc.ur); // p6 01690 return C; 01691 } 01692 01693 //! \brief calculates C = A * B + C 01694 // assumes fitting dimensions 01695 static swappable_block_matrix_type & 01696 strassen_winograd_multiply_and_add(const swappable_block_matrix_type & A, 01697 const swappable_block_matrix_type & B, 01698 swappable_block_matrix_type & C) 01699 { 01700 // base case 01701 if (C.get_height() <= strassen_winograd_base_case_size 01702 || C.get_width() <= strassen_winograd_base_case_size 01703 || A.get_width() <= strassen_winograd_base_case_size) 01704 return recursive_multiply_and_add(A, B, C); 01705 01706 // partition matrix 01707 swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C); 01708 // preadditions 01709 swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width()), 01710 s2(C.bs, qa.ul.get_height(), qa.ul.get_width()), 01711 s3(C.bs, qa.ul.get_height(), qa.ul.get_width()), 01712 s4(C.bs, qa.ul.get_height(), qa.ul.get_width()), 01713 t1(C.bs, qb.ul.get_height(), qb.ul.get_width()), 01714 t2(C.bs, qb.ul.get_height(), qb.ul.get_width()), 01715 t3(C.bs, qb.ul.get_height(), qb.ul.get_width()), 01716 t4(C.bs, qb.ul.get_height(), qb.ul.get_width()); 01717 element_op<subtraction>(s3, qa.ul, qa.dl); 01718 element_op<addition>(s1, qa.dl, qa.dr); 01719 element_op<subtraction>(s2, s1, qa.ul); 01720 element_op<subtraction>(s4, qa.ur, s2); 01721 element_op<subtraction>(t3, qb.dr, qb.ur); 01722 element_op<subtraction>(t1, qb.ur, qb.ul); 01723 element_op<subtraction>(t2, qb.dr, t1); 01724 element_op<subtraction>(t4, qb.dl, t2); 01725 // recursive multiplications and postadditions 01726 swappable_block_matrix_type px(C.bs, qc.ul.get_height(), qc.ul.get_width()); 01727 strassen_winograd_multiply_and_add(qa.ur, qb.dl, qc.ul); // p2 01728 strassen_winograd_multiply_and_add(qa.ul, qb.ul, px); // p1 01729 element_op<addition>(qc.ul, px); 01730 strassen_winograd_multiply_and_add(s2, t2, px); // p4 01731 element_op<addition>(qc.ur, px); 01732 strassen_winograd_multiply_and_add(s3, t3, px); // p5 01733 element_op<addition>(qc.dl, px); 01734 element_op<addition>(qc.dr, px); 01735 px.set_zero(); 01736 strassen_winograd_multiply_and_add(qa.dr, t4, qc.dl); // p7 01737 strassen_winograd_multiply_and_add(s1, t1, px); // p3 01738 element_op<addition>(qc.dr, px); 01739 element_op<addition>(qc.ur, px); 01740 strassen_winograd_multiply_and_add(s4, qb.dr, qc.ur); // p6 01741 return C; 01742 } 01743 01744 //! \brief calculates C = A * B + C 01745 // assumes fitting dimensions 01746 static swappable_block_matrix_type & 01747 recursive_multiply_and_add(const swappable_block_matrix_type & A, 01748 const swappable_block_matrix_type & B, 01749 swappable_block_matrix_type & C) 01750 { 01751 // catch empty intervals 01752 if (C.get_height() * C.get_width() * A.get_width() == 0) 01753 return C; 01754 // base case 01755 if ((C.get_height() == 1) + (C.get_width() == 1) + (A.get_width() == 1) >= 2) 01756 return naive_multiply_and_add(A, B, C); 01757 01758 // partition matrix 01759 swappable_block_matrix_approximative_quarterer qa(A), qb(B), qc(C); 01760 // recursive multiplication 01761 // The order of recursive calls is optimized to enhance locality. C has priority because it has to be read and written. 01762 recursive_multiply_and_add(qa.ul, qb.ul, qc.ul); 01763 recursive_multiply_and_add(qa.ur, qb.dl, qc.ul); 01764 recursive_multiply_and_add(qa.ur, qb.dr, qc.ur); 01765 recursive_multiply_and_add(qa.ul, qb.ur, qc.ur); 01766 recursive_multiply_and_add(qa.dl, qb.ur, qc.dr); 01767 recursive_multiply_and_add(qa.dr, qb.dr, qc.dr); 01768 recursive_multiply_and_add(qa.dr, qb.dl, qc.dl); 01769 recursive_multiply_and_add(qa.dl, qb.ul, qc.dl); 01770 01771 return C; 01772 } 01773 01774 //! \brief calculates C = A * B + C 01775 // requires fitting dimensions 01776 static swappable_block_matrix_type & 01777 naive_multiply_and_add(const swappable_block_matrix_type & A, 01778 const swappable_block_matrix_type & B, 01779 swappable_block_matrix_type & C) 01780 { 01781 const size_type & n = C.get_height(), 01782 & m = C.get_width(), 01783 & l = A.get_width(); 01784 for (size_type i = 0; i < n; ++i) 01785 for (size_type j = 0; j < m; ++j) 01786 for (size_type k = 0; k < l; ++k) 01787 multiply_and_add_swappable_block(A(i,k), A.is_transposed(), A.bs, 01788 B(k,j), B.is_transposed(), B.bs, 01789 C(i,j), C.is_transposed(), C.bs); 01790 return C; 01791 } 01792 01793 static void multiply_and_add_swappable_block( 01794 const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a, 01795 const swappable_block_identifier_type b, const bool b_is_transposed, block_scheduler_type & bs_b, 01796 const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type & bs_c) 01797 { 01798 if (! bs_c.is_simulating()) 01799 ++ matrix_operation_statistic::get_instance()->block_multiplication_calls; 01800 // check if zero-block (== ! initialized) 01801 if (! bs_a.is_initialized(a) || ! bs_b.is_initialized(b)) 01802 { 01803 // => one factor is zero => product is zero 01804 if (! bs_c.is_simulating()) 01805 ++ matrix_operation_statistic::get_instance()->block_multiplications_saved_through_zero; 01806 return; 01807 } 01808 // acquire 01809 ValueType * ap = bs_a.acquire(a).begin(), 01810 * bp = bs_b.acquire(b).begin(), 01811 * cp = bs_c.acquire(c).begin(); 01812 // multiply 01813 if (! bs_c.is_simulating()) 01814 low_level_matrix_multiply_and_add<ValueType, BlockSideLength> 01815 (ap, a_is_transposed, bp, b_is_transposed, cp, c_is_transposed); 01816 // release 01817 bs_a.release(a, false); 01818 bs_b.release(b, false); 01819 bs_c.release(c, true); 01820 } 01821 01822 // +-+ end matrix multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 01823 01824 // +-+-+-+ matrix-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 01825 01826 //! \brief calculates z = A * x 01827 static column_vector_type & 01828 recursive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type & A, 01829 const column_vector_type & x, column_vector_type & z, 01830 const vector_size_type offset_x = 0, const vector_size_type offset_z = 0) 01831 { 01832 // catch empty intervals 01833 if (A.get_height() * A.get_width() == 0) 01834 return z; 01835 // base case 01836 if (A.get_height() == 1 || A.get_width() == 1) 01837 return naive_matrix_col_vector_multiply_and_add(A, x, z, offset_x, offset_z); 01838 01839 // partition matrix 01840 swappable_block_matrix_approximative_quarterer qa(A); 01841 // recursive multiplication 01842 // The order of recursive calls is optimized to enhance locality. 01843 recursive_matrix_col_vector_multiply_and_add(qa.ul, x, z, offset_x, offset_z ); 01844 recursive_matrix_col_vector_multiply_and_add(qa.ur, x, z, offset_x + qa.ul.get_width(), offset_z ); 01845 recursive_matrix_col_vector_multiply_and_add(qa.dr, x, z, offset_x + qa.ul.get_width(), offset_z + qa.ul.get_height()); 01846 recursive_matrix_col_vector_multiply_and_add(qa.dl, x, z, offset_x, offset_z + qa.ul.get_height()); 01847 01848 return z; 01849 } 01850 01851 static column_vector_type & 01852 naive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type & A, 01853 const column_vector_type & x, column_vector_type & z, 01854 const vector_size_type offset_x = 0, const vector_size_type offset_z = 0) 01855 { 01856 for (size_type row = 0; row < A.get_height(); ++row) 01857 for (size_type col = 0; col < A.get_width(); ++col) 01858 matrix_col_vector_multiply_and_add_swappable_block(A(row, col), A.is_transposed(), A.bs, 01859 x, z, (offset_x + col) * BlockSideLength, (offset_z + row) * BlockSideLength); 01860 return z; 01861 } 01862 01863 static void matrix_col_vector_multiply_and_add_swappable_block( 01864 const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a, 01865 const column_vector_type & x, column_vector_type & z, 01866 const vector_size_type offset_x = 0, const vector_size_type offset_z = 0) 01867 { 01868 // check if zero-block (== ! initialized) 01869 if (! bs_a.is_initialized(a)) 01870 { 01871 // => matrix is zero => product is zero 01872 return; 01873 } 01874 // acquire 01875 internal_block_type & ia = bs_a.acquire(a); 01876 // multiply 01877 if (! bs_a.is_simulating()) 01878 { 01879 int_type row_limit = std::min(BlockSideLength, unsigned(z.size() - offset_z)), 01880 col_limit = std::min(BlockSideLength, unsigned(x.size() - offset_x)); 01881 if (a_is_transposed) 01882 for (int_type col = 0; col < col_limit; ++col) 01883 for (int_type row = 0; row < row_limit; ++row) 01884 z[offset_z + row] += x[offset_x + col] * ia[row + col * BlockSideLength]; 01885 else 01886 for (int_type row = 0; row < row_limit; ++row) 01887 for (int_type col = 0; col < col_limit; ++col) 01888 z[offset_z + row] += x[offset_x + col] * ia[row * BlockSideLength + col]; 01889 } 01890 // release 01891 bs_a.release(a, false); 01892 } 01893 01894 //! \brief calculates z = y * A 01895 static row_vector_type & 01896 recursive_matrix_row_vector_multiply_and_add(const row_vector_type & y, 01897 const swappable_block_matrix_type & A, row_vector_type & z, 01898 const vector_size_type offset_y = 0, const vector_size_type offset_z = 0) 01899 { 01900 // catch empty intervals 01901 if (A.get_height() * A.get_width() == 0) 01902 return z; 01903 // base case 01904 if (A.get_height() == 1 || A.get_width() == 1) 01905 return naive_matrix_row_vector_multiply_and_add(y, A, z, offset_y, offset_z); 01906 01907 // partition matrix 01908 swappable_block_matrix_approximative_quarterer qa(A); 01909 // recursive multiplication 01910 // The order of recursive calls is optimized to enhance locality. 01911 recursive_matrix_row_vector_multiply_and_add(y, qa.ul, z, offset_y, offset_z ); 01912 recursive_matrix_row_vector_multiply_and_add(y, qa.dl, z, offset_y + qa.ul.get_height(), offset_z ); 01913 recursive_matrix_row_vector_multiply_and_add(y, qa.dr, z, offset_y + qa.ul.get_height(), offset_z + qa.ul.get_width()); 01914 recursive_matrix_row_vector_multiply_and_add(y, qa.ur, z, offset_y, offset_z + qa.ul.get_width()); 01915 01916 return z; 01917 } 01918 01919 static row_vector_type & 01920 naive_matrix_row_vector_multiply_and_add(const row_vector_type & y, const swappable_block_matrix_type & A, 01921 row_vector_type & z, 01922 const vector_size_type offset_y = 0, const vector_size_type offset_z = 0) 01923 { 01924 for (size_type row = 0; row < A.get_height(); ++row) 01925 for (size_type col = 0; col < A.get_width(); ++col) 01926 matrix_row_vector_multiply_and_add_swappable_block(y, A(row, col), A.is_transposed(), A.bs, 01927 z, (offset_y + row) * BlockSideLength, (offset_z + col) * BlockSideLength); 01928 return z; 01929 } 01930 01931 static void matrix_row_vector_multiply_and_add_swappable_block(const row_vector_type & y, 01932 const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a, 01933 row_vector_type & z, 01934 const vector_size_type offset_y = 0, const vector_size_type offset_z = 0) 01935 { 01936 // check if zero-block (== ! initialized) 01937 if (! bs_a.is_initialized(a)) 01938 { 01939 // => matrix is zero => product is zero 01940 return; 01941 } 01942 // acquire 01943 internal_block_type & ia = bs_a.acquire(a); 01944 // multiply 01945 if (! bs_a.is_simulating()) 01946 { 01947 int_type row_limit = std::min(BlockSideLength, unsigned(y.size() - offset_y)), 01948 col_limit = std::min(BlockSideLength, unsigned(z.size() - offset_z)); 01949 if (a_is_transposed) 01950 for (int_type col = 0; col < col_limit; ++col) 01951 for (int_type row = 0; row < row_limit; ++row) 01952 z[offset_z + col] += ia[row + col * BlockSideLength] * y[offset_y + row]; 01953 else 01954 for (int_type row = 0; row < row_limit; ++row) 01955 for (int_type col = 0; col < col_limit; ++col) 01956 z[offset_z + col] += ia[row * BlockSideLength + col] * y[offset_y + row]; 01957 } 01958 // release 01959 bs_a.release(a, false); 01960 } 01961 01962 // +-+ end matrix-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 01963 01964 // +-+-+-+ vector-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 01965 01966 static void recursive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type & l, 01967 const row_vector_type & r, vector_size_type offset_l = 0, vector_size_type offset_r = 0) 01968 { 01969 // catch empty intervals 01970 if (A.get_height() * A.get_width() == 0) 01971 return; 01972 // base case 01973 if (A.get_height() == 1 || A.get_width() == 1) 01974 { 01975 naive_matrix_from_vectors(A, l, r, offset_l, offset_r); 01976 return; 01977 } 01978 01979 // partition matrix 01980 swappable_block_matrix_approximative_quarterer qa(A); 01981 // recursive creation 01982 // The order of recursive calls is optimized to enhance locality. 01983 recursive_matrix_from_vectors(qa.ul, l, r, offset_l, offset_r ); 01984 recursive_matrix_from_vectors(qa.ur, l, r, offset_l, offset_r + qa.ul.get_width()); 01985 recursive_matrix_from_vectors(qa.dr, l, r, offset_l + qa.ul.get_height(), offset_r + qa.ul.get_width()); 01986 recursive_matrix_from_vectors(qa.dl, l, r, offset_l + qa.ul.get_height(), offset_r ); 01987 } 01988 01989 static void naive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type & l, 01990 const row_vector_type & r, vector_size_type offset_l = 0, vector_size_type offset_r = 0) 01991 { 01992 for (size_type row = 0; row < A.get_height(); ++row) 01993 for (size_type col = 0; col < A.get_width(); ++col) 01994 matrix_from_vectors_swappable_block(A(row, col), A.is_transposed(), A.bs, 01995 l, r, (offset_l + row) * BlockSideLength, (offset_r + col) * BlockSideLength); 01996 } 01997 01998 static void matrix_from_vectors_swappable_block(swappable_block_identifier_type a, 01999 const bool a_is_transposed, block_scheduler_type & bs_a, 02000 const column_vector_type & l, const row_vector_type & r, 02001 vector_size_type offset_l, vector_size_type offset_r) 02002 { 02003 // acquire 02004 internal_block_type & ia = bs_a.acquire(a, true); 02005 // multiply 02006 if (! bs_a.is_simulating()) 02007 { 02008 int_type row_limit = std::min(BlockSideLength, unsigned(l.size() - offset_l)), 02009 col_limit = std::min(BlockSideLength, unsigned(r.size() - offset_r)); 02010 if (a_is_transposed) 02011 for (int_type col = 0; col < col_limit; ++col) 02012 for (int_type row = 0; row < row_limit; ++row) 02013 ia[row + col * BlockSideLength] = l[row + offset_l] * r[col + offset_r]; 02014 else 02015 for (int_type row = 0; row < row_limit; ++row) 02016 for (int_type col = 0; col < col_limit; ++col) 02017 ia[row * BlockSideLength + col] = l[row + offset_l] * r[col + offset_r]; 02018 } 02019 // release 02020 bs_a.release(a, true); 02021 } 02022 02023 // +-+ end vector-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 02024 }; 02025 02026 // Adjust choose_level_for_feedable_sw, too! 02027 template <typename ValueType, unsigned BlockSideLength> 02028 const int_type matrix_operations<ValueType, BlockSideLength>::strassen_winograd_base_case_size = 3; 02029 02030 __STXXL_END_NAMESPACE 02031 02032 #endif /* STXXL_MATRIX_ARITHMETIC_HEADER */