Stxxl  1.4.0
include/stxxl/bits/containers/matrix_arithmetic.h
Go to the documentation of this file.
00001 /***************************************************************************
00002  *  include/stxxl/bits/containers/matrix_arithmetic.h
00003  *
00004  *  Part of the STXXL. See http://stxxl.sourceforge.net
00005  *
00006  *  Copyright (C) 2010-2011 Raoul Steffen <R-Steffen@gmx.de>
00007  *
00008  *  Distributed under the Boost Software License, Version 1.0.
00009  *  (See accompanying file LICENSE_1_0.txt or copy at
00010  *  http://www.boost.org/LICENSE_1_0.txt)
00011  **************************************************************************/
00012 
00013 #ifndef STXXL_MATRIX_ARITHMETIC_HEADER
00014 #define STXXL_MATRIX_ARITHMETIC_HEADER
00015 
00016 #include <stxxl/bits/mng/mng.h>
00017 #include <stxxl/bits/containers/matrix_low_level.h>
00018 
00019 __STXXL_BEGIN_NAMESPACE
00020 
00021 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS
00022 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS 3
00023 #endif
00024 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE
00025 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE 2
00026 #endif
00027 
00028 template <typename ValueType>
00029 class column_vector;
00030 
00031 template <typename ValueType>
00032 class row_vector;
00033 
00034 template <typename ValueType, unsigned BlockSideLength>
00035 class swappable_block_matrix;
00036 
00037 struct matrix_operation_statistic_dataset
00038 {
00039     int_type block_multiplication_calls,
00040              block_multiplications_saved_through_zero,
00041              block_addition_calls,
00042              block_additions_saved_through_zero;
00043 
00044     matrix_operation_statistic_dataset()
00045         : block_multiplication_calls(0),
00046           block_multiplications_saved_through_zero(0),
00047           block_addition_calls(0),
00048           block_additions_saved_through_zero(0) {}
00049 
00050     matrix_operation_statistic_dataset operator + (const matrix_operation_statistic_dataset & stat)
00051     {
00052         matrix_operation_statistic_dataset res(*this);
00053         res.block_multiplication_calls += stat.block_multiplication_calls;
00054         res.block_multiplications_saved_through_zero += stat.block_multiplications_saved_through_zero;
00055         res.block_addition_calls += stat.block_addition_calls;
00056         res.block_additions_saved_through_zero += stat.block_additions_saved_through_zero;
00057         return res;
00058     }
00059 
00060     matrix_operation_statistic_dataset operator - (const matrix_operation_statistic_dataset & stat)
00061     {
00062         matrix_operation_statistic_dataset res(*this);
00063         res.block_multiplication_calls -= stat.block_multiplication_calls;
00064         res.block_multiplications_saved_through_zero -= stat.block_multiplications_saved_through_zero;
00065         res.block_addition_calls -= stat.block_addition_calls;
00066         res.block_additions_saved_through_zero -= stat.block_additions_saved_through_zero;
00067         return res;
00068     }
00069 };
00070 
00071 struct matrix_operation_statistic
00072     : public singleton<matrix_operation_statistic>, public matrix_operation_statistic_dataset
00073 {};
00074 
00075 struct matrix_operation_statistic_data : public matrix_operation_statistic_dataset
00076 {
00077     matrix_operation_statistic_data(const matrix_operation_statistic & stat = * matrix_operation_statistic::get_instance())
00078         : matrix_operation_statistic_dataset(stat) {}
00079 
00080     matrix_operation_statistic_data(const matrix_operation_statistic_dataset & stat)
00081         : matrix_operation_statistic_dataset(stat) {}
00082 
00083     matrix_operation_statistic_data & operator = (const matrix_operation_statistic & stat)
00084     {
00085         return *this = matrix_operation_statistic_data(stat);
00086     }
00087 
00088     void set()
00089     { operator = (* matrix_operation_statistic::get_instance()); }
00090 
00091     matrix_operation_statistic_data operator + (const matrix_operation_statistic_data & stat)
00092     { return matrix_operation_statistic_data(matrix_operation_statistic_dataset(*this) + matrix_operation_statistic_dataset(stat)); }
00093 
00094     matrix_operation_statistic_data operator - (const matrix_operation_statistic_data & stat)
00095     { return matrix_operation_statistic_data(matrix_operation_statistic_dataset(*this) - matrix_operation_statistic_dataset(stat)); }
00096 };
00097 
00098 std::ostream & operator << (std::ostream & o, const matrix_operation_statistic_data & statsd)
00099 {
00100     o << "matrix operation statistics" << std::endl;
00101     o << "block multiplication calls                     : "
00102             << statsd.block_multiplication_calls << std::endl;
00103     o << "block multiplications saved through zero blocks: "
00104             << statsd.block_multiplications_saved_through_zero << std::endl;
00105     o << "block multiplications performed                : "
00106             << statsd.block_multiplication_calls - statsd.block_multiplications_saved_through_zero << std::endl;
00107     o << "block addition calls                           : "
00108             << statsd.block_addition_calls << std::endl;
00109     o << "block additions saved through zero blocks      : "
00110             << statsd.block_additions_saved_through_zero << std::endl;
00111     o << "block additions performed                      : "
00112             << statsd.block_addition_calls - statsd.block_additions_saved_through_zero << std::endl;
00113     return o;
00114 }
00115 
00116 //! \brief A static_quadtree holds 4^Level elements arranged in a quad tree.
00117 //!
00118 //! Static quad trees are useful for recursive algorithms with fixed depth
00119 //!   that partition the in- and output and perform pre- and postcalculations on the partitions.
00120 //! The four children of one node are denoted as ul (up left), ur (up right), dl (down left), and dr (down right).
00121 template <typename ValueType, unsigned Level>
00122 struct static_quadtree
00123 {
00124     typedef static_quadtree<ValueType, Level - 1> smaller_static_quadtree;
00125 
00126     smaller_static_quadtree ul, ur, dl, dr;
00127 
00128     static_quadtree(smaller_static_quadtree ul, smaller_static_quadtree ur,
00129             smaller_static_quadtree dl, smaller_static_quadtree dr)
00130         : ul(ul), ur(ur), dl(dl), dr(dr) {}
00131 
00132     static_quadtree() {}
00133 
00134     static_quadtree & operator &= (const static_quadtree & right)
00135     {
00136         ul &= right.ul; ur &= right.ur; dl &= right.dl; dr &= right.dr;
00137         return *this;
00138     }
00139 
00140     static_quadtree & operator += (const static_quadtree & right)
00141     {
00142         ul += right.ul; ur += right.ur; dl += right.dl; dr += right.dr;
00143         return *this;
00144     }
00145 
00146     static_quadtree & operator -= (const static_quadtree & right)
00147     {
00148         ul -= right.ul; ur -= right.ur; dl -= right.dl; dr -= right.dr;
00149         return *this;
00150     }
00151 
00152     static_quadtree operator & (const static_quadtree & right) const
00153     { return static_quadtree(ul & right.ul, ur & right.ur, dl & right.dl, dr & right.dr); }
00154 
00155     static_quadtree operator + (const static_quadtree & right) const
00156     { return static_quadtree(ul + right.ul, ur + right.ur, dl + right.dl, dr + right.dr); }
00157 
00158     static_quadtree operator - (const static_quadtree & right) const
00159     { return static_quadtree(ul - right.ul, ur - right.ur, dl - right.dl, dr - right.dr); }
00160 };
00161 
00162 template <typename ValueType>
00163 struct static_quadtree<ValueType, 0>
00164 {
00165     ValueType val;
00166 
00167     static_quadtree(const ValueType & v)
00168         : val(v) {}
00169 
00170     static_quadtree() {}
00171 
00172     operator const ValueType & () const
00173     { return val; }
00174 
00175     operator ValueType & ()
00176     { return val; }
00177 
00178     static_quadtree & operator &= (const static_quadtree & right)
00179     {
00180         val &= right.val;
00181         return *this;
00182     }
00183 
00184     static_quadtree & operator += (const static_quadtree & right)
00185     {
00186         val += right.val;
00187         return *this;
00188     }
00189 
00190     static_quadtree & operator -= (const static_quadtree & right)
00191     {
00192         val -= right.val;
00193         return *this;
00194     }
00195 
00196     static_quadtree operator ! () const
00197     { return static_quadtree(! val); }
00198 
00199     static_quadtree operator & (const static_quadtree & right) const
00200     { return val & right.val; }
00201 
00202     static_quadtree operator + (const static_quadtree & right) const
00203     { return val + right.val; }
00204 
00205     static_quadtree operator - (const static_quadtree & right) const
00206     { return val - right.val; }
00207 };
00208 
00209 template <typename ValueType, unsigned BlockSideLength, unsigned Level, bool AExists, bool BExists>
00210 struct feedable_strassen_winograd
00211 {
00212     typedef static_quadtree<bool, Level> zbt;     // true <=> is a zero-block
00213     typedef static_quadtree<ValueType, Level> vt;
00214 
00215     typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, AExists, BExists> smaller_feedable_strassen_winograd_ab;
00216     typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, AExists, false> smaller_feedable_strassen_winograd_a;
00217     typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, false, BExists> smaller_feedable_strassen_winograd_b;
00218     typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, false, false> smaller_feedable_strassen_winograd_n;
00219 
00220     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00221     typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type;
00222     typedef typename block_scheduler_type::internal_block_type internal_block_type;
00223     typedef typename swappable_block_matrix_type::size_type size_type;
00224 
00225     const size_type n, m, l;
00226     smaller_feedable_strassen_winograd_ab p1, p2;
00227     smaller_feedable_strassen_winograd_n  p3, p4, p5;
00228     smaller_feedable_strassen_winograd_b  p6;
00229     smaller_feedable_strassen_winograd_a  p7;
00230 
00231     feedable_strassen_winograd(
00232             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00233             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00234             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00235         : n(n), m(m), l(l),
00236           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
00237           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
00238           p3(                                                bs_c, n/2, m/2, l/2),
00239           p4(                                                bs_c, n/2, m/2, l/2),
00240           p5(                                                bs_c, n/2, m/2, l/2),
00241           p6(                                                bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
00242           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
00243 
00244     feedable_strassen_winograd(
00245             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00246             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00247         : n(n), m(m), l(l),
00248           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2),
00249           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2),
00250           p3(                                                bs_c, n/2, m/2, l/2),
00251           p4(                                                bs_c, n/2, m/2, l/2),
00252           p5(                                                bs_c, n/2, m/2, l/2),
00253           p6(                                                bs_c, n/2, m/2, l/2),
00254           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
00255 
00256     feedable_strassen_winograd(
00257             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00258             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00259         : n(n), m(m), l(l),
00260           p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
00261           p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
00262           p3(bs_c, n/2, m/2, l/2),
00263           p4(bs_c, n/2, m/2, l/2),
00264           p5(bs_c, n/2, m/2, l/2),
00265           p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
00266           p7(bs_c, n/2, m/2, l/2) {}
00267 
00268     feedable_strassen_winograd(
00269             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00270         : n(n), m(m), l(l),
00271           p1(bs_c, n/2, m/2, l/2),
00272           p2(bs_c, n/2, m/2, l/2),
00273           p3(bs_c, n/2, m/2, l/2),
00274           p4(bs_c, n/2, m/2, l/2),
00275           p5(bs_c, n/2, m/2, l/2),
00276           p6(bs_c, n/2, m/2, l/2),
00277           p7(bs_c, n/2, m/2, l/2) {}
00278 
00279     void begin_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00280     {
00281         typename zbt::smaller_static_quadtree s1 = zb.dl & zb.dr,
00282                                               s2 = s1    & zb.ul,
00283                                               s3 = zb.ul & zb.dl,
00284                                               s4 = zb.ur & s2;
00285         p1.begin_feeding_a_block(block_row, block_col, zb.ul);
00286         p2.begin_feeding_a_block(block_row, block_col, zb.ur);
00287         p3.begin_feeding_a_block(block_row, block_col, s1);
00288         p4.begin_feeding_a_block(block_row, block_col, s2);
00289         p5.begin_feeding_a_block(block_row, block_col, s3);
00290         p6.begin_feeding_a_block(block_row, block_col, s4);
00291         p7.begin_feeding_a_block(block_row, block_col, zb.dr);
00292     }
00293 
00294     void feed_a_element(const int_type element_num, const vt v)
00295     {
00296         typename vt::smaller_static_quadtree s1 = v.dl + v.dr,
00297                                              s2 = s1   - v.ul,
00298                                              s3 = v.ul - v.dl,
00299                                              s4 = v.ur - s2;
00300         p1.feed_a_element(element_num, v.ul);
00301         p2.feed_a_element(element_num, v.ur);
00302         p3.feed_a_element(element_num, s1);
00303         p4.feed_a_element(element_num, s2);
00304         p5.feed_a_element(element_num, s3);
00305         p6.feed_a_element(element_num, s4);
00306         p7.feed_a_element(element_num, v.dr);
00307     }
00308 
00309     void end_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00310     {
00311         typename zbt::smaller_static_quadtree s1 = zb.dl & zb.dr,
00312                                               s2 = s1    & zb.ul,
00313                                               s3 = zb.ul & zb.dl,
00314                                               s4 = zb.ur & s2;
00315         p1.end_feeding_a_block(block_row, block_col, zb.ul);
00316         p2.end_feeding_a_block(block_row, block_col, zb.ur);
00317         p3.end_feeding_a_block(block_row, block_col, s1);
00318         p4.end_feeding_a_block(block_row, block_col, s2);
00319         p5.end_feeding_a_block(block_row, block_col, s3);
00320         p6.end_feeding_a_block(block_row, block_col, s4);
00321         p7.end_feeding_a_block(block_row, block_col, zb.dr);
00322     }
00323 
00324     void begin_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00325     {
00326         typename zbt::smaller_static_quadtree t1 = zb.ur & zb.ul,
00327                                               t2 = zb.dr & t1,
00328                                               t3 = zb.dr & zb.ur,
00329                                               t4 = zb.dl & t2;
00330         p1.begin_feeding_b_block(block_row, block_col, zb.ul);
00331         p2.begin_feeding_b_block(block_row, block_col, zb.dl);
00332         p3.begin_feeding_b_block(block_row, block_col, t1);
00333         p4.begin_feeding_b_block(block_row, block_col, t2);
00334         p5.begin_feeding_b_block(block_row, block_col, t3);
00335         p6.begin_feeding_b_block(block_row, block_col, zb.dr);
00336         p7.begin_feeding_b_block(block_row, block_col, t4);
00337     }
00338 
00339     void feed_b_element(const int_type element_num, const vt v)
00340     {
00341         typename vt::smaller_static_quadtree t1 = v.ur - v.ul,
00342                                              t2 = v.dr - t1,
00343                                              t3 = v.dr - v.ur,
00344                                              t4 = v.dl - t2;
00345         p1.feed_b_element(element_num, v.ul);
00346         p2.feed_b_element(element_num, v.dl);
00347         p3.feed_b_element(element_num, t1);
00348         p4.feed_b_element(element_num, t2);
00349         p5.feed_b_element(element_num, t3);
00350         p6.feed_b_element(element_num, v.dr);
00351         p7.feed_b_element(element_num, t4);
00352     }
00353 
00354     void end_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00355     {
00356         typename zbt::smaller_static_quadtree t1 = zb.ur & zb.ul,
00357                                               t2 = zb.dr & t1,
00358                                               t3 = zb.dr & zb.ur,
00359                                               t4 = zb.dl & t2;
00360         p1.end_feeding_b_block(block_row, block_col, zb.ul);
00361         p2.end_feeding_b_block(block_row, block_col, zb.dl);
00362         p3.end_feeding_b_block(block_row, block_col, t1);
00363         p4.end_feeding_b_block(block_row, block_col, t2);
00364         p5.end_feeding_b_block(block_row, block_col, t3);
00365         p6.end_feeding_b_block(block_row, block_col, zb.dr);
00366         p7.end_feeding_b_block(block_row, block_col, t4);
00367     }
00368 
00369     void multiply()
00370     {
00371         p1.multiply();
00372         p2.multiply();
00373         p3.multiply();
00374         p4.multiply();
00375         p5.multiply();
00376         p6.multiply();
00377         p7.multiply();
00378     }
00379 
00380     zbt begin_reading_block(const size_type & block_row, const size_type & block_col)
00381     {
00382         zbt r;
00383         r.ur = r.ul = p1.begin_reading_block(block_row, block_col);
00384         r.ul &= p2.begin_reading_block(block_row, block_col);
00385         r.ur &= p4.begin_reading_block(block_row, block_col);
00386         r.dr = r.dl = p5.begin_reading_block(block_row, block_col);
00387         r.dl &= r.ur;
00388         r.dl &= p7.begin_reading_block(block_row, block_col);
00389         r.ur &= p3.begin_reading_block(block_row, block_col);
00390         r.dr &= r.ur;
00391         r.ur &= p6.begin_reading_block(block_row, block_col);
00392         return r;
00393     }
00394 
00395     vt read_element(int_type element_num)
00396     {
00397         vt r;
00398         r.ur = r.ul = p1.read_element(element_num);
00399         r.ul += p2.read_element(element_num);
00400         r.ur += p4.read_element(element_num);
00401         r.dr = r.dl = p5.read_element(element_num);
00402         r.dl += r.ur;
00403         r.dl += p7.read_element(element_num);
00404         r.ur += p3.read_element(element_num);
00405         r.dr += r.ur;
00406         r.ur += p6.read_element(element_num);
00407         return r;
00408     }
00409 
00410     zbt end_reading_block(const size_type & block_row, const size_type & block_col)
00411     {
00412         zbt r;
00413         r.ur = r.ul = p1.end_reading_block(block_row, block_col);
00414         r.ul &= p2.end_reading_block(block_row, block_col);
00415         r.ur &= p4.end_reading_block(block_row, block_col);
00416         r.dr = r.dl = p5.end_reading_block(block_row, block_col);
00417         r.dl &= r.ur;
00418         r.dl &= p7.end_reading_block(block_row, block_col);
00419         r.ur &= p3.end_reading_block(block_row, block_col);
00420         r.dr &= r.ur;
00421         r.ur &= p6.end_reading_block(block_row, block_col);
00422         return r;
00423     }
00424 };
00425 
00426 template <typename ValueType, unsigned BlockSideLength, bool AExists, bool BExists>
00427 struct feedable_strassen_winograd<ValueType, BlockSideLength, 0, AExists, BExists>
00428 {
00429     typedef static_quadtree<bool, 0> zbt;     // true <=> is a zero-block
00430     typedef static_quadtree<ValueType, 0> vt;
00431 
00432     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00433     typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type;
00434     typedef typename block_scheduler_type::internal_block_type internal_block_type;
00435     typedef typename swappable_block_matrix_type::size_type size_type;
00436 
00437     swappable_block_matrix_type a, b, c;
00438     const size_type n, m, l;
00439     internal_block_type * iblock;
00440 
00441     feedable_strassen_winograd(
00442             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00443             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00444             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00445         : a(existing_a, n, l, a_from_row, a_from_col),
00446           b(existing_b, n, l, b_from_row, b_from_col),
00447           c(bs_c, n, m),
00448           n(n), m(m), l(l),
00449           iblock(0) {}
00450 
00451     feedable_strassen_winograd(
00452             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00453             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00454         : a(existing_a, n, l, a_from_row, a_from_col),
00455           b(bs_c, n, l),
00456           c(bs_c, n, m),
00457           n(n), m(m), l(l),
00458           iblock(0) {}
00459 
00460     feedable_strassen_winograd(
00461             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00462             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00463         : a(bs_c, n, l),
00464           b(existing_b, n, l, b_from_row, b_from_col),
00465           c(bs_c, n, m),
00466           n(n), m(m), l(l),
00467           iblock(0) {}
00468 
00469     feedable_strassen_winograd(
00470             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00471         : a(bs_c, n, l),
00472           b(bs_c, n, l),
00473           c(bs_c, n, m),
00474           n(n), m(m), l(l),
00475           iblock(0) {}
00476 
00477     void begin_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt)
00478     {
00479         if (! AExists)
00480             iblock = & a.bs.acquire(a(block_row, block_col), true);
00481     }
00482 
00483     void feed_a_element(const int_type element_num, const vt v)
00484     {
00485         if (! AExists)
00486             (*iblock)[element_num] = v;
00487     }
00488 
00489     void end_feeding_a_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00490     {
00491         if (! AExists)
00492         {
00493             a.bs.release(a(block_row, block_col), ! zb);
00494             iblock = 0;
00495         }
00496     }
00497 
00498     void begin_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt)
00499     {
00500         if (! BExists)
00501             iblock = & b.bs.acquire(b(block_row, block_col), true);
00502     }
00503 
00504     void feed_b_element(const int_type element_num, const vt v)
00505     {
00506         if (! BExists)
00507             (*iblock)[element_num] = v;
00508     }
00509 
00510     void end_feeding_b_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00511     {
00512         if (! BExists)
00513         {
00514             b.bs.release(b(block_row, block_col), ! zb);
00515             iblock = 0;
00516         }
00517     }
00518 
00519     void multiply()
00520     { matrix_operations<ValueType, BlockSideLength>::choose_level_for_feedable_sw(a, b, c); }
00521 
00522     zbt begin_reading_block(const size_type & block_row, const size_type & block_col)
00523     {
00524         bool zb = ! c.bs.is_initialized(c(block_row, block_col));
00525         iblock = & c.bs.acquire(c(block_row, block_col));
00526         return zb;
00527     }
00528 
00529     vt read_element(const int_type element_num)
00530     { return (*iblock)[element_num]; }
00531 
00532     zbt end_reading_block(const size_type & block_row, const size_type & block_col)
00533     {
00534         c.bs.release(c(block_row, block_col), false);
00535         iblock = 0;
00536         return ! c.bs.is_initialized(c(block_row, block_col));
00537     }
00538 };
00539 
00540 template <typename ValueType, unsigned BlockSideLength, unsigned Level>
00541 struct matrix_to_quadtree
00542 {
00543     typedef static_quadtree<bool, Level> zbt;     // true <=> is a zero-block
00544     typedef static_quadtree<ValueType, Level> vt;
00545 
00546     typedef matrix_to_quadtree<ValueType, BlockSideLength, Level - 1> smaller_matrix_to_quadtree;
00547 
00548     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00549     typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type;
00550     typedef typename block_scheduler_type::internal_block_type internal_block_type;
00551     typedef typename swappable_block_matrix_type::size_type size_type;
00552 
00553     smaller_matrix_to_quadtree ul, ur, dl, dr;
00554 
00555     matrix_to_quadtree(const swappable_block_matrix_type & matrix)
00556         : ul(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0,                    0),
00557           ur(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0, matrix.get_width()/2),
00558           dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2,                    0),
00559           dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
00560     { assert(! (matrix.get_height() % 2 | matrix.get_width() % 2)); }
00561 
00562     matrix_to_quadtree(const swappable_block_matrix_type & matrix,
00563             const size_type height, const size_type width, const size_type from_row, const size_type from_col)
00564         : ul(matrix, height/2, width/2, from_row,            from_col),
00565           ur(matrix, height/2, width/2, from_row,            from_col + width/2),
00566           dl(matrix, height/2, width/2, from_row + height/2, from_col),
00567           dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
00568     { assert(! (height % 2 | width % 2)); }
00569 
00570     void begin_feeding_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00571     {
00572         ul.begin_feeding_block(block_row, block_col, zb.ul);
00573         ur.begin_feeding_block(block_row, block_col, zb.ur);
00574         dl.begin_feeding_block(block_row, block_col, zb.dl);
00575         dr.begin_feeding_block(block_row, block_col, zb.dr);
00576     }
00577 
00578     void feed_element(const int_type element_num, const vt v)
00579     {
00580         ul.feed_element(element_num, v.ul);
00581         ur.feed_element(element_num, v.ur);
00582         dl.feed_element(element_num, v.dl);
00583         dr.feed_element(element_num, v.dr);
00584     }
00585 
00586     void feed_and_add_element(const int_type element_num, const vt v)
00587     {
00588         ul.feed_and_add_element(element_num, v.ul);
00589         ur.feed_and_add_element(element_num, v.ur);
00590         dl.feed_and_add_element(element_num, v.dl);
00591         dr.feed_and_add_element(element_num, v.dr);
00592     }
00593 
00594     void end_feeding_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00595     {
00596         ul.end_feeding_block(block_row, block_col, zb.ul);
00597         ur.end_feeding_block(block_row, block_col, zb.ur);
00598         dl.end_feeding_block(block_row, block_col, zb.dl);
00599         dr.end_feeding_block(block_row, block_col, zb.dr);
00600     }
00601 
00602     zbt begin_reading_block(const size_type & block_row, const size_type & block_col)
00603     {
00604         zbt zb;
00605         zb.ul = ul.begin_reading_block(block_row, block_col);
00606         zb.ur = ur.begin_reading_block(block_row, block_col);
00607         zb.dl = dl.begin_reading_block(block_row, block_col);
00608         zb.dr = dr.begin_reading_block(block_row, block_col);
00609         return zb;
00610     }
00611 
00612     vt read_element(const int_type element_num)
00613     {
00614         vt v;
00615         v.ul = ul.read_element(element_num);
00616         v.ur = ur.read_element(element_num);
00617         v.dl = dl.read_element(element_num);
00618         v.dr = dr.read_element(element_num);
00619         return v;
00620     }
00621 
00622     zbt end_reading_block(const size_type & block_row, const size_type & block_col)
00623     {
00624         zbt zb;
00625         zb.ul = ul.end_reading_block(block_row, block_col);
00626         zb.ur = ur.end_reading_block(block_row, block_col);
00627         zb.dl = dl.end_reading_block(block_row, block_col);
00628         zb.dr = dr.end_reading_block(block_row, block_col);
00629         return zb;
00630     }
00631 
00632     const size_type & get_height_in_blocks()
00633     { return ul.get_height_in_blocks(); }
00634 
00635     const size_type & get_width_in_blocks()
00636     { return ul.get_width_in_blocks(); }
00637 };
00638 
00639 template <typename ValueType, unsigned BlockSideLength>
00640 struct matrix_to_quadtree<ValueType, BlockSideLength, 0>
00641 {
00642     typedef static_quadtree<bool, 0> zbt;     // true <=> is a zero-block
00643     typedef static_quadtree<ValueType, 0> vt;
00644 
00645     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00646     typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type;
00647     typedef typename block_scheduler_type::internal_block_type internal_block_type;
00648     typedef typename swappable_block_matrix_type::size_type size_type;
00649 
00650     swappable_block_matrix_type m;
00651     internal_block_type * iblock;
00652 
00653     matrix_to_quadtree(const swappable_block_matrix_type & matrix)
00654         : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0),
00655           iblock(0) {}
00656 
00657     matrix_to_quadtree(const swappable_block_matrix_type & matrix,
00658             const size_type height, const size_type width, const size_type from_row, const size_type from_col)
00659         : m(matrix, height, width, from_row, from_col),
00660           iblock(0) {}
00661 
00662     void begin_feeding_block(const size_type & block_row, const size_type & block_col, const zbt)
00663     { iblock = & m.bs.acquire(m(block_row, block_col)); }
00664 
00665     void feed_element(const int_type element_num, const vt v)
00666     { (*iblock)[element_num] = v; }
00667 
00668     void feed_and_add_element(const int_type element_num, const vt v)
00669     { (*iblock)[element_num] += v; }
00670 
00671     void end_feeding_block(const size_type & block_row, const size_type & block_col, const zbt zb)
00672     {
00673         m.bs.release(m(block_row, block_col), ! zb);
00674         iblock = 0;
00675     }
00676 
00677     zbt begin_reading_block(const size_type & block_row, const size_type & block_col)
00678     {
00679         zbt zb = ! m.bs.is_initialized(m(block_row, block_col));
00680         iblock = & m.bs.acquire(m(block_row, block_col));
00681         return zb;
00682     }
00683 
00684     vt read_element(const int_type element_num)
00685     { return (*iblock)[element_num]; }
00686 
00687     zbt end_reading_block(const size_type & block_row, const size_type & block_col)
00688     {
00689         m.bs.release(m(block_row, block_col), false);
00690         iblock = 0;
00691         return  ! m.bs.is_initialized(m(block_row, block_col));
00692     }
00693 
00694     const size_type & get_height_in_blocks()
00695     { return m.get_height(); }
00696 
00697     const size_type & get_width_in_blocks()
00698     { return m.get_width(); }
00699 };
00700 
00701 template <typename ValueType, unsigned BlockSideLength, unsigned Level, bool AExists, bool BExists>
00702 struct feedable_strassen_winograd_block_grained
00703 {
00704     typedef static_quadtree<bool, Level> zbt;     // true <=> is a zero-block
00705     typedef static_quadtree<ValueType, Level> vt;
00706 
00707     typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, AExists, BExists> smaller_feedable_strassen_winograd_ab;
00708     typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, AExists, false> smaller_feedable_strassen_winograd_a;
00709     typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, false, BExists> smaller_feedable_strassen_winograd_b;
00710     typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, false, false> smaller_feedable_strassen_winograd_n;
00711 
00712     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00713     typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type;
00714     typedef typename block_scheduler_type::internal_block_type internal_block_type;
00715     typedef typename swappable_block_matrix_type::size_type size_type;
00716     typedef matrix_operations<ValueType, BlockSideLength> Ops;
00717 
00718     const size_type n, m, l;
00719     smaller_feedable_strassen_winograd_ab p1, p2;
00720     smaller_feedable_strassen_winograd_n  p3, p4, p5;
00721     smaller_feedable_strassen_winograd_b  p6;
00722     smaller_feedable_strassen_winograd_a  p7;
00723 
00724     inline feedable_strassen_winograd_block_grained(
00725             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00726             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00727             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00728         : n(n), m(m), l(l),
00729           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
00730           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
00731           p3(                                                bs_c, n/2, m/2, l/2),
00732           p4(                                                bs_c, n/2, m/2, l/2),
00733           p5(                                                bs_c, n/2, m/2, l/2),
00734           p6(                                                bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
00735           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
00736 
00737     inline feedable_strassen_winograd_block_grained(
00738             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00739             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00740         : n(n), m(m), l(l),
00741           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2),
00742           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2),
00743           p3(                                                bs_c, n/2, m/2, l/2),
00744           p4(                                                bs_c, n/2, m/2, l/2),
00745           p5(                                                bs_c, n/2, m/2, l/2),
00746           p6(                                                bs_c, n/2, m/2, l/2),
00747           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
00748 
00749     inline feedable_strassen_winograd_block_grained(
00750             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00751             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00752         : n(n), m(m), l(l),
00753           p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
00754           p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
00755           p3(bs_c, n/2, m/2, l/2),
00756           p4(bs_c, n/2, m/2, l/2),
00757           p5(bs_c, n/2, m/2, l/2),
00758           p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
00759           p7(bs_c, n/2, m/2, l/2) {}
00760 
00761     inline feedable_strassen_winograd_block_grained(
00762             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00763         : n(n), m(m), l(l),
00764           p1(bs_c, n/2, m/2, l/2),
00765           p2(bs_c, n/2, m/2, l/2),
00766           p3(bs_c, n/2, m/2, l/2),
00767           p4(bs_c, n/2, m/2, l/2),
00768           p5(bs_c, n/2, m/2, l/2),
00769           p6(bs_c, n/2, m/2, l/2),
00770           p7(bs_c, n/2, m/2, l/2) {}
00771 
00772     inline void feed_a(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl)
00773     {
00774         // partition bl
00775         typename Ops::swappable_block_matrix_quarterer qbl(bl);
00776         // preadditions
00777         swappable_block_matrix_type s1(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
00778                                     s2(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
00779                                     s3(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
00780                                     s4(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed());
00781         Ops::strassen_winograd_preaddition_a(qbl.ul, qbl.ur, qbl.dl, qbl.dr, s1, s2, s3, s4);
00782         // feed recursive
00783         p1.feed_a(row, col, qbl.ul);
00784         p2.feed_a(row, col, qbl.ur);
00785         p3.feed_a(row, col, s1);
00786         p4.feed_a(row, col, s2);
00787         p5.feed_a(row, col, s3);
00788         p6.feed_a(row, col, s4);
00789         p7.feed_a(row, col, qbl.dr);
00790     }
00791 
00792     inline void feed_b(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl)
00793     {
00794         // partition bl
00795         typename Ops::swappable_block_matrix_quarterer qbl(bl);
00796         // preadditions
00797         swappable_block_matrix_type t1(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
00798                                     t2(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
00799                                     t3(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
00800                                     t4(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed());
00801         Ops::strassen_winograd_preaddition_b(qbl.ul, qbl.ur, qbl.dl, qbl.dr, t1, t2, t3, t4);
00802         // feed recursive
00803         p1.feed_b(row, col, qbl.ul);
00804         p2.feed_b(row, col, qbl.dl);
00805         p3.feed_b(row, col, t1);
00806         p4.feed_b(row, col, t2);
00807         p5.feed_b(row, col, t3);
00808         p6.feed_b(row, col, qbl.dr);
00809         p7.feed_b(row, col, t4);
00810     }
00811 
00812     inline void multiply()
00813     {
00814         p1.multiply();
00815         p2.multiply();
00816         p3.multiply();
00817         p4.multiply();
00818         p5.multiply();
00819         p6.multiply();
00820         p7.multiply();
00821     }
00822 
00823     inline void read_and_add(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl)
00824     {
00825         // partition bl
00826         typename Ops::swappable_block_matrix_quarterer qbl(bl);
00827         // postadditions
00828         swappable_block_matrix_type px(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed());
00829         p2.read_and_add(row, col, qbl.ul);
00830         p1.read_and_add(row, col, px);
00831         Ops::element_op(qbl.ul, px, typename Ops::addition());
00832         p4.read_and_add(row, col, px);
00833         Ops::element_op(qbl.ur, px, typename Ops::addition());
00834         p5.read_and_add(row, col, px);
00835         Ops::element_op_twice_nontransposed(qbl.dl, qbl.dr, px, typename Ops::addition());
00836         px.set_zero();
00837         p7.read_and_add(row, col, qbl.dl);
00838         p3.read_and_add(row, col, px);
00839         Ops::element_op_twice_nontransposed(qbl.dr, qbl.ur, px, typename Ops::addition());
00840         p6.read_and_add(row, col, qbl.ur);
00841     }
00842 
00843     inline static unsigned_type get_num_temp_grains()
00844     { return smaller_feedable_strassen_winograd_ab::get_num_temp_grains() + (4 ^ Level) * 2; }
00845 };
00846 
00847 template <typename ValueType, unsigned BlockSideLength, bool AExists, bool BExists>
00848 struct feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, 0, AExists, BExists>
00849 {
00850     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00851     typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type;
00852     typedef typename swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type;
00853     typedef typename swappable_block_matrix_type::size_type size_type;
00854     typedef matrix_operations<ValueType, BlockSideLength> Ops;
00855 
00856     typedef static_quadtree<swappable_block_identifier_type, 0> bt;
00857 
00858     swappable_block_matrix_type a, b, c;
00859 
00860     inline feedable_strassen_winograd_block_grained(
00861             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00862             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00863             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00864         : a(existing_a, n, l, a_from_row, a_from_col),
00865           b(existing_b, n, l, b_from_row, b_from_col),
00866           c(bs_c, n, m) {}
00867 
00868     inline feedable_strassen_winograd_block_grained(
00869             const swappable_block_matrix_type & existing_a, const size_type a_from_row, const size_type a_from_col,
00870             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00871         : a(existing_a, n, l, a_from_row, a_from_col),
00872           b(bs_c, n, l),
00873           c(bs_c, n, m) {}
00874 
00875     inline feedable_strassen_winograd_block_grained(
00876             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l,
00877             const swappable_block_matrix_type & existing_b, const size_type b_from_row, const size_type b_from_col)
00878         : a(bs_c, n, l),
00879           b(existing_b, n, l, b_from_row, b_from_col),
00880           c(bs_c, n, m) {}
00881 
00882     inline feedable_strassen_winograd_block_grained(
00883             block_scheduler_type & bs_c, const size_type n, const size_type m, const size_type l)
00884         : a(bs_c, n, l),
00885           b(bs_c, n, l),
00886           c(bs_c, n, m) {}
00887 
00888     inline void feed_a(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl)
00889     {
00890         if (! AExists)
00891         {
00892             // copy bl to a from (row, col) (assuming a from (row, col) == 0)
00893             swappable_block_matrix_type at(a, bl.get_height(), bl.get_width(), row, col);
00894             Ops::element_op(at, bl, typename Ops::addition());
00895         }
00896     }
00897 
00898     inline void feed_b(const size_type & row, const size_type & col, const swappable_block_matrix_type & bl)
00899     {
00900         if (! BExists)
00901         {
00902             // copy bl(0,0) to b(row, col) (assuming b from (row, col) == 0)
00903             swappable_block_matrix_type bt(b, bl.get_height(), bl.get_width(), row, col);
00904             Ops::element_op(bt, bl, typename Ops::addition());
00905         }
00906     }
00907 
00908     inline void multiply()
00909     {
00910         matrix_operations<ValueType, BlockSideLength>::
00911                 multi_level_strassen_winograd_multiply_and_add_block_grained(a, b, c);
00912         if (! AExists)
00913             a.set_zero();
00914         if (! BExists)
00915             b.set_zero();
00916     }
00917 
00918 
00919     inline void read_and_add(const size_type & row, const size_type & col, swappable_block_matrix_type & bl)
00920     {
00921         // add c from (row, col) to bl
00922         swappable_block_matrix_type ct(c, bl.get_height(), bl.get_width(), row, col);
00923         Ops::element_op(bl, ct, typename Ops::addition());
00924         ct.set_zero();
00925     }
00926 
00927     inline static unsigned_type get_num_temp_grains()
00928     { return 0; }
00929 };
00930 
00931 template <typename ValueType, unsigned BlockSideLength, unsigned Level, unsigned Granularity>
00932 struct matrix_to_quadtree_block_grained
00933 {
00934     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00935     typedef typename swappable_block_matrix_type::size_type size_type;
00936 
00937     typedef matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level - 1, Granularity> smaller_matrix_to_quadtree_block_grained;
00938 
00939     smaller_matrix_to_quadtree_block_grained ul, ur, dl, dr;
00940 
00941     inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix)
00942         : ul(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0,                    0),
00943           ur(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0, matrix.get_width()/2),
00944           dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2,                    0),
00945           dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
00946     { assert(! (matrix.get_height() % 2 | matrix.get_width() % 2)); }
00947 
00948     inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix,
00949             const size_type height, const size_type width, const size_type from_row, const size_type from_col)
00950         : ul(matrix, height/2, width/2, from_row,            from_col),
00951           ur(matrix, height/2, width/2, from_row,            from_col + width/2),
00952           dl(matrix, height/2, width/2, from_row + height/2, from_col),
00953           dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
00954     { assert(! (height % 2 | width % 2)); }
00955 
00956     inline swappable_block_matrix_type operator () (const size_type & row, const size_type & col)
00957     {
00958         return swappable_block_matrix_type(ul(row, col), ur(row, col), dl(row, col), dr(row, col));
00959     }
00960 
00961     inline const size_type get_height()
00962     { return ul.get_height(); }
00963 
00964     inline const size_type get_width()
00965     { return ul.get_width(); }
00966 };
00967 
00968 template <typename ValueType, unsigned BlockSideLength, unsigned Granularity>
00969 struct matrix_to_quadtree_block_grained<ValueType, BlockSideLength, 0, Granularity>
00970 {
00971     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
00972     typedef typename swappable_block_matrix_type::size_type size_type;
00973 
00974     swappable_block_matrix_type m;
00975 
00976     inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix)
00977         : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0)
00978     { assert(! (matrix.get_height() % Granularity | matrix.get_width() % Granularity)); }
00979 
00980     inline matrix_to_quadtree_block_grained(const swappable_block_matrix_type & matrix,
00981             const size_type height, const size_type width, const size_type from_row, const size_type from_col)
00982         : m(matrix, height, width, from_row, from_col)
00983     { assert(! (matrix.get_height() % Granularity | matrix.get_width() % Granularity)); }
00984 
00985     inline swappable_block_matrix_type operator () (const size_type & row, const size_type & col)
00986     {
00987         return swappable_block_matrix_type(m, Granularity, Granularity, row * Granularity, col * Granularity);
00988     }
00989 
00990     inline const size_type get_height()
00991     { return m.get_height() / Granularity; }
00992 
00993     inline const size_type get_width()
00994     { return m.get_width() / Granularity; }
00995 };
00996 
00997 template <typename ValueType, unsigned BlockSideLength>
00998 struct matrix_operations
00999 {
01000     // tuning-parameter: Only matrices larger than this (in blocks) are processed by Strassen-Winograd.
01001     // you have to adapt choose_level_for_feedable_sw, too
01002     static const int_type strassen_winograd_base_case_size;
01003 
01004     typedef swappable_block_matrix<ValueType, BlockSideLength> swappable_block_matrix_type;
01005     typedef typename swappable_block_matrix_type::block_scheduler_type block_scheduler_type;
01006     typedef typename swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type;
01007     typedef typename block_scheduler_type::internal_block_type internal_block_type;
01008     typedef typename swappable_block_matrix_type::size_type size_type;
01009     typedef column_vector<ValueType> column_vector_type;
01010     typedef row_vector<ValueType> row_vector_type;
01011     typedef typename column_vector_type::size_type vector_size_type;
01012 
01013     // +-+-+-+ addition +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
01014 
01015     struct addition
01016     {
01017         /* op(c,a,b) means c = a <op> b  e.g. assign sum
01018          * op(c,a)   means c <op>= a     e.g. add up
01019          * op(a)     means <op>a         e.g. sign
01020          *
01021          * it should hold:
01022          * op(c,0,0) equivalent c = 0
01023          * op(c=0,a) equivalent c = op(a)
01024          * op(c,0)   equivalent {}
01025          */
01026 
01027         inline ValueType & operator ()  (ValueType & c, const ValueType & a, const ValueType & b) { return c = a + b; }
01028         inline ValueType & operator ()  (ValueType & c, const ValueType & a)                      { return     c += a; }
01029         inline ValueType operator ()                   (const ValueType & a)                      { return       +a; }
01030     };
01031 
01032     struct subtraction
01033     {
01034         inline ValueType & operator ()  (ValueType & c, const ValueType & a, const ValueType & b) { return c = a - b; }
01035         inline ValueType & operator ()  (ValueType & c, const ValueType & a)                      { return     c -= a; }
01036         inline ValueType operator ()                   (const ValueType & a)                      { return       -a; }
01037     };
01038 
01039     struct scalar_multiplication
01040     {
01041         inline scalar_multiplication(const ValueType scalar = 1) : s(scalar) {}
01042         inline ValueType & operator ()  (ValueType & c, const ValueType & a)                      { return c = a * s; }
01043         inline ValueType operator ()                   (const ValueType & a)                      { return     a * s; }
01044         inline operator const ValueType & ()                                                      { return         s; }
01045         const ValueType s;
01046     };
01047 
01048     // element_op<Op>(C,A,B) calculates C = A <Op> B
01049     template <class Op> static swappable_block_matrix_type &
01050     element_op(swappable_block_matrix_type & C,
01051                const swappable_block_matrix_type & A,
01052                const swappable_block_matrix_type & B, Op op = Op())
01053     {
01054         for (size_type row = 0; row < C.get_height(); ++row)
01055             for (size_type col = 0; col < C.get_width(); ++col)
01056                 element_op_swappable_block(
01057                         C(row, col), C.is_transposed(), C.bs,
01058                         A(row, col), A.is_transposed(), A.bs,
01059                         B(row, col), B.is_transposed(), B.bs, op);
01060         return C;
01061     }
01062 
01063     // element_op<Op>(C,A) calculates C <Op>= A
01064     template <class Op> static swappable_block_matrix_type &
01065     element_op(swappable_block_matrix_type & C,
01066                const swappable_block_matrix_type & A, Op op = Op())
01067     {
01068         for (size_type row = 0; row < C.get_height(); ++row)
01069             for (size_type col = 0; col < C.get_width(); ++col)
01070                 element_op_swappable_block(
01071                         C(row, col), C.is_transposed(), C.bs,
01072                         A(row, col), A.is_transposed(), A.bs, op);
01073         return C;
01074     }
01075 
01076     // element_op<Op>(C) calculates C = <Op>C
01077     template <class Op> static swappable_block_matrix_type &
01078     element_op(swappable_block_matrix_type & C, Op op = Op())
01079     {
01080         for (size_type row = 0; row < C.get_height(); ++row)
01081             for (size_type col = 0; col < C.get_width(); ++col)
01082                 element_op_swappable_block(
01083                         C(row, col), C.bs, op);
01084         return C;
01085     }
01086 
01087     // calculates c = a <Op> b
01088     template <class Op> static void
01089     element_op_swappable_block(
01090             const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type & bs_c,
01091             const swappable_block_identifier_type a, bool a_is_transposed, block_scheduler_type & bs_a,
01092             const swappable_block_identifier_type b, bool b_is_transposed, block_scheduler_type & bs_b, Op op = Op())
01093     {
01094         if (! bs_c.is_simulating())
01095             ++ matrix_operation_statistic::get_instance()->block_addition_calls;
01096         // check if zero-block (== ! initialized)
01097         if (! bs_a.is_initialized(a) && ! bs_b.is_initialized(b))
01098         {
01099             // => a and b are zero -> set c zero
01100             bs_c.deinitialize(c);
01101             if (! bs_c.is_simulating())
01102                 ++ matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
01103             return;
01104         }
01105         a_is_transposed = a_is_transposed != c_is_transposed;
01106         b_is_transposed = b_is_transposed != c_is_transposed;
01107         if (! bs_a.is_initialized(a))
01108         {
01109             // a is zero -> copy b
01110             internal_block_type & ic = bs_c.acquire(c, true),
01111                                 & ib = bs_b.acquire(b);
01112             if (! bs_c.is_simulating())
01113             {
01114                 if (b_is_transposed)
01115                     low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, true, Op>(& ic[0], 0, & ib[0], op);
01116                 else
01117                     low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, false, Op>(& ic[0], 0, & ib[0], op);
01118             }
01119             bs_b.release(b, false);
01120             bs_c.release(c, true);
01121         }
01122         else if (! bs_b.is_initialized(b))
01123         {
01124             // b is zero -> copy a
01125             internal_block_type & ic = bs_c.acquire(c, true),
01126                                 & ia = bs_a.acquire(a);
01127             if (! bs_c.is_simulating())
01128             {
01129                 if (a_is_transposed)
01130                     low_level_matrix_binary_ass_op<ValueType, BlockSideLength, true, false, Op>(& ic[0], & ia[0], 0, op);
01131                 else
01132                     low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, false, Op>(& ic[0], & ia[0], 0, op);
01133             }
01134             bs_a.release(a, false);
01135             bs_c.release(c, true);
01136         }
01137         else
01138         {
01139             internal_block_type & ic = bs_c.acquire(c, true),
01140                                 & ia = bs_a.acquire(a),
01141                                 & ib = bs_b.acquire(b);
01142             if (! bs_c.is_simulating())
01143             {
01144                 if (a_is_transposed)
01145                 {
01146                     if (b_is_transposed)
01147                         low_level_matrix_binary_ass_op<ValueType, BlockSideLength, true, true, Op>(& ic[0], & ia[0], & ib[0], op);
01148                     else
01149                         low_level_matrix_binary_ass_op<ValueType, BlockSideLength, true, false, Op>(& ic[0], & ia[0], & ib[0], op);
01150                 }
01151                 else
01152                 {
01153                     if (b_is_transposed)
01154                         low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, true, Op>(& ic[0], & ia[0], & ib[0], op);
01155                     else
01156                         low_level_matrix_binary_ass_op<ValueType, BlockSideLength, false, false, Op>(& ic[0], & ia[0], & ib[0], op);
01157                 }
01158             }
01159             bs_a.release(a, false);
01160             bs_b.release(b, false);
01161             bs_c.release(c, true);
01162         }
01163     }
01164 
01165     // calculates c <op>= a
01166     template <class Op> static void
01167     element_op_swappable_block(
01168             const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type & bs_c,
01169             const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a, Op op = Op())
01170     {
01171         if (! bs_c.is_simulating())
01172             ++ matrix_operation_statistic::get_instance()->block_addition_calls;
01173         // check if zero-block (== ! initialized)
01174         if (! bs_a.is_initialized(a))
01175         {
01176             // => b is zero => nothing to do
01177             if (! bs_c.is_simulating())
01178                 ++ matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
01179             return;
01180         }
01181         const bool c_is_zero = ! bs_c.is_initialized(c);
01182         // acquire
01183         internal_block_type & ic = bs_c.acquire(c, c_is_zero),
01184                             & ia = bs_a.acquire(a);
01185         // add
01186         if (! bs_c.is_simulating())
01187         {
01188             if (c_is_zero)
01189                 if (c_is_transposed == a_is_transposed)
01190                     low_level_matrix_unary_op<ValueType, BlockSideLength, false, Op>(& ic[0], & ia[0], op);
01191                 else
01192                     low_level_matrix_unary_op<ValueType, BlockSideLength, true, Op>(& ic[0], & ia[0], op);
01193             else
01194                 if (c_is_transposed == a_is_transposed)
01195                     low_level_matrix_unary_ass_op<ValueType, BlockSideLength, false, Op>(& ic[0], & ia[0], op);
01196                 else
01197                     low_level_matrix_unary_ass_op<ValueType, BlockSideLength, true, Op>(& ic[0], & ia[0], op);
01198         }
01199         // release
01200         bs_c.release(c, true);
01201         bs_a.release(a, false);
01202     }
01203 
01204     // calculates c = <op>c
01205     template <class Op> static void
01206     element_op_swappable_block(
01207             const swappable_block_identifier_type c, block_scheduler_type & bs_c, Op op = Op())
01208     {
01209         if (! bs_c.is_simulating())
01210             ++ matrix_operation_statistic::get_instance()->block_addition_calls;
01211         // check if zero-block (== ! initialized)
01212         if (! bs_c.is_initialized(c))
01213         {
01214             // => c is zero => nothing to do
01215             if (! bs_c.is_simulating())
01216                 ++ matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
01217             return;
01218         }
01219         // acquire
01220         internal_block_type & ic = bs_c.acquire(c);
01221         // add
01222         if (! bs_c.is_simulating())
01223             low_level_matrix_unary_op<ValueType, BlockSideLength, false, Op>(& ic[0], & ic[0], op);
01224         // release
01225         bs_c.release(c, true);
01226     }
01227 
01228     // additions for strassen-winograd
01229 
01230     inline static void
01231     strassen_winograd_preaddition_a(swappable_block_matrix_type & a11,
01232                                     swappable_block_matrix_type & a12,
01233                                     swappable_block_matrix_type & a21,
01234                                     swappable_block_matrix_type & a22,
01235                                     swappable_block_matrix_type & s1,
01236                                     swappable_block_matrix_type & s2,
01237                                     swappable_block_matrix_type & s3,
01238                                     swappable_block_matrix_type & s4)
01239     {
01240         for (size_type row = 0; row < a11.get_height(); ++row)
01241             for (size_type col = 0; col < a11.get_width(); ++col)
01242             {
01243                 op_swappable_block_nontransposed(s3, a11, subtraction(), a21, row, col);
01244                 op_swappable_block_nontransposed(s1, a21,    addition(), a22, row, col);
01245                 op_swappable_block_nontransposed(s2,  s1, subtraction(), a11, row, col);
01246                 op_swappable_block_nontransposed(s4, a12, subtraction(),  s2, row, col);
01247             }
01248     }
01249 
01250     inline static void
01251     strassen_winograd_preaddition_b(swappable_block_matrix_type & b11,
01252                                     swappable_block_matrix_type & b12,
01253                                     swappable_block_matrix_type & b21,
01254                                     swappable_block_matrix_type & b22,
01255                                     swappable_block_matrix_type & t1,
01256                                     swappable_block_matrix_type & t2,
01257                                     swappable_block_matrix_type & t3,
01258                                     swappable_block_matrix_type & t4)
01259     {
01260         for (size_type row = 0; row < b11.get_height(); ++row)
01261             for (size_type col = 0; col < b11.get_width(); ++col)
01262             {
01263                 op_swappable_block_nontransposed(t3, b22, subtraction(), b12, row, col);
01264                 op_swappable_block_nontransposed(t1, b12, subtraction(), b11, row, col);
01265                 op_swappable_block_nontransposed(t2, b22, subtraction(),  t1, row, col);
01266                 op_swappable_block_nontransposed(t4, b21, subtraction(),  t2, row, col);
01267             }
01268     }
01269 
01270     inline static void
01271     strassen_winograd_postaddition(swappable_block_matrix_type & c11, // = p2
01272                                    swappable_block_matrix_type & c12, // = p6
01273                                    swappable_block_matrix_type & c21, // = p7
01274                                    swappable_block_matrix_type & c22, // = p4
01275                                    swappable_block_matrix_type & p1,
01276                                    swappable_block_matrix_type & p3,
01277                                    swappable_block_matrix_type & p5)
01278     {
01279         for (size_type row = 0; row < c11.get_height(); ++row)
01280             for (size_type col = 0; col < c11.get_width(); ++col)
01281             {
01282                 op_swappable_block_nontransposed(c11,     addition(),  p1, row, col); // (u1)
01283                 op_swappable_block_nontransposed( p1,     addition(), c22, row, col); // (u2)
01284                 op_swappable_block_nontransposed( p5,     addition(),  p1, row, col); // (u3)
01285                 op_swappable_block_nontransposed(c21,     addition(),  p5, row, col); // (u4)
01286                 op_swappable_block_nontransposed(c22, p5, addition(),  p3, row, col); // (u5)
01287                 op_swappable_block_nontransposed( p1,     addition(),  p3, row, col); // (u6)
01288                 op_swappable_block_nontransposed(c12,     addition(),  p1, row, col); // (u7)
01289             }
01290     }
01291 
01292     // calculates c1 += a; c2 += a
01293     template <class Op> inline static void
01294     element_op_twice_nontransposed(swappable_block_matrix_type & c1,
01295                      swappable_block_matrix_type & c2,
01296                      const swappable_block_matrix_type & a, Op op = Op())
01297     {
01298         for (size_type row = 0; row < a.get_height(); ++row)
01299             for (size_type col = 0; col < a.get_width(); ++col)
01300             {
01301                 element_op_swappable_block(
01302                         c1(row, col), false, c1.bs,
01303                         a(row, col),  false, a.bs, op);
01304                 element_op_swappable_block(
01305                         c2(row, col), false, c2.bs,
01306                         a(row, col),  false, a.bs, op);
01307             }
01308     }
01309 
01310     template <class Op> inline static void
01311     op_swappable_block_nontransposed(swappable_block_matrix_type & c,
01312             swappable_block_matrix_type & a, Op op, swappable_block_matrix_type & b,
01313             size_type & row, size_type & col)
01314     {
01315         element_op_swappable_block(
01316                         c(row, col), false, c.bs,
01317                         a(row, col), false, a.bs,
01318                         b(row, col), false, b.bs, op);
01319     }
01320 
01321     template <class Op> inline static void
01322     op_swappable_block_nontransposed(swappable_block_matrix_type & c, Op op, swappable_block_matrix_type & a,
01323             size_type & row, size_type & col)
01324     {
01325         element_op_swappable_block(
01326                         c(row, col), false, c.bs,
01327                         a(row, col), false, a.bs, op);
01328     }
01329 
01330     // +-+ end addition +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
01331 
01332     // +-+-+-+ matrix multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
01333 
01334     /*  n, m and l denote the three dimensions of a matrix multiplication, according to the following ascii-art diagram:
01335      *
01336      *                 +--m--+
01337      *  +----l-----+   |     |   +--m--+
01338      *  |          |   |     |   |     |
01339      *  n    A     | • l  B  | = n  C  |
01340      *  |          |   |     |   |     |
01341      *  +----------+   |     |   +-----+
01342      *                 +-----+
01343      *
01344      * The index-variables are called i, j, k for dimension
01345      *                                n, m, l .
01346      */
01347 
01348     // requires height and width divisible by 2
01349     struct swappable_block_matrix_quarterer
01350     {
01351         swappable_block_matrix_type  upleft,   upright,
01352                                     downleft, downright,
01353                 & ul, & ur, & dl, & dr;
01354 
01355         swappable_block_matrix_quarterer(const swappable_block_matrix_type & whole)
01356             : upleft   (whole, whole.get_height()/2, whole.get_width()/2,                    0,                   0),
01357               upright  (whole, whole.get_height()/2, whole.get_width()/2,                    0, whole.get_width()/2),
01358               downleft (whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2,                   0),
01359               downright(whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
01360               ul(upleft), ur(upright), dl(downleft), dr(downright)
01361         { assert(! (whole.get_height() % 2 | whole.get_width() % 2)); }
01362     };
01363 
01364     struct swappable_block_matrix_padding_quarterer
01365     {
01366         swappable_block_matrix_type  upleft,   upright,
01367                                     downleft, downright,
01368                 & ul, & ur, & dl, & dr;
01369 
01370         swappable_block_matrix_padding_quarterer(const swappable_block_matrix_type & whole)
01371             : upleft   (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2),                              0,                             0),
01372               upright  (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2),                              0, div_ceil(whole.get_width(),2)),
01373               downleft (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), div_ceil(whole.get_height(),2),                             0),
01374               downright(whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2)),
01375               ul(upleft), ur(upright), dl(downleft), dr(downright) {}
01376     };
01377 
01378     struct swappable_block_matrix_approximative_quarterer
01379     {
01380         swappable_block_matrix_type  upleft,   upright,
01381                                     downleft, downright,
01382                 & ul, & ur, & dl, & dr;
01383 
01384         swappable_block_matrix_approximative_quarterer(const swappable_block_matrix_type & whole)
01385             : upleft   (whole,                      whole.get_height()/2,                     whole.get_width()/2,                    0,                   0),
01386               upright  (whole,                      whole.get_height()/2, whole.get_width() - whole.get_width()/2,                    0, whole.get_width()/2),
01387               downleft (whole, whole.get_height() - whole.get_height()/2,                     whole.get_width()/2, whole.get_height()/2,                   0),
01388               downright(whole, whole.get_height() - whole.get_height()/2, whole.get_width() - whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
01389               ul(upleft), ur(upright), dl(downleft), dr(downright) {}
01390     };
01391 
01392     //! \brief calculates C = A * B + C
01393     // requires fitting dimensions
01394     static swappable_block_matrix_type &
01395     multi_level_strassen_winograd_multiply_and_add_block_grained(const swappable_block_matrix_type & A,
01396                                                    const swappable_block_matrix_type & B,
01397                                                    swappable_block_matrix_type & C)
01398     {
01399         int_type num_levels = log2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height())));
01400         if (num_levels > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
01401         {
01402             if (num_levels > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS)
01403                 num_levels = STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS;
01404             swappable_block_matrix_type padded_a(A, round_up_to_power_of_two(A.get_height(), num_levels),
01405                                                  round_up_to_power_of_two(A.get_width(), num_levels), 0, 0),
01406                                         padded_b(B, round_up_to_power_of_two(B.get_height(), num_levels),
01407                                                  round_up_to_power_of_two(B.get_width(), num_levels), 0, 0),
01408                                         padded_c(C, round_up_to_power_of_two(C.get_height(), num_levels),
01409                                                  round_up_to_power_of_two(C.get_width(), num_levels), 0, 0);
01410             switch (num_levels)
01411             {
01412             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 5 && 5 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
01413             case 5:
01414                 use_feedable_sw_block_grained<5>(padded_a, padded_a, padded_c);
01415                 break;
01416             #endif
01417             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 4 && 4 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
01418             case 4:
01419                 use_feedable_sw_block_grained<4>(padded_a, padded_a, padded_c);
01420                 break;
01421             #endif
01422             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 3 && 3 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
01423             case 3:
01424                 use_feedable_sw_block_grained<3>(padded_a, padded_a, padded_c);
01425                 break;
01426             #endif
01427             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 2 && 2 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
01428             case 2:
01429                 use_feedable_sw_block_grained<2>(padded_a, padded_a, padded_c);
01430                 break;
01431             #endif
01432             default:  // only here in case of wrong bounds
01433                 strassen_winograd_multiply_and_add_interleaved(A, B, C);
01434                 break;
01435             }
01436         }
01437         else
01438             // base case
01439             strassen_winograd_multiply_and_add_interleaved(A, B, C);
01440         return C;
01441     }
01442 
01443     // input matrices have to be padded
01444     template <unsigned Level>
01445     static void use_feedable_sw_block_grained(const swappable_block_matrix_type & A,
01446                                 const swappable_block_matrix_type & B,
01447                                 swappable_block_matrix_type & C)
01448     {
01449         const unsigned granularity = 1;
01450 
01451         feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level, true, true>
01452                 fsw(A, 0, 0, C.bs, C.get_height(), C.get_width(), A.get_width(), B, 0, 0);
01453         // preadditions for A
01454         {
01455             matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level, granularity>
01456                     mtq_a (A);
01457             for (size_type row = 0; row < mtq_a.get_height(); ++row)
01458                 for (size_type col = 0; col < mtq_a.get_width(); ++col)
01459                     fsw.feed_a(row, col, mtq_a(row, col));
01460         }
01461         // preadditions for B
01462         {
01463             matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level, granularity>
01464                     mtq_b (B);
01465             for (size_type row = 0; row < mtq_b.get_height(); ++row)
01466                 for (size_type col = 0; col < mtq_b.get_width(); ++col)
01467                     fsw.feed_b(row, col, mtq_b(row, col));
01468         }
01469         // recursive multiplications
01470         fsw.multiply();
01471         // postadditions
01472         {
01473             matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level, granularity>
01474                     mtq_c (C);
01475             for (size_type row = 0; row < mtq_c.get_height(); ++row)
01476                 for (size_type col = 0; col < mtq_c.get_width(); ++col)
01477                     fsw.read_and_add(row, col, mtq_c(row, col));
01478         }
01479     }
01480 
01481     //! \brief calculates C = A * B + C
01482     // requires fitting dimensions
01483     static swappable_block_matrix_type &
01484     multi_level_strassen_winograd_multiply_and_add(const swappable_block_matrix_type & A,
01485                                                    const swappable_block_matrix_type & B,
01486                                                    swappable_block_matrix_type & C)
01487     {
01488         int_type p = log2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height())));
01489 
01490         swappable_block_matrix_type padded_a(A, round_up_to_power_of_two(A.get_height(), p),
01491                                              round_up_to_power_of_two(A.get_width(), p), 0, 0),
01492                                     padded_b(B, round_up_to_power_of_two(B.get_height(), p),
01493                                              round_up_to_power_of_two(B.get_width(), p), 0, 0),
01494                                     padded_c(C, round_up_to_power_of_two(C.get_height(), p),
01495                                              round_up_to_power_of_two(C.get_width(), p), 0, 0);
01496         choose_level_for_feedable_sw(padded_a, padded_b, padded_c);
01497         return C;
01498     }
01499 
01500     // input matrices have to be padded
01501     static void choose_level_for_feedable_sw(const swappable_block_matrix_type & A,
01502                                              const swappable_block_matrix_type & B,
01503                                              swappable_block_matrix_type & C)
01504     {
01505         switch (log2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height()))))
01506         {
01507         default:
01508             /*
01509             use_feedable_sw<4>(A, B, C);
01510             break;
01511         case 3:
01512             use_feedable_sw<3>(A, B, C);
01513             break;
01514         case 2:*/
01515             use_feedable_sw<2>(A, B, C);
01516             break;
01517         case 1:
01518             /*use_feedable_sw<1>(A, B, C);
01519             break;*/
01520         case 0:
01521             // base case
01522             recursive_multiply_and_add(A, B, C);
01523             break;
01524         }
01525     }
01526 
01527     // input matrices have to be padded
01528     template <unsigned Level>
01529     static void use_feedable_sw(const swappable_block_matrix_type & A,
01530                                 const swappable_block_matrix_type & B,
01531                                 swappable_block_matrix_type & C)
01532     {
01533         feedable_strassen_winograd<ValueType, BlockSideLength, Level, true, true>
01534                 fsw(A, 0, 0, C.bs, C.get_height(), C.get_width(), A.get_width(), B, 0, 0);
01535         // preadditions for A
01536         matrix_to_quadtree<ValueType, BlockSideLength, Level>
01537                 mtq_a (A);
01538         for (size_type block_row = 0; block_row < mtq_a.get_height_in_blocks(); ++block_row)
01539             for (size_type block_col = 0; block_col < mtq_a.get_width_in_blocks(); ++block_col)
01540             {
01541                 fsw.begin_feeding_a_block(block_row, block_col,
01542                         mtq_a.begin_reading_block(block_row, block_col));
01543                 #if STXXL_PARALLEL
01544                 #pragma omp parallel for
01545                 #endif
01546                 for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block)
01547                     for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block)
01548                         fsw.feed_a_element(element_row_in_block * BlockSideLength + element_col_in_block,
01549                                 mtq_a.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
01550                 fsw.end_feeding_a_block(block_row, block_col,
01551                         mtq_a.end_reading_block(block_row, block_col));
01552             }
01553         // preadditions for B
01554         matrix_to_quadtree<ValueType, BlockSideLength, Level>
01555                 mtq_b (B);
01556         for (size_type block_row = 0; block_row < mtq_b.get_height_in_blocks(); ++block_row)
01557             for (size_type block_col = 0; block_col < mtq_b.get_width_in_blocks(); ++block_col)
01558             {
01559                 fsw.begin_feeding_b_block(block_row, block_col,
01560                         mtq_b.begin_reading_block(block_row, block_col));
01561                 #if STXXL_PARALLEL
01562                 #pragma omp parallel for
01563                 #endif
01564                 for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block)
01565                     for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block)
01566                         fsw.feed_b_element(element_row_in_block * BlockSideLength + element_col_in_block,
01567                                 mtq_b.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
01568                 fsw.end_feeding_b_block(block_row, block_col,
01569                         mtq_b.end_reading_block(block_row, block_col));
01570             }
01571         // recursive multiplications
01572         fsw.multiply();
01573         // postadditions
01574         matrix_to_quadtree<ValueType, BlockSideLength, Level>
01575                 mtq_c (C);
01576         for (size_type block_row = 0; block_row < mtq_c.get_height_in_blocks(); ++block_row)
01577             for (size_type block_col = 0; block_col < mtq_c.get_width_in_blocks(); ++block_col)
01578             {
01579                 mtq_c.begin_feeding_block(block_row, block_col,
01580                         fsw.begin_reading_block(block_row, block_col));
01581                 #if STXXL_PARALLEL
01582                 #pragma omp parallel for
01583                 #endif
01584                 for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block)
01585                     for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block)
01586                         mtq_c.feed_and_add_element(element_row_in_block * BlockSideLength + element_col_in_block,
01587                                 fsw.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
01588                 mtq_c.end_feeding_block(block_row, block_col,
01589                         fsw.end_reading_block(block_row, block_col));
01590             }
01591     }
01592 
01593     //! \brief calculates C = A * B
01594     // assumes fitting dimensions
01595     static swappable_block_matrix_type &
01596     strassen_winograd_multiply(const swappable_block_matrix_type & A,
01597                                const swappable_block_matrix_type & B,
01598                                swappable_block_matrix_type & C)
01599     {
01600         // base case
01601         if (C.get_height() <= strassen_winograd_base_case_size
01602                 || C.get_width() <= strassen_winograd_base_case_size
01603                 || A.get_width() <= strassen_winograd_base_case_size)
01604         {
01605             C.set_zero();
01606             return recursive_multiply_and_add(A, B, C);
01607         }
01608 
01609         // partition matrix
01610         swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C);
01611         // preadditions
01612         swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01613                                     s2(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01614                                     s3(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01615                                     s4(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01616                                     t1(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
01617                                     t2(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
01618                                     t3(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
01619                                     t4(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
01620         strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
01621         strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
01622         // recursive multiplications
01623         swappable_block_matrix_type p1(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed()),
01624                                  // p2 stored in qc.ul
01625                                     p3(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed()),
01626                                  // p4 stored in qc.dr
01627                                     p5(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed());
01628                                  // p6 stored in qc.ur
01629                                  // p7 stored in qc.dl
01630         strassen_winograd_multiply(qa.ul, qb.ul,    p1);
01631         strassen_winograd_multiply(qa.ur, qb.dl, qc.ul);
01632         strassen_winograd_multiply(   s1,    t1,    p3);
01633         strassen_winograd_multiply(   s2,    t2, qc.dr);
01634         strassen_winograd_multiply(   s3,    t3,    p5);
01635         strassen_winograd_multiply(   s4, qb.dr, qc.ur);
01636         strassen_winograd_multiply(qa.dr,    t4, qc.dl);
01637         // postadditions
01638         strassen_winograd_postaddition(qc.ul, qc.ur, qc.dl, qc.dr, p1, p3, p5);
01639         return C;
01640     }
01641 
01642     //! \brief calculates C = A * B + C
01643     // assumes fitting dimensions
01644     static swappable_block_matrix_type &
01645     strassen_winograd_multiply_and_add_interleaved(const swappable_block_matrix_type & A,
01646                                          const swappable_block_matrix_type & B,
01647                                          swappable_block_matrix_type & C)
01648     {
01649         // base case
01650         if (C.get_height() <= strassen_winograd_base_case_size
01651                 || C.get_width() <= strassen_winograd_base_case_size
01652                 || A.get_width() <= strassen_winograd_base_case_size)
01653             return recursive_multiply_and_add(A, B, C);
01654 
01655         // partition matrix
01656         swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C);
01657         // preadditions
01658         swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01659                                     s2(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01660                                     s3(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01661                                     s4(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
01662                                     t1(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
01663                                     t2(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
01664                                     t3(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
01665                                     t4(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
01666         strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
01667         strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
01668         // recursive multiplications and postadditions
01669         swappable_block_matrix_type px(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed());
01670         strassen_winograd_multiply_and_add_interleaved(qa.ur, qb.dl, qc.ul); // p2
01671         strassen_winograd_multiply_and_add_interleaved(qa.ul, qb.ul, px); // p1
01672         element_op<addition>(qc.ul, px);
01673         strassen_winograd_multiply_and_add_interleaved(s2, t2, px); // p4
01674         s2.set_zero();
01675         t2.set_zero();
01676         element_op<addition>(qc.ur, px);
01677         strassen_winograd_multiply_and_add_interleaved(s3, t3, px); // p5
01678         s3.set_zero();
01679         t3.set_zero();
01680         element_op_twice_nontransposed<addition>(qc.dl, qc.dr, px);
01681         px.set_zero();
01682         strassen_winograd_multiply_and_add_interleaved(qa.dr, t4, qc.dl); // p7
01683         t4.set_zero();
01684         strassen_winograd_multiply_and_add_interleaved(s1, t1, px); // p3
01685         s1.set_zero();
01686         t1.set_zero();
01687         element_op_twice_nontransposed<addition>(qc.dr, qc.ur, px);
01688         px.set_zero();
01689         strassen_winograd_multiply_and_add_interleaved(s4, qb.dr, qc.ur); // p6
01690         return C;
01691     }
01692 
01693     //! \brief calculates C = A * B + C
01694     // assumes fitting dimensions
01695     static swappable_block_matrix_type &
01696     strassen_winograd_multiply_and_add(const swappable_block_matrix_type & A,
01697                                        const swappable_block_matrix_type & B,
01698                                        swappable_block_matrix_type & C)
01699     {
01700         // base case
01701         if (C.get_height() <= strassen_winograd_base_case_size
01702                 || C.get_width() <= strassen_winograd_base_case_size
01703                 || A.get_width() <= strassen_winograd_base_case_size)
01704             return recursive_multiply_and_add(A, B, C);
01705 
01706         // partition matrix
01707         swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C);
01708         // preadditions
01709         swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width()),
01710                                     s2(C.bs, qa.ul.get_height(), qa.ul.get_width()),
01711                                     s3(C.bs, qa.ul.get_height(), qa.ul.get_width()),
01712                                     s4(C.bs, qa.ul.get_height(), qa.ul.get_width()),
01713                                     t1(C.bs, qb.ul.get_height(), qb.ul.get_width()),
01714                                     t2(C.bs, qb.ul.get_height(), qb.ul.get_width()),
01715                                     t3(C.bs, qb.ul.get_height(), qb.ul.get_width()),
01716                                     t4(C.bs, qb.ul.get_height(), qb.ul.get_width());
01717         element_op<subtraction>(s3, qa.ul, qa.dl);
01718         element_op<addition>(s1, qa.dl, qa.dr);
01719         element_op<subtraction>(s2, s1, qa.ul);
01720         element_op<subtraction>(s4, qa.ur, s2);
01721         element_op<subtraction>(t3, qb.dr, qb.ur);
01722         element_op<subtraction>(t1, qb.ur, qb.ul);
01723         element_op<subtraction>(t2, qb.dr, t1);
01724         element_op<subtraction>(t4, qb.dl, t2);
01725         // recursive multiplications and postadditions
01726         swappable_block_matrix_type px(C.bs, qc.ul.get_height(), qc.ul.get_width());
01727         strassen_winograd_multiply_and_add(qa.ur, qb.dl, qc.ul); // p2
01728         strassen_winograd_multiply_and_add(qa.ul, qb.ul, px); // p1
01729         element_op<addition>(qc.ul, px);
01730         strassen_winograd_multiply_and_add(s2, t2, px); // p4
01731         element_op<addition>(qc.ur, px);
01732         strassen_winograd_multiply_and_add(s3, t3, px); // p5
01733         element_op<addition>(qc.dl, px);
01734         element_op<addition>(qc.dr, px);
01735         px.set_zero();
01736         strassen_winograd_multiply_and_add(qa.dr, t4, qc.dl); // p7
01737         strassen_winograd_multiply_and_add(s1, t1, px); // p3
01738         element_op<addition>(qc.dr, px);
01739         element_op<addition>(qc.ur, px);
01740         strassen_winograd_multiply_and_add(s4, qb.dr, qc.ur); // p6
01741         return C;
01742     }
01743 
01744     //! \brief calculates C = A * B + C
01745     // assumes fitting dimensions
01746     static swappable_block_matrix_type &
01747     recursive_multiply_and_add(const swappable_block_matrix_type & A,
01748                                const swappable_block_matrix_type & B,
01749                                swappable_block_matrix_type & C)
01750     {
01751         // catch empty intervals
01752         if (C.get_height() * C.get_width() * A.get_width() == 0)
01753             return C;
01754         // base case
01755         if ((C.get_height() == 1) + (C.get_width() == 1) + (A.get_width() == 1) >= 2)
01756             return naive_multiply_and_add(A, B, C);
01757 
01758         // partition matrix
01759         swappable_block_matrix_approximative_quarterer qa(A), qb(B), qc(C);
01760         // recursive multiplication
01761         // The order of recursive calls is optimized to enhance locality. C has priority because it has to be read and written.
01762         recursive_multiply_and_add(qa.ul, qb.ul, qc.ul);
01763         recursive_multiply_and_add(qa.ur, qb.dl, qc.ul);
01764         recursive_multiply_and_add(qa.ur, qb.dr, qc.ur);
01765         recursive_multiply_and_add(qa.ul, qb.ur, qc.ur);
01766         recursive_multiply_and_add(qa.dl, qb.ur, qc.dr);
01767         recursive_multiply_and_add(qa.dr, qb.dr, qc.dr);
01768         recursive_multiply_and_add(qa.dr, qb.dl, qc.dl);
01769         recursive_multiply_and_add(qa.dl, qb.ul, qc.dl);
01770 
01771         return C;
01772     }
01773 
01774     //! \brief calculates C = A * B + C
01775     // requires fitting dimensions
01776     static swappable_block_matrix_type &
01777     naive_multiply_and_add(const swappable_block_matrix_type & A,
01778                            const swappable_block_matrix_type & B,
01779                            swappable_block_matrix_type & C)
01780     {
01781         const size_type & n = C.get_height(),
01782                         & m = C.get_width(),
01783                         & l = A.get_width();
01784         for (size_type i = 0; i < n; ++i)
01785             for (size_type j = 0; j < m; ++j)
01786                 for (size_type k = 0; k < l; ++k)
01787                     multiply_and_add_swappable_block(A(i,k), A.is_transposed(), A.bs,
01788                                                      B(k,j), B.is_transposed(), B.bs,
01789                                                      C(i,j), C.is_transposed(), C.bs);
01790         return C;
01791     }
01792 
01793     static void multiply_and_add_swappable_block(
01794             const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a,
01795             const swappable_block_identifier_type b, const bool b_is_transposed, block_scheduler_type & bs_b,
01796             const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type & bs_c)
01797     {
01798         if (! bs_c.is_simulating())
01799             ++ matrix_operation_statistic::get_instance()->block_multiplication_calls;
01800         // check if zero-block (== ! initialized)
01801         if (! bs_a.is_initialized(a) || ! bs_b.is_initialized(b))
01802         {
01803             // => one factor is zero => product is zero
01804             if (! bs_c.is_simulating())
01805                 ++ matrix_operation_statistic::get_instance()->block_multiplications_saved_through_zero;
01806             return;
01807         }
01808         // acquire
01809         ValueType * ap = bs_a.acquire(a).begin(),
01810                   * bp = bs_b.acquire(b).begin(),
01811                   * cp = bs_c.acquire(c).begin();
01812         // multiply
01813         if (! bs_c.is_simulating())
01814             low_level_matrix_multiply_and_add<ValueType, BlockSideLength>
01815                     (ap, a_is_transposed, bp, b_is_transposed, cp, c_is_transposed);
01816         // release
01817         bs_a.release(a, false);
01818         bs_b.release(b, false);
01819         bs_c.release(c, true);
01820     }
01821 
01822     // +-+ end matrix multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
01823 
01824     // +-+-+-+ matrix-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
01825 
01826     //! \brief calculates z = A * x
01827     static column_vector_type &
01828     recursive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type & A,
01829                                          const column_vector_type & x, column_vector_type & z,
01830                                          const vector_size_type offset_x = 0, const vector_size_type offset_z = 0)
01831     {
01832         // catch empty intervals
01833         if (A.get_height() * A.get_width() == 0)
01834             return z;
01835         // base case
01836         if (A.get_height() == 1 || A.get_width() == 1)
01837             return naive_matrix_col_vector_multiply_and_add(A, x, z, offset_x, offset_z);
01838 
01839         // partition matrix
01840         swappable_block_matrix_approximative_quarterer qa(A);
01841         // recursive multiplication
01842         // The order of recursive calls is optimized to enhance locality.
01843         recursive_matrix_col_vector_multiply_and_add(qa.ul, x, z, offset_x,                     offset_z                     );
01844         recursive_matrix_col_vector_multiply_and_add(qa.ur, x, z, offset_x + qa.ul.get_width(), offset_z                     );
01845         recursive_matrix_col_vector_multiply_and_add(qa.dr, x, z, offset_x + qa.ul.get_width(), offset_z + qa.ul.get_height());
01846         recursive_matrix_col_vector_multiply_and_add(qa.dl, x, z, offset_x,                     offset_z + qa.ul.get_height());
01847 
01848         return z;
01849     }
01850 
01851     static column_vector_type &
01852     naive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type & A,
01853                                      const column_vector_type & x, column_vector_type & z,
01854                                      const vector_size_type offset_x = 0, const vector_size_type offset_z = 0)
01855     {
01856         for (size_type row = 0; row < A.get_height(); ++row)
01857             for (size_type col = 0; col < A.get_width(); ++col)
01858                 matrix_col_vector_multiply_and_add_swappable_block(A(row, col), A.is_transposed(), A.bs,
01859                         x, z, (offset_x + col) * BlockSideLength, (offset_z + row) * BlockSideLength);
01860         return z;
01861     }
01862 
01863     static void matrix_col_vector_multiply_and_add_swappable_block(
01864             const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a,
01865             const column_vector_type & x, column_vector_type & z,
01866             const vector_size_type offset_x = 0, const vector_size_type offset_z = 0)
01867     {
01868         // check if zero-block (== ! initialized)
01869         if (! bs_a.is_initialized(a))
01870         {
01871             // => matrix is zero => product is zero
01872             return;
01873         }
01874         // acquire
01875         internal_block_type & ia = bs_a.acquire(a);
01876         // multiply
01877         if (! bs_a.is_simulating())
01878         {
01879             int_type row_limit = std::min(BlockSideLength, unsigned(z.size() - offset_z)),
01880                      col_limit = std::min(BlockSideLength, unsigned(x.size() - offset_x));
01881             if (a_is_transposed)
01882                 for (int_type col = 0; col < col_limit; ++col)
01883                     for (int_type row = 0; row < row_limit; ++row)
01884                         z[offset_z + row] += x[offset_x + col] * ia[row + col * BlockSideLength];
01885             else
01886                 for (int_type row = 0; row < row_limit; ++row)
01887                     for (int_type col = 0; col < col_limit; ++col)
01888                         z[offset_z + row] += x[offset_x + col] * ia[row * BlockSideLength + col];
01889         }
01890         // release
01891         bs_a.release(a, false);
01892     }
01893 
01894     //! \brief calculates z = y * A
01895     static row_vector_type &
01896     recursive_matrix_row_vector_multiply_and_add(const row_vector_type & y,
01897             const swappable_block_matrix_type & A, row_vector_type & z,
01898             const vector_size_type offset_y = 0, const vector_size_type offset_z = 0)
01899     {
01900         // catch empty intervals
01901         if (A.get_height() * A.get_width() == 0)
01902             return z;
01903         // base case
01904         if (A.get_height() == 1 || A.get_width() == 1)
01905             return naive_matrix_row_vector_multiply_and_add(y, A, z, offset_y, offset_z);
01906 
01907         // partition matrix
01908         swappable_block_matrix_approximative_quarterer qa(A);
01909         // recursive multiplication
01910         // The order of recursive calls is optimized to enhance locality.
01911         recursive_matrix_row_vector_multiply_and_add(y, qa.ul, z, offset_y,                      offset_z                    );
01912         recursive_matrix_row_vector_multiply_and_add(y, qa.dl, z, offset_y + qa.ul.get_height(), offset_z                    );
01913         recursive_matrix_row_vector_multiply_and_add(y, qa.dr, z, offset_y + qa.ul.get_height(), offset_z + qa.ul.get_width());
01914         recursive_matrix_row_vector_multiply_and_add(y, qa.ur, z, offset_y,                      offset_z + qa.ul.get_width());
01915 
01916         return z;
01917     }
01918 
01919     static row_vector_type &
01920     naive_matrix_row_vector_multiply_and_add(const row_vector_type & y, const swappable_block_matrix_type & A,
01921                                      row_vector_type & z,
01922                                      const vector_size_type offset_y = 0, const vector_size_type offset_z = 0)
01923     {
01924         for (size_type row = 0; row < A.get_height(); ++row)
01925             for (size_type col = 0; col < A.get_width(); ++col)
01926                 matrix_row_vector_multiply_and_add_swappable_block(y, A(row, col), A.is_transposed(), A.bs,
01927                         z, (offset_y + row) * BlockSideLength, (offset_z + col) * BlockSideLength);
01928         return z;
01929     }
01930 
01931     static void matrix_row_vector_multiply_and_add_swappable_block(const row_vector_type & y,
01932             const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type & bs_a,
01933             row_vector_type & z,
01934             const vector_size_type offset_y = 0, const vector_size_type offset_z = 0)
01935     {
01936         // check if zero-block (== ! initialized)
01937         if (! bs_a.is_initialized(a))
01938         {
01939             // => matrix is zero => product is zero
01940             return;
01941         }
01942         // acquire
01943         internal_block_type & ia = bs_a.acquire(a);
01944         // multiply
01945         if (! bs_a.is_simulating())
01946         {
01947             int_type row_limit = std::min(BlockSideLength, unsigned(y.size() - offset_y)),
01948                      col_limit = std::min(BlockSideLength, unsigned(z.size() - offset_z));
01949             if (a_is_transposed)
01950                 for (int_type col = 0; col < col_limit; ++col)
01951                     for (int_type row = 0; row < row_limit; ++row)
01952                         z[offset_z + col] += ia[row + col * BlockSideLength] * y[offset_y + row];
01953             else
01954                 for (int_type row = 0; row < row_limit; ++row)
01955                     for (int_type col = 0; col < col_limit; ++col)
01956                         z[offset_z + col] += ia[row * BlockSideLength + col] * y[offset_y + row];
01957         }
01958         // release
01959         bs_a.release(a, false);
01960     }
01961 
01962     // +-+ end matrix-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
01963 
01964     // +-+-+-+ vector-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
01965 
01966     static void recursive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type & l,
01967             const row_vector_type & r, vector_size_type offset_l = 0, vector_size_type offset_r = 0)
01968     {
01969         // catch empty intervals
01970         if (A.get_height() * A.get_width() == 0)
01971             return;
01972         // base case
01973         if (A.get_height() == 1 || A.get_width() == 1)
01974         {
01975             naive_matrix_from_vectors(A, l, r, offset_l, offset_r);
01976             return;
01977         }
01978 
01979         // partition matrix
01980         swappable_block_matrix_approximative_quarterer qa(A);
01981         // recursive creation
01982         // The order of recursive calls is optimized to enhance locality.
01983         recursive_matrix_from_vectors(qa.ul, l, r, offset_l,                      offset_r                    );
01984         recursive_matrix_from_vectors(qa.ur, l, r, offset_l,                      offset_r + qa.ul.get_width());
01985         recursive_matrix_from_vectors(qa.dr, l, r, offset_l + qa.ul.get_height(), offset_r + qa.ul.get_width());
01986         recursive_matrix_from_vectors(qa.dl, l, r, offset_l + qa.ul.get_height(), offset_r                    );
01987     }
01988 
01989     static void naive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type & l,
01990             const row_vector_type & r, vector_size_type offset_l = 0, vector_size_type offset_r = 0)
01991     {
01992         for (size_type row = 0; row < A.get_height(); ++row)
01993             for (size_type col = 0; col < A.get_width(); ++col)
01994                 matrix_from_vectors_swappable_block(A(row, col), A.is_transposed(), A.bs,
01995                         l, r, (offset_l + row) * BlockSideLength, (offset_r + col) * BlockSideLength);
01996     }
01997 
01998     static void matrix_from_vectors_swappable_block(swappable_block_identifier_type a,
01999             const bool a_is_transposed, block_scheduler_type & bs_a,
02000             const column_vector_type & l, const row_vector_type & r,
02001             vector_size_type offset_l, vector_size_type offset_r)
02002     {
02003         // acquire
02004         internal_block_type & ia = bs_a.acquire(a, true);
02005         // multiply
02006         if (! bs_a.is_simulating())
02007         {
02008             int_type row_limit = std::min(BlockSideLength, unsigned(l.size() - offset_l)),
02009                      col_limit = std::min(BlockSideLength, unsigned(r.size() - offset_r));
02010             if (a_is_transposed)
02011                 for (int_type col = 0; col < col_limit; ++col)
02012                     for (int_type row = 0; row < row_limit; ++row)
02013                         ia[row + col * BlockSideLength] = l[row + offset_l] * r[col + offset_r];
02014             else
02015                 for (int_type row = 0; row < row_limit; ++row)
02016                     for (int_type col = 0; col < col_limit; ++col)
02017                         ia[row * BlockSideLength + col] = l[row + offset_l] * r[col + offset_r];
02018         }
02019         // release
02020         bs_a.release(a, true);
02021     }
02022 
02023     // +-+ end vector-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
02024 };
02025 
02026 // Adjust choose_level_for_feedable_sw, too!
02027 template <typename ValueType, unsigned BlockSideLength>
02028 const int_type matrix_operations<ValueType, BlockSideLength>::strassen_winograd_base_case_size = 3;
02029 
02030 __STXXL_END_NAMESPACE
02031 
02032 #endif /* STXXL_MATRIX_ARITHMETIC_HEADER */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines