27 std::ostream&
operator<<(std::ostream& o,
const tensor_ranges& r) {
30 o <<
"[0.." << r[i] <<
"]";
48 template<
typename IT>
class basic_multi_iterator {
52 tensor_strides strides;
54 index_set ilst2idxnums;
55 std::vector<const tensor_strides*> slst;
59 const tensor_ranges& getcnt()
const {
return cnt; }
60 basic_multi_iterator() {
65 ilst2idxnums.reserve(64); iter.reserve(4);
67 const tensor_ranges& all_ranges()
const {
return ranges; }
68 const index_set& all_indexes()
const {
return idxnums; }
71 void insert(
const index_set& idxs,
const tensor_ranges& r,
const tensor_strides& s, IT it_) {
72 assert(idxs.size() == r.size()); assert(s.size() == r.size()+1);
74 for (
unsigned int i=0; i < idxs.size(); ++i) {
75 index_set::const_iterator f=std::find(idxnums.begin(), idxnums.end(), idxs[i]);
76 if (f == idxnums.end()) {
77 ilst2idxnums.push_back(dim_type(idxnums.size()));
78 idxnums.push_back(idxs[i]);
79 ranges.push_back(r[i]);
81 ilst2idxnums.push_back(dim_type(f-idxnums.begin()));
82 assert(ranges[ilst2idxnums.back()] == r[i]);
90 strides.assign(N*idxnums.size(),0);
91 for (
unsigned int i=0; i < N; ++i) {
92 for (
unsigned int j=0; j < slst[i]->size()-1; ++j) {
93 stride_type s = (*slst[i])[j];
94 strides[ilst2idxnums[c]*N + i] = s;
99 for (
unsigned int i=0; i < idxnums.size(); ++i) {
100 for (
unsigned int j=0; j < N; ++j)
104 cnt.assign(idxnums.size(),0);
109 bool next(
unsigned int b) {
return next(b,N); }
110 bool next(
unsigned int b,
unsigned int NN) {
113 if (++cnt[i0] < ranges[i0]) {
114 for (
unsigned int i=b; i < NN; ++i)
115 iter[i] += strides[i0*NN+i];
118 for (
unsigned int i=b; i < NN; ++i)
119 iter[i] -= strides[i0*NN+i]*(ranges[i0]-1);
126 template<
unsigned b,
unsigned NN>
bool qnext() {
return next(b,NN); }
127 IT it(
unsigned int b)
const {
return iter[b]; }
135 tensor_mask::tensor_mask(
const tensor_mask& tm1,
const tensor_mask& tm2,
bool and_op) {
136 assign(tm1,tm2,and_op); unset_card();
139 void tensor_mask::assign(
const tensor_mask& tm1,
const tensor_mask& tm2,
bool and_op) {
140 clear(); unset_card();
141 if (tm1.ndim()==0) { assign(tm2);
return; }
142 if (tm2.ndim()==0) { assign(tm1);
return; }
144 idxs = tm1.indexes();
149 tensor_ranges global_r(std::max(tm1.max_dim(),tm2.max_dim())+1, index_type(1));
151 for (index_type i = 0; i < tm1.indexes().size(); ++i)
152 global_r[tm1.indexes()[i]] = tm1.ranges()[i];
155 for (index_type i = 0; i < tm2.indexes().size(); ++i) {
156 index_set::const_iterator f=std::find(tm1.indexes().begin(), tm1.indexes().end(), tm2.indexes()[i]);
157 if (f == tm1.indexes().end()) {
158 assert(global_r[tm2.indexes()[i]]==1);
159 global_r[tm2.indexes()[i]] = tm2.ranges()[i];
160 r.push_back(tm2.ranges()[i]);
161 idxs.push_back(tm2.indexes()[i]);
163 else assert(global_r[tm2.indexes()[i]] = tm2.ranges()[i]);
167 m.assign(size(),
false);
169 for (tensor_ranges_loop l(global_r); !l.finished(); l.next()) {
171 if (tm1(l.cnt) && tm2(l.cnt))
174 if (tm1(l.cnt) || tm2(l.cnt))
183 void tensor_mask::assign(
const tensor_mask& tm1,
const tensor_mask& tm2,
bool and_op) {
184 clear(); unset_card();
187 if (tm1.ndim()==0) { assign(tm2);
return; }
188 if (tm2.ndim()==0) { assign(tm1);
return; }
191 if (tm1.indexes() == tm2.indexes() &&
192 tm1.ranges() == tm2.ranges() &&
193 tm1.strides() == tm2.strides()) {
194 r = tm1.ranges(); idxs=tm1.indexes(); s=tm1.strides();
195 assert(tm1.m.size() == tm2.m.size());
198 for (index_type i=0; i < tm2.m.size(); ++i)
202 for (index_type i=0; i < tm2.m.size(); ++i)
209 basic_multi_iterator<unsigned> bmit;
210 bmit.insert(tm1.indexes(), tm1.ranges(), tm1.strides(), 0);
211 bmit.insert(tm2.indexes(), tm2.ranges(), tm2.strides(), 0);
212 r = bmit.all_ranges(); idxs = bmit.all_indexes(); eval_strides();
214 m.assign(size(),
false);
215 bmit.insert(indexes(), ranges(), strides(), 0);
220 if (tm1.m[bmit.it(0)]) {
222 if (tm2.m[bmit.it(1)])
226 }
while (bmit.qnext<1,3>());
228 }
while (bmit.qnext<0,3>());
231 bool v1 = tm1.m[bmit.it(0)];
233 if (v1 || tm2.m[bmit.it(1)])
235 }
while (bmit.qnext<1,3>());
236 }
while (bmit.qnext<0,3>());
244 tensor_mask::tensor_mask(
const std::vector<const tensor_mask*>& tm) {
249 void tensor_mask::assign(
const std::vector<const tensor_mask*> & tm) {
250 index_set mask_start; unset_card();
252 r.reserve(255); idxs.reserve(255); mask_start.reserve(255);
253 for (dim_type i=0; i < tm.size(); ++i) {
254 mask_start.push_back(r.size());
255 for (dim_type j=0; j < tm[i]->ndim(); ++j) {
256 r.push_back(tm[i]->ranges()[j]);
257 idxs.push_back(tm[i]->indexes()[j]);
260 eval_strides(); assert(size());
261 m.assign(size(),
false);
262 for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
264 for (dim_type i=0; is_in && i < tm.size(); ++i) {
266 for (dim_type j=0; j < tm[i]->ndim(); ++j)
267 s_ += l.cnt[j+mask_start[i]]*tm[i]->strides()[j];
271 if (is_in) m.add(lpos(l.cnt));
276 void tensor_mask::assign(
const std::vector<const tensor_mask*> & tm) {
278 if (tm.size() == 0) {
clear();
return; }
279 if (tm.size() == 1) { assign(*tm[0]);
return; }
281 basic_multi_iterator<unsigned> bmit;
282 for (dim_type i=0; i < tm.size(); ++i)
283 bmit.insert(tm[i]->indexes(), tm[i]->ranges(), tm[i]->strides(), 0);
284 r = bmit.all_ranges(); idxs = bmit.all_indexes(); eval_strides();
286 m.assign(size(),
false);
287 bmit.insert(indexes(), ranges(), strides(), 0);
289 unsigned N = unsigned(tm.size());
290 bool finished =
false;
294 for (b=0; b < N; ++b) {
295 if (!tm[b]->m[bmit.it(b)]) {
300 if (is_in) { m[bmit.it(N)] = 1; b = N-1; }
301 while (!finished && !bmit.next(b)) {
if (b == 0) finished =
true; b--; }
305 void tensor_mask::unpack_strides(
const tensor_strides& packed, tensor_strides& unpacked)
const {
306 if (packed.size() != card())
307 assert(packed.size()==card());
308 unpacked.assign(size(),INT_MIN);
310 for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
311 if (m[lpos(l.cnt)]) unpacked[lpos(l.cnt)] = packed[i++];
315 void tensor_mask::check_assertions()
const {
316 GMM_ASSERT3(r.size() == idxs.size(),
"");
317 GMM_ASSERT3(s.size() == idxs.size()+1,
"");
318 GMM_ASSERT3(m.size() == size(),
"");
320 for (dim_type i=0; i < idxs.size(); ++i) {
321 GMM_ASSERT3(!bv.is_in(i),
"");
326 tensor_mask::tensor_mask(
const std::vector<const tensor_mask*> tm1,
const std::vector<const tensor_mask*> tm2,
bool and_op) {
327 assign(tensor_mask(tm1), tensor_mask(tm2), and_op);
330 void tensor_ref::set_sub_tensor(
const tensor_ref& tr,
const tensor_shape& sub) {
336 strides_.resize(masks().size());
337 for (dim_type i = 0; i < strides_.size(); ++i)
338 strides_[i].
resize(mask(i).card());
340 pbase_ = tr.pbase_; base_shift_ = tr.base_shift();
348 std::vector<tensor_strides> trstrides_unpacked(tr.masks().size());
349 for (dim_type im = 0; im < tr.masks().size(); ++im) {
350 tr.mask(im).check_assertions();
351 tr.mask(im).unpack_strides(tr.strides()[im], trstrides_unpacked[im]);
356 for (dim_type im = 0; im < masks().size(); ++im) {
357 const tensor_mask& m = masks()[im];
363 std::vector<dim_type> trmasks; trmasks.reserve(tr.masks().size());
364 for (dim_type i=0; i < m.indexes().size(); ++i) {
365 if (tr.index_is_valid(m.indexes()[i])) {
366 dim_type im2 = tr.index_to_mask_num(m.indexes()[i]);
367 if (std::find(trmasks.begin(), trmasks.end(), im2)==trmasks.end())
368 trmasks.push_back(im2);
372 tensor_ranges gcnt(tr.ndim(),0);
373 stride_type stcnt = 0;
375 for (tensor_ranges_loop l(m.ranges()); !l.finished(); l.next()) {
377 for (dim_type i=0; i < m.ranges().size(); ++i)
378 gcnt[m.indexes()[i]] = l.index(i);
382 stride_type tr_s = 0;
386 for (dim_type i=0; i < trmasks.size(); ++i) {
387 const tensor_mask &mm = tr.mask(trmasks[i]);
391 tr_s += trstrides_unpacked[trmasks[i]][mm.pos(gcnt)];
392 assert(trstrides_unpacked[trmasks[i]][mm.pos(gcnt)]>=0);
393 }
else { in_trm =
false;
break; }
396 if (in_m) assert(in_trm);
397 if (!in_trm) assert(!in_m);
401 strides_[im][stcnt++] = tr_s;
406 assert(stcnt == stride_type(m.card()));
414 tensor_ref::tensor_ref(
const tensor_ref& tr, tensor_mask::Slice slice) {
415 set_sub_tensor(tr, tr.slice_shape(slice));
421 const tensor_mask& m1(index_to_mask(slice.dim));
422 dim_type mdim = index_to_mask_dim(slice.dim);
424 tensor_ranges r(m1.ranges()); r.erase(r.begin()+mdim);
425 index_set idx(m1.indexes()); idx.erase(idx.begin()+mdim);
426 tensor_mask m2(r,idx);
427 index_type pos1 = 0, pos2 = 0;
428 for (tensor_ranges_loop l(m1.ranges()); !l.finished(); l.next()) {
429 if (l.index(mdim) == slice.i0)
430 m2.set_mask_val(pos2++, m1(pos1));
432 assert(m1(pos1) == 0);
438 assert(index_to_mask_num(slice.dim) < masks().size());
439 masks()[index_to_mask_num(slice.dim)] = m2;
442 remove_mask(index_to_mask_num(slice.dim));
445 shift_dim_num_ge(slice.dim,-1);
446 set_ndim_noclean(dim_type(ndim()-1));
451 struct compare_packed_range {
452 const std::vector<packed_range_info>& pri;
453 std::vector<stride_type> mean_inc;
454 compare_packed_range(
const std::vector<packed_range_info>& pri_) : pri(pri_) {}
455 bool operator()(dim_type a, dim_type b) {
456 if (pri[a].n < pri[b].n)
return true;
457 else if (pri[a].n > pri[b].n)
return false;
459 if (pri[a].mean_increm > pri[b].mean_increm)
466 void multi_tensor_iterator::init(std::vector<tensor_ref> trtab,
bool with_index_values) {
467 N = index_type(trtab.size());
468 index_type N_packed_idx = 0;
472 tensor_shape ts(trtab[0]);
473 for (dim_type i=1; i < trtab.size(); ++i)
479 for (dim_type i = 0; i < N; ++i) {
480 tensor_ref tmp = trtab[i];
481 trtab[i].set_sub_tensor(tmp,ts);
485 dal::bit_vector packed_idx; packed_idx.sup(0,ts.ndim());
486 for (index_type mi=0; mi < ts.masks().size(); ++mi) {
487 if (ts.masks()[mi].card() != 1) {
492 for (dim_type j=0; j < N; ++j) {
493 if (trtab[j].strides()[mi].size() != 0) {
494 assert(trtab[j].strides()[mi].size() == 1);
495 assert(trtab[j].strides()[mi][0] == 0);
501 pr.resize(N_packed_idx);
502 pri.resize(N_packed_idx);
507 for (dim_type mi = 0; mi < ts.masks().size(); ++mi) {
508 if (packed_idx[mi]) {
510 pri[pmi].original_masknum = mi;
511 pri[pmi].range = ts.masks()[mi].card();
512 for (n = 0; n < N; n++)
513 if (trtab[n].index_is_valid(mi))
522 std::sort(pri.begin(), pri.end());
526 bloc_rank.resize(N+1); bloc_rank[0] = 0;
527 bloc_nelt.resize(N+1); bloc_nelt[0] = 0;
528 for (index_type i=1; i <= N; ++i) {
529 bloc_rank[i] = bloc_rank[i-1];
530 while (bloc_rank[i] < pri.size() && pri[bloc_rank[i]].n == i-1) bloc_rank[i]++;
531 bloc_nelt[i] = bloc_rank[i] - bloc_rank[i-1];
535 for (pmi = 0; pmi < pri.size(); ++pmi) {
536 index_type mi = pri[pmi].original_masknum;
537 pri[pmi].mean_increm = 0;
538 pri[pmi].inc.assign(pri[pmi].range*(N-pri[pmi].n), 0);
539 pri[pmi].have_regular_strides.reset();
540 pri[pmi].have_regular_strides = std::bitset<32>((1 << N)-1);
541 for (dim_type n=pri[pmi].n; n < N; ++n) {
542 index_type pos0 = (n - pri[pmi].n);
543 for (index_type i = 0; i < pri[pmi].range; ++i) {
544 index_type pos = i * (N-pri[pmi].n) + pos0;
545 if (i != pri[pmi].range-1) {
546 stride_type increm = trtab[n].strides()[mi][i+1] - trtab[n].strides()[mi][i];
547 pri[pmi].inc[pos] = increm;
548 if (pri[pmi].inc[pos] != pri[pmi].inc[pos0])
549 pri[pmi].have_regular_strides[n] =
false;
550 pri[pmi].mean_increm += increm;
552 pri[pmi].inc[pos] = -trtab[n].strides()[mi][i];
556 pri[pmi].mean_increm /= (N-pri[pmi].n)*(pri[pmi].range-1);
560 index_set pr_reorder(pri.size());
561 for (pmi = 0; pmi < pri.size(); ++pmi) {
562 pr_reorder[pmi]=dim_type(pmi);
564 std::sort(pr_reorder.begin(), pr_reorder.end(), compare_packed_range(pri));
566 std::vector<packed_range> tmppr(pr);
567 std::vector<packed_range_info> tmppri(pri);
568 for (dim_type i =0; i < pri.size(); ++i) {
569 pr[i] = tmppr [pr_reorder[i]];
570 pri[i] = tmppri[pr_reorder[i]];
575 if (with_index_values) {
576 idxval.resize(ts.ndim());
578 for (dim_type i=0; i < ts.ndim(); ++i) {
579 idxval[i].ppinc = NULL;
580 idxval[i].pposbase = NULL;
584 for (index_type mi = 0; mi < ts.masks().size(); ++mi) {
586 pmi = index_type(-1);
587 for (dim_type i=0; i < pr.size(); ++i)
588 if (pri[i].original_masknum == mi)
591 if (pmi != index_type(-1)) {
592 ts.masks()[mi].gen_mask_pos(pri[pmi].mask_pos);
594 ts.masks()[mi].gen_mask_pos(v);
597 for (dim_type i=0; i < ts.masks()[mi].indexes().size(); ++i) {
598 dim_type ii = ts.masks()[mi].indexes()[i];
599 idxval[ii].cnt_num = dim_type(pmi);
600 idxval[ii].pos_ = (pmi != index_type(-1)) ? 0 : v[0];
601 idxval[ii].mod = ts.masks()[mi].strides()[i+1];
602 idxval[ii].div = ts.masks()[mi].strides()[i];
610 vectorized_strides_.resize(N); vectorized_size_ = 0;
611 std::fill(vectorized_strides_.begin(), vectorized_strides_.end(), 0);
612 vectorized_pr_dim = index_type(pri.size());
613 for (vectorized_pr_dim = index_type(pri.size()-1); vectorized_pr_dim != index_type(-1); vectorized_pr_dim--) {
614 std::vector<packed_range_info>::const_iterator pp = pri.begin() + vectorized_pr_dim;
615 if (vectorized_pr_dim == pri.size()-1) {
616 if (pp->have_regular_strides.count() == N) vectorized_size_ = pp->range;
617 for (dim_type n=pp->n; n < N; ++n) {
618 GMM_ASSERT3(n < pp->inc.size(),
"");
619 vectorized_strides_[n] = pp->inc[n];
622 if (pp->have_regular_strides.count() != N)
break;
623 bool still_ok =
true;
624 for (dim_type n=pp->n; n < N; ++n) {
625 if (stride_type(vectorized_strides_[n]*vectorized_size_) != pp->inc[n]) still_ok =
false;
628 vectorized_size_ *= pp->range;
633 it.resize(N); pit0.resize(N); itbase.resize(N);
634 for (dim_type n=0; n < N; ++n) {
635 pit0[n]=trtab[n].pbase();
636 itbase[n]=trtab[n].base_shift();
641 void multi_tensor_iterator::print()
const {
642 cout <<
"MTI(N=" << N <<
"): ";
643 for (dim_type i=0; i < pr.size(); ++i)
644 cout <<
" pri[" <<
int(i) <<
"]: n=" << int(pri[i].n)
645 <<
", range=" << pri[i].range <<
", mean_increm="
646 << pri[i].mean_increm <<
", regular = " << pri[i].have_regular_strides
647 <<
", inc=" << vref(pri[i].inc) <<
"\n";
648 cout <<
"bloc_rank: " << vref(bloc_rank) <<
", bloc_nelt: " << vref(bloc_nelt) <<
"\n";
649 cout <<
"vectorized_size : " << vectorized_size_ <<
", strides = " << vref(vectorized_strides_) <<
", pr_dim=" << vectorized_pr_dim <<
"\n";
652 void tensor_reduction::insert(
const tensor_ref& tr_,
const std::string& s) {
653 tensor_shape ts(tr_);
655 trtab.push_back(tref_or_reduction(tensor_ref(tr_, ts), s));
662 void tensor_reduction::insert(
const tref_or_reduction& t,
const std::string& s) {
663 if (!t.is_reduction()) {
666 trtab.push_back(t); trtab.back().ridx = s;
676 void tensor_reduction::update_reduction_chars() {
677 reduction_chars.clear();
678 for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
679 assert(it->ridx.size() == it->tr().ndim());
680 for (
unsigned i =0; i < it->ridx.size(); ++i) {
681 if (it->ridx[i] !=
' ' &&
682 reduction_chars.find(it->ridx[i]) == std::string::npos)
683 reduction_chars.push_back(it->ridx[i]);
689 for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
690 it->gdim.resize(it->ridx.size());
691 for (
unsigned i =0; i < it->ridx.size(); ++i) {
692 char c = it->ridx[i];
693 if (c !=
' ' && it->ridx.find(c) != i) {
694 for (c =
'A'; c <=
'Z'; ++c)
695 if (reduction_chars.find(c) == std::string::npos)
break;
697 reduction_chars.push_back(it->ridx[i]);
706 void tensor_reduction::pre_prepare() {
707 for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
708 assert(it->ridx.size() == it->tr().ndim());
709 it->rdim.resize(it->ridx.size());
711 for (dim_type i =0; i < it->ridx.size(); ++i) {
712 if (it->ridx[i] ==
' ') {
713 reduced_range.push_back(it->tr().dim(i));
714 it->rdim[i] = dim_type(reduced_range.size()-1);
716 it->rdim[i] = dim_type(-1);
723 size_type tensor_reduction::find_best_sub_reduction(dal::bit_vector &best_lst, std::string &best_idxset) {
726 best_lst.clear(); best_idxset.clear();
728 update_reduction_chars();
731 GMM_ASSERT2(trtab.size() <= 32,
"wow it was assumed that nobody would "
732 "ever need a reduction on more than 32 tensors..");
734 std::vector<std::bitset<32> > idx_occurences(reduction_chars.size());
736 for (
unsigned ir=0; ir < reduction_chars.size(); ++ir) {
737 char c = reduction_chars[ir];
738 for (
unsigned tnum=0; tnum < trtab.size(); ++tnum)
739 idx_occurences[ir][tnum] = (trtab[tnum].ridx.find(c) != std::string::npos);
743 for (
unsigned ir=0; ir < reduction_chars.size(); ++ir) {
744 lst.clear(); lst.add(ir);
745 idxset.resize(0); idxset.push_back(reduction_chars[ir]);
747 for (
unsigned ir2=0; ir2 < reduction_chars.size(); ++ir2) {
749 if ((idx_occurences[ir2] | idx_occurences[ir]) == idx_occurences[ir]) {
751 idxset.push_back(reduction_chars[ir2]);
757 for (
unsigned tnum=0; tnum < trtab.size(); ++tnum) {
758 if (!idx_occurences[ir][tnum])
760 std::bitset<int(32)> once((
int)reduction_chars.size());
761 for (dim_type i=0; i < trtab[tnum].tr().ndim(); ++i) {
763 for (dal::bv_visitor j(lst); !j.finished(); ++j) {
764 if (trtab[tnum].ridx[i] == reduction_chars[(
size_t)j]) {
765 if (once[j]) ignore =
true;
770 redsz *= trtab[tnum].tr().dim(i);
774 if (redsz < best_redsz) {
777 for (
unsigned i=0; i < trtab.size(); ++i)
778 if (idx_occurences[ir][i]) best_lst.add(i);
779 best_idxset = idxset;
789 void tensor_reduction::make_sub_reductions() {
790 dal::bit_vector bv; std::string red;
793 find_best_sub_reduction(bv,red);
794 if (bv.card() < trtab.size() && red.size()) {
796 auto sub = std::make_shared<tensor_reduction>();
797 std::vector<dim_type> new_rdim; new_rdim.reserve(16);
798 std::string new_reduction;
799 for (dal::bv_visitor tnum(bv); !tnum.finished(); ++tnum) {
800 tref_or_reduction &t = trtab[tnum];
801 std::string re = t.ridx; t.ridx.clear();
802 for (
unsigned i = 0; i < re.size(); ++i) {
803 bool reduced =
false;
806 if (red.find(re[i]) == std::string::npos) c =
' ';
810 t.ridx.push_back(re[i]);
811 new_rdim.push_back(t.rdim[i]);
812 new_reduction.push_back(re[i]);
817 sub->insert(trtab[tnum], re);
823 trtab[bv.first_true()] = tref_or_reduction(sub, new_reduction);
824 trtab[bv.first_true()].rdim = new_rdim;
825 std::vector<tref_or_reduction> trtab2; trtab2.reserve(trtab.size() - bv.card() + 1);
826 for (
size_type i=0; i < trtab.size(); ++i)
827 if (!bv.is_in(i) || i == bv.first_true())
828 trtab2.push_back(trtab[i]);
842 void tensor_reduction::prepare(
const tensor_ref* tr_out) {
844 make_sub_reductions();
847 if (tr_out == NULL) {
848 trres = tensor_ref(reduced_range);
849 out_data.resize(trres.card());
850 pout_data = &out_data[0];
851 trres.set_base(pout_data);
853 GMM_ASSERT3(tr_out->ndim() == reduced_range.size(),
"");
854 for (dim_type i=0; i < tr_out->ndim(); ++i)
855 GMM_ASSERT3(tr_out->dim(i) == reduced_range[i],
"");
863 tensor_ranges global_range;
865 std::string global_chars;
867 global_range.reserve(16);
868 global_range.assign(reduced_range.begin(), reduced_range.end());
869 global_chars.insert(
size_type(0), reduced_range.size(),
' ');
870 for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
871 assert(it->rdim.size() == it->tr().ndim());
873 for (dim_type i=0; i < it->ridx.size(); ++i) {
874 if (it->rdim[i] == dim_type(-1)) {
875 assert(it->ridx[i] !=
' ');
877 if (p == std::string::npos) {
878 global_chars.push_back(it->ridx[i]);
879 global_range.push_back(it->tr().dim(i));
880 it->gdim[i] = dim_type(global_range.size() - 1);
882 GMM_ASSERT1(it->tr().dim(i) == global_range[p],
883 "inconsistent dimensions for reduction index "
884 << it->ridx[i] <<
"(" <<
int(it->tr().dim(i))
885 <<
" != " <<
int(global_range[p]) <<
")");
886 it->gdim[i] = dim_type(p);
894 std::vector<dim_type> reorder(global_chars.size(), dim_type(-1));
896 for (dim_type i=0; i < reduced_range.size(); ++i) reorder[i] = i;
898 trres.permute(reorder);
899 std::vector<tensor_ref> tt; tt.reserve(trtab.size()+1);
903 for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
904 std::fill(reorder.begin(), reorder.end(), dim_type(-1));
905 for (dim_type i=0; i < it->gdim.size(); ++i) {
906 reorder[it->gdim[i]] = i;
909 it->tr().permute(reorder);
910 tt.push_back(it->tr());
918 static void do_reduction1(bgeot::multi_tensor_iterator &mti) {
923 }
while (mti.bnext(1));
925 }
while (mti.bnext(0));
928 static bool do_reduction2v(bgeot::multi_tensor_iterator &mti) {
930 const std::vector<stride_type> &s = mti.vectorized_strides();
931 if (n && s[0] && s[1] && s[2] == 0) {
932 #if defined(GMM_USES_BLAS)
933 BLAS_INT nn = BLAS_INT(n), incx = s[1], incy = s[0];
935 dim_type incx = dim_type(s[1]), incy = dim_type(s[0]);
943 #if defined(GMM_USES_BLAS)
944 gmm::daxpy_(&nn, &mti.p(2), &mti.p(1), &incx, &mti.p(0), &incy);
947 scalar_type* itx = &mti.p(1);
948 scalar_type* ity = &mti.p(0);
956 }
while (mti.vnext());
961 static void do_reduction2a(bgeot::multi_tensor_iterator &mti) {
962 if (!do_reduction2v(mti)) {
964 mti.p(0) += mti.p(1)*mti.p(2);
965 }
while (mti.bnext(0));
969 static void do_reduction2b(bgeot::multi_tensor_iterator &mti) {
976 }
while (mti.bnext(2));
978 }
while (mti.bnext(1));
980 }
while (mti.bnext(0));
983 static bool do_reduction3v(bgeot::multi_tensor_iterator &mti) {
985 const std::vector<stride_type> &s = mti.vectorized_strides();
986 if (n && s[0] && s[1] && s[2] == 0 && s[3] == 0) {
987 #if defined(GMM_USES_BLAS)
988 BLAS_INT nn = BLAS_INT(n), incx = s[1], incy = s[0];
990 dim_type incx = dim_type(s[1]), incy = dim_type(s[0]);
993 double a = mti.p(2)*mti.p(3);
994 #if defined(GMM_USES_BLAS)
995 gmm::daxpy_(&nn, &a, &mti.p(1), &incx, &mti.p(0), &incy);
997 scalar_type* itx = &mti.p(1);
998 scalar_type* ity = &mti.p(0);
1006 }
while (mti.vnext());
1012 static void do_reduction3a(bgeot::multi_tensor_iterator &mti) {
1013 if (!do_reduction3v(mti)) {
1015 mti.p(0) += mti.p(1)*mti.p(2)*mti.p(3);
1016 }
while (mti.bnext(0));
1020 static void do_reduction3b(bgeot::multi_tensor_iterator &mti) {
1029 }
while (mti.bnext(3));
1031 }
while (mti.bnext(2));
1033 }
while (mti.bnext(1));
1035 }
while (mti.bnext(0));
1038 void tensor_reduction::do_reduction() {
1044 if (out_data.size()) memset(&out_data[0], 0, out_data.size()*
sizeof(out_data[0]));
1045 for (
unsigned i=0; i < trtab.size(); ++i) {
1046 if (trtab[i].is_reduction()) {
1047 trtab[i].reduction->do_reduction();
1048 trtab[i].reduction->result(trtab[i].tr());
1053 dim_type N = dim_type(trtab.size());
1056 }
else if (N == 2) {
1057 if (!mti.bnext_useful(2) && !mti.bnext_useful(1)) {
1058 do_reduction2a(mti);
1060 do_reduction2b(mti);
1062 }
else if (N == 3) {
1063 if (!mti.bnext_useful(1) && (!mti.bnext_useful(2)) && !mti.bnext_useful(3)) {
1064 do_reduction3a(mti);
1066 do_reduction3b(mti);
1068 }
else if (N == 4) {
1079 }
while (mti.bnext(4));
1081 }
while (mti.bnext(3));
1083 }
while (mti.bnext(2));
1085 }
while (mti.bnext(1));
1087 }
while (mti.bnext(0));
1089 GMM_ASSERT1(
false,
"unhandled reduction case ! (N=" <<
int(N) <<
")");
1093 void tensor_reduction::clear() {
1096 reduced_range.resize(0);
1097 reduction_chars.clear();
1100 pout_data = 0; trtab.reserve(10);
1105 void tensor_mask::print(std::ostream &o)
const {
1106 index_type c=card(
true);
1108 o <<
" mask : card=" << c <<
"(card_=" << card_ <<
", uptodate=" << card_uptodate <<
"), indexes=";
1109 for (dim_type i=0; i < idxs.size(); ++i)
1110 o << (i==0?
"":
", ") << int(idxs[i]) <<
":" << int(r[i]);
1112 if (c == size()) o <<
" FULL" << endl;
1115 if (idxs.size() == 1) {
1116 for (index_type i=0; i < m.size(); ++i)
1117 o << (m[i] ? 1 : 0);
1119 for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
1120 if (l.cnt[0] == 0 && l.cnt[1] == 0 && r.size()>2) {
1122 for (dim_type i=2; i < r.size(); ++i)
1123 o <<
"," << l.cnt[i];
1126 o << (m[lpos(l.cnt)] ? 1 : 0);
1127 if (l.cnt[0] == r[0]-1) {
1128 if (l.cnt[1] != r[1]-1) o <<
",";
1129 else if (idxs.size() > 2) o <<
"}";
1139 void tensor_shape::print(std::ostream& o)
const {
1140 o <<
" tensor_shape: n=" << idx2mask.size() <<
", idx2mask=";
1141 for (dim_type i=0; i < idx2mask.size(); ++i) {
1143 if (idx2mask[i].is_valid()) {
1144 o <<
"r" << dim(i) <<
":m" << int(idx2mask[i].mask_num) <<
"/" << int(idx2mask[i].mask_dim);
1145 }
else o <<
" (na) ";
1149 for (dim_type i=0; i < masks_.size(); ++i) o << masks_[i];
1150 o <<
" ^-- end tensor_shape" << endl;
1153 void tensor_ref::print(std::ostream& o)
const {
1154 o <<
"tensor_ref, n=" << int(ndim()) <<
", card=" << card(
true) <<
", base=" << base() << endl;
1155 for (dim_type i=0; i < strides().size(); ++i) {
1156 o <<
" * strides["<<i<<
"]={";
1157 for (
size_type j=0; j < strides()[i].size(); ++j) o << (j>0?
",":
"") << strides()[i][j];
1160 multi_tensor_iterator mti(*
this,
true);
1162 for (dim_type i = 0; i < mti.ndim(); ++i) {
1163 o << (i==0?
"(":
",");
1164 if (index_is_valid(i))
1170 o <<
" = " << mti.p(0) <<
"\t@base+" << &mti.p(0) - base();
1171 }
else o <<
"\t@" << size_t(&mti.p(0))/
sizeof(scalar_type);
1173 }
while (mti.qnext1());
1175 o <<
"^---- end tensor_ref" << endl;
1178 std::ostream&
operator<<(std::ostream& o,
const tensor_mask& m) {
1179 m.print(o);
return o;
1181 std::ostream&
operator<<(std::ostream& o,
const tensor_shape& ts) {
1182 ts.print(o);
return o;
1184 std::ostream&
operator<<(std::ostream& o,
const tensor_ref& tr) {
1185 tr.print(o);
return o;
1187 void print_tm(
const tensor_mask& tm) { tm.print_(); }
1188 void print_ts(
const tensor_shape& ts) { ts.print_(); }
1189 void print_tr(
const tensor_ref& tr) { tr.print_(); }
Sparse tensors, used during the assembly.
void clear(L &l)
clear (fill with zeros) a vector or matrix.
void resize(V &v, size_type n)
*/
gmm interface for fortran BLAS.
std::ostream & operator<<(std::ostream &o, const convex_structure &cv)
Print the details of the convex structure cvs to the output stream o.
size_t size_type
used as the common size type in the library