Actual source code: baij2.c

  1: #define PETSCMAT_DLL

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/inline/spops.h
 5:  #include src/inline/ilu.h
 6:  #include petscbt.h

 10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 14:   PetscInt       row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
 15:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 16:   PetscBT        table;

 19:   m     = a->mbs;
 20:   ai    = a->i;
 21:   aj    = a->j;
 22:   bs    = A->rmap.bs;

 24:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 26:   PetscBTCreate(m,table);
 27:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 28:   PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);

 30:   for (i=0; i<is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz  = 0;
 33:     PetscBTMemzero(m,table);
 34: 
 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i],&idx);
 37:     ISGetLocalSize(is[i],&n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j=0; j<n ; ++j){
 41:       ival = idx[j]/bs; /* convert the indices into block indices */
 42:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 43:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(is[i]);
 47: 
 48:     k = 0;
 49:     for (j=0; j<ov; j++){ /* for each overlap*/
 50:       n = isz;
 51:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row+1];
 55:         for (l = start; l<end ; l++){
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j=0; j<isz; j++) {
 63:       for (k=0; k<bs; k++)
 64:         nidx2[j*bs+k] = nidx[j]*bs+k;
 65:     }
 66:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 67:   }
 68:   PetscBTDestroy(table);
 69:   PetscFree(nidx);
 70:   PetscFree(nidx2);
 71:   return(0);
 72: }

 76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 80:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 81:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 82:   PetscInt       *irow,*icol,nrows,ncols,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
 83:   PetscInt       *aj = a->j,*ai = a->i;
 84:   MatScalar      *mat_a;
 85:   Mat            C;
 86:   PetscTruth     flag;

 89:   ISSorted(iscol,(PetscTruth*)&i);
 90:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 92:   ISGetIndices(isrow,&irow);
 93:   ISGetIndices(iscol,&icol);
 94:   ISGetLocalSize(isrow,&nrows);
 95:   ISGetLocalSize(iscol,&ncols);

 97:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
 98:   ssmap = smap;
 99:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
101:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102:   /* determine lens of each row */
103:   for (i=0; i<nrows; i++) {
104:     kstart  = ai[irow[i]];
105:     kend    = kstart + a->ilen[irow[i]];
106:     lens[i] = 0;
107:       for (k=kstart; k<kend; k++) {
108:         if (ssmap[aj[k]]) {
109:           lens[i]++;
110:         }
111:       }
112:     }
113:   /* Create and fill new matrix */
114:   if (scall == MAT_REUSE_MATRIX) {
115:     c = (Mat_SeqBAIJ *)((*B)->data);

117:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119:     if (!flag) {
120:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121:     }
122:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
123:     C = *B;
124:   } else {
125:     MatCreate(A->comm,&C);
126:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
127:     MatSetType(C,A->type_name);
128:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
129:   }
130:   c = (Mat_SeqBAIJ *)(C->data);
131:   for (i=0; i<nrows; i++) {
132:     row    = irow[i];
133:     kstart = ai[row];
134:     kend   = kstart + a->ilen[row];
135:     mat_i  = c->i[i];
136:     mat_j  = c->j + mat_i;
137:     mat_a  = c->a + mat_i*bs2;
138:     mat_ilen = c->ilen + i;
139:     for (k=kstart; k<kend; k++) {
140:       if ((tcol=ssmap[a->j[k]])) {
141:         *mat_j++ = tcol - 1;
142:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
143:         mat_a   += bs2;
144:         (*mat_ilen)++;
145:       }
146:     }
147:   }
148: 
149:   /* Free work space */
150:   ISRestoreIndices(iscol,&icol);
151:   PetscFree(smap);
152:   PetscFree(lens);
153:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
154:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
155: 
156:   ISRestoreIndices(isrow,&irow);
157:   *B = C;
158:   return(0);
159: }

163: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
164: {
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166:   IS             is1,is2;
168:   PetscInt       *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->rmap.bs,count;

171:   ISGetIndices(isrow,&irow);
172:   ISGetIndices(iscol,&icol);
173:   ISGetLocalSize(isrow,&nrows);
174:   ISGetLocalSize(iscol,&ncols);
175: 
176:   /* Verify if the indices corespond to each element in a block 
177:    and form the IS with compressed IS */
178:   PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
179:   iary = vary + a->mbs;
180:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
181:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182:   count = 0;
183:   for (i=0; i<a->mbs; i++) {
184:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185:     if (vary[i]==bs) iary[count++] = i;
186:   }
187:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
188: 
189:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
190:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191:   count = 0;
192:   for (i=0; i<a->mbs; i++) {
193:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
194:     if (vary[i]==bs) iary[count++] = i;
195:   }
196:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
197:   ISRestoreIndices(isrow,&irow);
198:   ISRestoreIndices(iscol,&icol);
199:   PetscFree(vary);

201:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
202:   ISDestroy(is1);
203:   ISDestroy(is2);
204:   return(0);
205: }

209: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210: {
212:   PetscInt       i;

215:   if (scall == MAT_INITIAL_MATRIX) {
216:     PetscMalloc((n+1)*sizeof(Mat),B);
217:   }

219:   for (i=0; i<n; i++) {
220:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
221:   }
222:   return(0);
223: }


226: /* -------------------------------------------------------*/
227: /* Should check that shapes of vectors and matrices match */
228: /* -------------------------------------------------------*/
229:  #include petscblaslapack.h

233: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234: {
235:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
236:   PetscScalar    *x,*z,sum;
237:   MatScalar      *v;
239:   PetscInt       mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
240:   PetscTruth     usecprow=a->compressedrow.use;

243:   VecGetArray(xx,&x);
244:   VecGetArray(zz,&z);

246:   idx = a->j;
247:   v   = a->a;
248:   if (usecprow){
249:     mbs  = a->compressedrow.nrows;
250:     ii   = a->compressedrow.i;
251:     ridx = a->compressedrow.rindex;
252:   } else {
253:     mbs = a->mbs;
254:     ii  = a->i;
255:   }

257:   for (i=0; i<mbs; i++) {
258:     n    = ii[1] - ii[0]; ii++;
259:     sum  = 0.0;
260:     while (n--) sum += *v++ * x[*idx++];
261:     if (usecprow){
262:       z[ridx[i]] = sum;
263:     } else {
264:       z[i] = sum;
265:     }
266:   }
267:   VecRestoreArray(xx,&x);
268:   VecRestoreArray(zz,&z);
269:   PetscLogFlops(2*a->nz - mbs);
270:   return(0);
271: }

275: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
276: {
277:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
278:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,*zarray;
279:   PetscScalar    x1,x2;
280:   MatScalar      *v;
282:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
283:   PetscTruth     usecprow=a->compressedrow.use;

286:   VecGetArray(xx,&x);
287:   VecGetArray(zz,&zarray);

289:   idx = a->j;
290:   v   = a->a;
291:   if (usecprow){
292:     mbs  = a->compressedrow.nrows;
293:     ii   = a->compressedrow.i;
294:     ridx = a->compressedrow.rindex;
295:   } else {
296:     mbs = a->mbs;
297:     ii  = a->i;
298:     z   = zarray;
299:   }

301:   for (i=0; i<mbs; i++) {
302:     n  = ii[1] - ii[0]; ii++;
303:     sum1 = 0.0; sum2 = 0.0;
304:     for (j=0; j<n; j++) {
305:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
306:       sum1 += v[0]*x1 + v[2]*x2;
307:       sum2 += v[1]*x1 + v[3]*x2;
308:       v += 4;
309:     }
310:     if (usecprow) z = zarray + 2*ridx[i];
311:     z[0] = sum1; z[1] = sum2;
312:     if (!usecprow) z += 2;
313:   }
314:   VecRestoreArray(xx,&x);
315:   VecRestoreArray(zz,&zarray);
316:   PetscLogFlops(8*a->nz - 2*mbs);
317:   return(0);
318: }

322: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
323: {
324:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
325:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*zarray;
326:   MatScalar      *v;
328:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
329:   PetscTruth     usecprow=a->compressedrow.use;
330: 

332: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
333: #pragma disjoint(*v,*z,*xb)
334: #endif

337:   VecGetArray(xx,&x);
338:   VecGetArray(zz,&zarray);

340:   idx = a->j;
341:   v   = a->a;
342:   if (usecprow){
343:     mbs  = a->compressedrow.nrows;
344:     ii   = a->compressedrow.i;
345:     ridx = a->compressedrow.rindex;
346:   } else {
347:     mbs = a->mbs;
348:     ii  = a->i;
349:     z   = zarray;
350:   }

352:   for (i=0; i<mbs; i++) {
353:     n  = ii[1] - ii[0]; ii++;
354:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
355:     for (j=0; j<n; j++) {
356:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
357:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
358:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
359:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
360:       v += 9;
361:     }
362:     if (usecprow) z = zarray + 3*ridx[i];
363:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
364:     if (!usecprow) z += 3;
365:   }
366:   VecRestoreArray(xx,&x);
367:   VecRestoreArray(zz,&zarray);
368:   PetscLogFlops(18*a->nz - 3*mbs);
369:   return(0);
370: }

374: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
375: {
376:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
377:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
378:   MatScalar      *v;
380:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
381:   PetscTruth     usecprow=a->compressedrow.use;

384:   VecGetArray(xx,&x);
385:   VecGetArray(zz,&zarray);

387:   idx = a->j;
388:   v   = a->a;
389:   if (usecprow){
390:     mbs  = a->compressedrow.nrows;
391:     ii   = a->compressedrow.i;
392:     ridx = a->compressedrow.rindex;
393:   } else {
394:     mbs = a->mbs;
395:     ii  = a->i;
396:     z   = zarray;
397:   }

399:   for (i=0; i<mbs; i++) {
400:     n  = ii[1] - ii[0]; ii++;
401:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
402:     for (j=0; j<n; j++) {
403:       xb = x + 4*(*idx++);
404:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
405:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
406:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
407:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
408:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
409:       v += 16;
410:     }
411:     if (usecprow) z = zarray + 4*ridx[i];
412:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
413:     if (!usecprow) z += 4;
414:   }
415:   VecRestoreArray(xx,&x);
416:   VecRestoreArray(zz,&zarray);
417:   PetscLogFlops(32*a->nz - 4*mbs);
418:   return(0);
419: }

423: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
424: {
425:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
426:   PetscScalar    sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z = 0,*x,*zarray;
427:   MatScalar      *v;
429:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
430:   PetscTruth     usecprow=a->compressedrow.use;

433:   VecGetArray(xx,&x);
434:   VecGetArray(zz,&zarray);

436:   idx = a->j;
437:   v   = a->a;
438:   if (usecprow){
439:     mbs  = a->compressedrow.nrows;
440:     ii   = a->compressedrow.i;
441:     ridx = a->compressedrow.rindex;
442:   } else {
443:     mbs = a->mbs;
444:     ii  = a->i;
445:     z   = zarray;
446:   }

448:   for (i=0; i<mbs; i++) {
449:     n  = ii[1] - ii[0]; ii++;
450:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
451:     for (j=0; j<n; j++) {
452:       xb = x + 5*(*idx++);
453:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
454:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
455:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
456:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
457:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
458:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
459:       v += 25;
460:     }
461:     if (usecprow) z = zarray + 5*ridx[i];
462:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
463:     if (!usecprow) z += 5;
464:   }
465:   VecRestoreArray(xx,&x);
466:   VecRestoreArray(zz,&zarray);
467:   PetscLogFlops(50*a->nz - 5*mbs);
468:   return(0);
469: }


474: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
475: {
476:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
477:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
478:   PetscScalar    x1,x2,x3,x4,x5,x6,*zarray;
479:   MatScalar      *v;
481:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
482:   PetscTruth     usecprow=a->compressedrow.use;

485:   VecGetArray(xx,&x);
486:   VecGetArray(zz,&zarray);

488:   idx = a->j;
489:   v   = a->a;
490:   if (usecprow){
491:     mbs  = a->compressedrow.nrows;
492:     ii   = a->compressedrow.i;
493:     ridx = a->compressedrow.rindex;
494:   } else {
495:     mbs = a->mbs;
496:     ii  = a->i;
497:     z   = zarray;
498:   }

500:   for (i=0; i<mbs; i++) {
501:     n  = ii[1] - ii[0]; ii++;
502:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
503:     for (j=0; j<n; j++) {
504:       xb = x + 6*(*idx++);
505:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
506:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
507:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
508:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
509:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
510:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
511:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
512:       v += 36;
513:     }
514:     if (usecprow) z = zarray + 6*ridx[i];
515:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
516:     if (!usecprow) z += 6;
517:   }

519:   VecRestoreArray(xx,&x);
520:   VecRestoreArray(zz,&zarray);
521:   PetscLogFlops(72*a->nz - 6*mbs);
522:   return(0);
523: }
526: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
527: {
528:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
529:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
530:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*zarray;
531:   MatScalar      *v;
533:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
534:   PetscTruth     usecprow=a->compressedrow.use;

537:   VecGetArray(xx,&x);
538:   VecGetArray(zz,&zarray);

540:   idx = a->j;
541:   v   = a->a;
542:   if (usecprow){
543:     mbs    = a->compressedrow.nrows;
544:     ii     = a->compressedrow.i;
545:     ridx = a->compressedrow.rindex;
546:   } else {
547:     mbs = a->mbs;
548:     ii  = a->i;
549:     z   = zarray;
550:   }

552:   for (i=0; i<mbs; i++) {
553:     n  = ii[1] - ii[0]; ii++;
554:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
555:     for (j=0; j<n; j++) {
556:       xb = x + 7*(*idx++);
557:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
558:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
559:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
560:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
561:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
562:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
563:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
564:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
565:       v += 49;
566:     }
567:     if (usecprow) z = zarray + 7*ridx[i];
568:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
569:     if (!usecprow) z += 7;
570:   }

572:   VecRestoreArray(xx,&x);
573:   VecRestoreArray(zz,&zarray);
574:   PetscLogFlops(98*a->nz - 7*mbs);
575:   return(0);
576: }

578: /*
579:     This will not work with MatScalar == float because it calls the BLAS
580: */
583: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
584: {
585:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
586:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
587:   MatScalar      *v;
589:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
590:   PetscInt       ncols,k,*ridx=PETSC_NULL;
591:   PetscTruth     usecprow=a->compressedrow.use;

594:   VecGetArray(xx,&x);
595:   VecGetArray(zz,&zarray);

597:   idx = a->j;
598:   v   = a->a;
599:   if (usecprow){
600:     mbs  = a->compressedrow.nrows;
601:     ii   = a->compressedrow.i;
602:     ridx = a->compressedrow.rindex;
603:   } else {
604:     mbs = a->mbs;
605:     ii  = a->i;
606:     z   = zarray;
607:   }

609:   if (!a->mult_work) {
610:     k    = PetscMax(A->rmap.n,A->cmap.n);
611:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
612:   }
613:   work = a->mult_work;
614:   for (i=0; i<mbs; i++) {
615:     n     = ii[1] - ii[0]; ii++;
616:     ncols = n*bs;
617:     workt = work;
618:     for (j=0; j<n; j++) {
619:       xb = x + bs*(*idx++);
620:       for (k=0; k<bs; k++) workt[k] = xb[k];
621:       workt += bs;
622:     }
623:     if (usecprow) z = zarray + bs*ridx[i];
624:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
625:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
626:     v += n*bs2;
627:     if (!usecprow) z += bs;
628:   }
629:   VecRestoreArray(xx,&x);
630:   VecRestoreArray(zz,&zarray);
631:   PetscLogFlops(2*a->nz*bs2 - bs*mbs);
632:   return(0);
633: }

638: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
639: {
640:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
641:   PetscScalar    *x,*y = 0,*z = 0,sum,*yarray,*zarray;
642:   MatScalar      *v;
644:   PetscInt       mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
645:   PetscTruth     usecprow=a->compressedrow.use;

648:   VecGetArray(xx,&x);
649:   VecGetArray(yy,&yarray);
650:   if (zz != yy) {
651:     VecGetArray(zz,&zarray);
652:   } else {
653:     zarray = yarray;
654:   }

656:   idx = a->j;
657:   v   = a->a;
658:   if (usecprow){
659:     if (zz != yy){
660:       PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
661:     }
662:     mbs  = a->compressedrow.nrows;
663:     ii   = a->compressedrow.i;
664:     ridx = a->compressedrow.rindex;
665:   } else {
666:     ii  = a->i;
667:     y   = yarray;
668:     z   = zarray;
669:   }

671:   for (i=0; i<mbs; i++) {
672:     n    = ii[1] - ii[0]; ii++;
673:     if (usecprow){
674:       z = zarray + ridx[i];
675:       y = yarray + ridx[i];
676:     }
677:     sum = y[0];
678:     while (n--) sum += *v++ * x[*idx++];
679:     z[0] = sum;
680:     if (!usecprow){
681:       z++; y++;
682:     }
683:   }
684:   VecRestoreArray(xx,&x);
685:   VecRestoreArray(yy,&yarray);
686:   if (zz != yy) {
687:     VecRestoreArray(zz,&zarray);
688:   }
689:   PetscLogFlops(2*a->nz);
690:   return(0);
691: }

695: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
696: {
697:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
698:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
699:   PetscScalar    x1,x2,*yarray,*zarray;
700:   MatScalar      *v;
702:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
703:   PetscTruth     usecprow=a->compressedrow.use;

706:   VecGetArray(xx,&x);
707:   VecGetArray(yy,&yarray);
708:   if (zz != yy) {
709:     VecGetArray(zz,&zarray);
710:   } else {
711:     zarray = yarray;
712:   }

714:   idx = a->j;
715:   v   = a->a;
716:   if (usecprow){
717:     if (zz != yy){
718:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
719:     }
720:     mbs  = a->compressedrow.nrows;
721:     ii   = a->compressedrow.i;
722:     ridx = a->compressedrow.rindex;
723:     if (zz != yy){
724:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
725:     }
726:   } else {
727:     ii  = a->i;
728:     y   = yarray;
729:     z   = zarray;
730:   }

732:   for (i=0; i<mbs; i++) {
733:     n  = ii[1] - ii[0]; ii++;
734:     if (usecprow){
735:       z = zarray + 2*ridx[i];
736:       y = yarray + 2*ridx[i];
737:     }
738:     sum1 = y[0]; sum2 = y[1];
739:     for (j=0; j<n; j++) {
740:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
741:       sum1 += v[0]*x1 + v[2]*x2;
742:       sum2 += v[1]*x1 + v[3]*x2;
743:       v += 4;
744:     }
745:     z[0] = sum1; z[1] = sum2;
746:     if (!usecprow){
747:       z += 2; y += 2;
748:     }
749:   }
750:   VecRestoreArray(xx,&x);
751:   VecRestoreArray(yy,&yarray);
752:   if (zz != yy) {
753:     VecRestoreArray(zz,&zarray);
754:   }
755:   PetscLogFlops(4*a->nz);
756:   return(0);
757: }

761: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
762: {
763:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
764:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
765:   MatScalar      *v;
767:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
768:   PetscTruth     usecprow=a->compressedrow.use;

771:   VecGetArray(xx,&x);
772:   VecGetArray(yy,&yarray);
773:   if (zz != yy) {
774:     VecGetArray(zz,&zarray);
775:   } else {
776:     zarray = yarray;
777:   }

779:   idx = a->j;
780:   v   = a->a;
781:   if (usecprow){
782:     if (zz != yy){
783:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
784:     }
785:     mbs  = a->compressedrow.nrows;
786:     ii   = a->compressedrow.i;
787:     ridx = a->compressedrow.rindex;
788:   } else {
789:     ii  = a->i;
790:     y   = yarray;
791:     z   = zarray;
792:   }

794:   for (i=0; i<mbs; i++) {
795:     n  = ii[1] - ii[0]; ii++;
796:     if (usecprow){
797:       z = zarray + 3*ridx[i];
798:       y = yarray + 3*ridx[i];
799:     }
800:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
801:     for (j=0; j<n; j++) {
802:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
803:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
804:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
805:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
806:       v += 9;
807:     }
808:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
809:     if (!usecprow){
810:       z += 3; y += 3;
811:     }
812:   }
813:   VecRestoreArray(xx,&x);
814:   VecRestoreArray(yy,&yarray);
815:   if (zz != yy) {
816:     VecRestoreArray(zz,&zarray);
817:   }
818:   PetscLogFlops(18*a->nz);
819:   return(0);
820: }

824: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
825: {
826:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
827:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
828:   MatScalar      *v;
830:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
831:   PetscTruth     usecprow=a->compressedrow.use;

834:   VecGetArray(xx,&x);
835:   VecGetArray(yy,&yarray);
836:   if (zz != yy) {
837:     VecGetArray(zz,&zarray);
838:   } else {
839:     zarray = yarray;
840:   }

842:   idx   = a->j;
843:   v     = a->a;
844:   if (usecprow){
845:     if (zz != yy){
846:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
847:     }
848:     mbs  = a->compressedrow.nrows;
849:     ii   = a->compressedrow.i;
850:     ridx = a->compressedrow.rindex;
851:   } else {
852:     ii  = a->i;
853:     y   = yarray;
854:     z   = zarray;
855:   }

857:   for (i=0; i<mbs; i++) {
858:     n  = ii[1] - ii[0]; ii++;
859:     if (usecprow){
860:       z = zarray + 4*ridx[i];
861:       y = yarray + 4*ridx[i];
862:     }
863:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
864:     for (j=0; j<n; j++) {
865:       xb = x + 4*(*idx++);
866:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
867:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
868:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
869:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
870:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
871:       v += 16;
872:     }
873:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
874:     if (!usecprow){
875:       z += 4; y += 4;
876:     }
877:   }
878:   VecRestoreArray(xx,&x);
879:   VecRestoreArray(yy,&yarray);
880:   if (zz != yy) {
881:     VecRestoreArray(zz,&zarray);
882:   }
883:   PetscLogFlops(32*a->nz);
884:   return(0);
885: }

889: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
890: {
891:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
892:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
893:   PetscScalar    *yarray,*zarray;
894:   MatScalar      *v;
896:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
897:   PetscTruth     usecprow=a->compressedrow.use;

900:   VecGetArray(xx,&x);
901:   VecGetArray(yy,&yarray);
902:   if (zz != yy) {
903:     VecGetArray(zz,&zarray);
904:   } else {
905:     zarray = yarray;
906:   }

908:   idx = a->j;
909:   v   = a->a;
910:   if (usecprow){
911:     if (zz != yy){
912:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
913:     }
914:     mbs  = a->compressedrow.nrows;
915:     ii   = a->compressedrow.i;
916:     ridx = a->compressedrow.rindex;
917:   } else {
918:     ii  = a->i;
919:     y   = yarray;
920:     z   = zarray;
921:   }

923:   for (i=0; i<mbs; i++) {
924:     n  = ii[1] - ii[0]; ii++;
925:     if (usecprow){
926:       z = zarray + 5*ridx[i];
927:       y = yarray + 5*ridx[i];
928:     }
929:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
930:     for (j=0; j<n; j++) {
931:       xb = x + 5*(*idx++);
932:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
933:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
934:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
935:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
936:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
937:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
938:       v += 25;
939:     }
940:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
941:     if (!usecprow){
942:       z += 5; y += 5;
943:     }
944:   }
945:   VecRestoreArray(xx,&x);
946:   VecRestoreArray(yy,&yarray);
947:   if (zz != yy) {
948:     VecRestoreArray(zz,&zarray);
949:   }
950:   PetscLogFlops(50*a->nz);
951:   return(0);
952: }
955: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
956: {
957:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
958:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
959:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
960:   MatScalar      *v;
962:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
963:   PetscTruth     usecprow=a->compressedrow.use;

966:   VecGetArray(xx,&x);
967:   VecGetArray(yy,&yarray);
968:   if (zz != yy) {
969:     VecGetArray(zz,&zarray);
970:   } else {
971:     zarray = yarray;
972:   }

974:   idx = a->j;
975:   v   = a->a;
976:   if (usecprow){
977:     if (zz != yy){
978:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
979:     }
980:     mbs  = a->compressedrow.nrows;
981:     ii   = a->compressedrow.i;
982:     ridx = a->compressedrow.rindex;
983:   } else {
984:     ii  = a->i;
985:     y   = yarray;
986:     z   = zarray;
987:   }

989:   for (i=0; i<mbs; i++) {
990:     n  = ii[1] - ii[0]; ii++;
991:     if (usecprow){
992:       z = zarray + 6*ridx[i];
993:       y = yarray + 6*ridx[i];
994:     }
995:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
996:     for (j=0; j<n; j++) {
997:       xb = x + 6*(*idx++);
998:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
999:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1000:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1001:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1002:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1003:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1004:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1005:       v += 36;
1006:     }
1007:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1008:     if (!usecprow){
1009:       z += 6; y += 6;
1010:     }
1011:   }
1012:   VecRestoreArray(xx,&x);
1013:   VecRestoreArray(yy,&yarray);
1014:   if (zz != yy) {
1015:     VecRestoreArray(zz,&zarray);
1016:   }
1017:   PetscLogFlops(72*a->nz);
1018:   return(0);
1019: }

1023: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1024: {
1025:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1026:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1027:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1028:   MatScalar      *v;
1030:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1031:   PetscTruth     usecprow=a->compressedrow.use;

1034:   VecGetArray(xx,&x);
1035:   VecGetArray(yy,&yarray);
1036:   if (zz != yy) {
1037:     VecGetArray(zz,&zarray);
1038:   } else {
1039:     zarray = yarray;
1040:   }

1042:   idx = a->j;
1043:   v   = a->a;
1044:   if (usecprow){
1045:     if (zz != yy){
1046:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1047:     }
1048:     mbs  = a->compressedrow.nrows;
1049:     ii   = a->compressedrow.i;
1050:     ridx = a->compressedrow.rindex;
1051:   } else {
1052:     ii  = a->i;
1053:     y   = yarray;
1054:     z   = zarray;
1055:   }

1057:   for (i=0; i<mbs; i++) {
1058:     n  = ii[1] - ii[0]; ii++;
1059:     if (usecprow){
1060:       z = zarray + 7*ridx[i];
1061:       y = yarray + 7*ridx[i];
1062:     }
1063:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1064:     for (j=0; j<n; j++) {
1065:       xb = x + 7*(*idx++);
1066:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1067:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1068:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1069:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1070:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1071:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1072:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1073:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1074:       v += 49;
1075:     }
1076:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1077:     if (!usecprow){
1078:       z += 7; y += 7;
1079:     }
1080:   }
1081:   VecRestoreArray(xx,&x);
1082:   VecRestoreArray(yy,&yarray);
1083:   if (zz != yy) {
1084:     VecRestoreArray(zz,&zarray);
1085:   }
1086:   PetscLogFlops(98*a->nz);
1087:   return(0);
1088: }

1092: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1093: {
1094:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1095:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1096:   MatScalar      *v;
1098:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
1099:   PetscInt       ncols,k,*ridx=PETSC_NULL;
1100:   PetscTruth     usecprow=a->compressedrow.use;

1103:   VecCopy_Seq(yy,zz);
1104:   VecGetArray(xx,&x);
1105:   VecGetArray(zz,&zarray);

1107:   idx = a->j;
1108:   v   = a->a;
1109:   if (usecprow){
1110:     mbs    = a->compressedrow.nrows;
1111:     ii     = a->compressedrow.i;
1112:     ridx = a->compressedrow.rindex;
1113:   } else {
1114:     mbs = a->mbs;
1115:     ii  = a->i;
1116:     z   = zarray;
1117:   }

1119:   if (!a->mult_work) {
1120:     k    = PetscMax(A->rmap.n,A->cmap.n);
1121:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1122:   }
1123:   work = a->mult_work;
1124:   for (i=0; i<mbs; i++) {
1125:     n     = ii[1] - ii[0]; ii++;
1126:     ncols = n*bs;
1127:     workt = work;
1128:     for (j=0; j<n; j++) {
1129:       xb = x + bs*(*idx++);
1130:       for (k=0; k<bs; k++) workt[k] = xb[k];
1131:       workt += bs;
1132:     }
1133:     if (usecprow) z = zarray + bs*ridx[i];
1134:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1135:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1136:     v += n*bs2;
1137:     if (!usecprow){
1138:       z += bs;
1139:     }
1140:   }
1141:   VecRestoreArray(xx,&x);
1142:   VecRestoreArray(zz,&zarray);
1143:   PetscLogFlops(2*a->nz*bs2);
1144:   return(0);
1145: }

1149: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1150: {
1151:   PetscScalar    zero = 0.0;
1153: #if defined (PETSC_USE_LOG)
1154:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1155: #endif

1158:   VecSet(zz,zero);
1159:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);

1161:   PetscLogFlops(2*a->nz*a->bs2 - A->cmap.n);
1162:   return(0);
1163: }

1167: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1169: {
1170:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1171:   PetscScalar    *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1172:   MatScalar      *v;
1174:   PetscInt       mbs,i,*idx,*ii,rval,bs=A->rmap.bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1175:   Mat_CompressedRow cprow = a->compressedrow;
1176:   PetscTruth        usecprow=cprow.use;

1179:   if (yy != zz) { VecCopy_Seq(yy,zz); }
1180:   VecGetArray(xx,&x);
1181:   VecGetArray(zz,&z);

1183:   idx = a->j;
1184:   v   = a->a;
1185:   if (usecprow){
1186:     mbs  = cprow.nrows;
1187:     ii   = cprow.i;
1188:     ridx = cprow.rindex;
1189:   } else {
1190:     mbs=a->mbs;
1191:     ii = a->i;
1192:     xb = x;
1193:   }

1195:   switch (bs) {
1196:   case 1:
1197:     for (i=0; i<mbs; i++) {
1198:       if (usecprow) xb = x + ridx[i];
1199:       x1 = xb[0];
1200:       ib = idx + ii[0];
1201:       n  = ii[1] - ii[0]; ii++;
1202:       for (j=0; j<n; j++) {
1203:         rval    = ib[j];
1204:         z[rval] += *v * x1;
1205:         v++;
1206:       }
1207:       if (!usecprow) xb++;
1208:     }
1209:     break;
1210:   case 2:
1211:     for (i=0; i<mbs; i++) {
1212:       if (usecprow) xb = x + 2*ridx[i];
1213:       x1 = xb[0]; x2 = xb[1];
1214:       ib = idx + ii[0];
1215:       n  = ii[1] - ii[0]; ii++;
1216:       for (j=0; j<n; j++) {
1217:         rval      = ib[j]*2;
1218:         z[rval++] += v[0]*x1 + v[1]*x2;
1219:         z[rval++] += v[2]*x1 + v[3]*x2;
1220:         v  += 4;
1221:       }
1222:       if (!usecprow) xb += 2;
1223:     }
1224:     break;
1225:   case 3:
1226:     for (i=0; i<mbs; i++) {
1227:       if (usecprow) xb = x + 3*ridx[i];
1228:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1229:       ib = idx + ii[0];
1230:       n  = ii[1] - ii[0]; ii++;
1231:       for (j=0; j<n; j++) {
1232:         rval      = ib[j]*3;
1233:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1234:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1235:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1236:         v  += 9;
1237:       }
1238:       if (!usecprow) xb += 3;
1239:     }
1240:     break;
1241:   case 4:
1242:     for (i=0; i<mbs; i++) {
1243:       if (usecprow) xb = x + 4*ridx[i];
1244:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1245:       ib = idx + ii[0];
1246:       n  = ii[1] - ii[0]; ii++;
1247:       for (j=0; j<n; j++) {
1248:         rval      = ib[j]*4;
1249:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1250:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1251:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1252:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1253:         v  += 16;
1254:       }
1255:       if (!usecprow) xb += 4;
1256:     }
1257:     break;
1258:   case 5:
1259:     for (i=0; i<mbs; i++) {
1260:       if (usecprow) xb = x + 5*ridx[i];
1261:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1262:       x4 = xb[3]; x5 = xb[4];
1263:       ib = idx + ii[0];
1264:       n  = ii[1] - ii[0]; ii++;
1265:       for (j=0; j<n; j++) {
1266:         rval      = ib[j]*5;
1267:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1268:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1269:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1270:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1271:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1272:         v  += 25;
1273:       }
1274:       if (!usecprow) xb += 5;
1275:     }
1276:     break;
1277:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1278:       PetscInt     ncols,k;
1279:       PetscScalar  *work,*workt,*xtmp;

1281:       if (!a->mult_work) {
1282:         k = PetscMax(A->rmap.n,A->cmap.n);
1283:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1284:       }
1285:       work = a->mult_work;
1286:       xtmp = x;
1287:       for (i=0; i<mbs; i++) {
1288:         n     = ii[1] - ii[0]; ii++;
1289:         ncols = n*bs;
1290:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1291:         if (usecprow) {
1292:           xtmp = x + bs*ridx[i];
1293:         }
1294:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1295:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1296:         v += n*bs2;
1297:         if (!usecprow) xtmp += bs;
1298:         workt = work;
1299:         for (j=0; j<n; j++) {
1300:           zb = z + bs*(*idx++);
1301:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1302:           workt += bs;
1303:         }
1304:       }
1305:     }
1306:   }
1307:   VecRestoreArray(xx,&x);
1308:   VecRestoreArray(zz,&z);
1309:   PetscLogFlops(2*a->nz*a->bs2);
1310:   return(0);
1311: }

1315: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1316: {
1317:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)inA->data;
1318:   PetscInt     totalnz = a->bs2*a->nz;
1319:   PetscScalar  oalpha = alpha;
1321: #if defined(PETSC_USE_MAT_SINGLE)
1322:   PetscInt     i;
1323: #else
1324:   PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1325: #endif

1328: #if defined(PETSC_USE_MAT_SINGLE)
1329:   for (i=0; i<totalnz; i++) a->a[i] *= oalpha;
1330: #else
1331:   BLASscal_(&tnz,&oalpha,a->a,&one);
1332: #endif
1333:   PetscLogFlops(totalnz);
1334:   return(0);
1335: }

1339: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1340: {
1342:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1343:   MatScalar      *v = a->a;
1344:   PetscReal      sum = 0.0;
1345:   PetscInt       i,j,k,bs=A->rmap.bs,nz=a->nz,bs2=a->bs2,k1;

1348:   if (type == NORM_FROBENIUS) {
1349:     for (i=0; i< bs2*nz; i++) {
1350: #if defined(PETSC_USE_COMPLEX)
1351:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1352: #else
1353:       sum += (*v)*(*v); v++;
1354: #endif
1355:     }
1356:     *norm = sqrt(sum);
1357:   } else if (type == NORM_1) { /* maximum column sum */
1358:     PetscReal *tmp;
1359:     PetscInt  *bcol = a->j;
1360:     PetscMalloc((A->cmap.n+1)*sizeof(PetscReal),&tmp);
1361:     PetscMemzero(tmp,A->cmap.n*sizeof(PetscReal));
1362:     for (i=0; i<nz; i++){
1363:       for (j=0; j<bs; j++){
1364:         k1 = bs*(*bcol) + j; /* column index */
1365:         for (k=0; k<bs; k++){
1366:           tmp[k1] += PetscAbsScalar(*v); v++;
1367:         }
1368:       }
1369:       bcol++;
1370:     }
1371:     *norm = 0.0;
1372:     for (j=0; j<A->cmap.n; j++) {
1373:       if (tmp[j] > *norm) *norm = tmp[j];
1374:     }
1375:     PetscFree(tmp);
1376:   } else if (type == NORM_INFINITY) { /* maximum row sum */
1377:     *norm = 0.0;
1378:     for (k=0; k<bs; k++) {
1379:       for (j=0; j<a->mbs; j++) {
1380:         v = a->a + bs2*a->i[j] + k;
1381:         sum = 0.0;
1382:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1383:           for (k1=0; k1<bs; k1++){
1384:             sum += PetscAbsScalar(*v);
1385:             v   += bs;
1386:           }
1387:         }
1388:         if (sum > *norm) *norm = sum;
1389:       }
1390:     }
1391:   } else {
1392:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1393:   }
1394:   return(0);
1395: }


1400: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1401: {
1402:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;

1406:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1407:   if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1408:     *flg = PETSC_FALSE;
1409:     return(0);
1410:   }
1411: 
1412:   /* if the a->i are the same */
1413:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1414:   if (!*flg) {
1415:     return(0);
1416:   }
1417: 
1418:   /* if a->j are the same */
1419:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1420:   if (!*flg) {
1421:     return(0);
1422:   }
1423:   /* if a->a are the same */
1424:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(B->rmap.bs)*sizeof(PetscScalar),flg);
1425:   return(0);
1426: 
1427: }

1431: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1432: {
1433:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1435:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1436:   PetscScalar    *x,zero = 0.0;
1437:   MatScalar      *aa,*aa_j;

1440:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1441:   bs   = A->rmap.bs;
1442:   aa   = a->a;
1443:   ai   = a->i;
1444:   aj   = a->j;
1445:   ambs = a->mbs;
1446:   bs2  = a->bs2;

1448:   VecSet(v,zero);
1449:   VecGetArray(v,&x);
1450:   VecGetLocalSize(v,&n);
1451:   if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1452:   for (i=0; i<ambs; i++) {
1453:     for (j=ai[i]; j<ai[i+1]; j++) {
1454:       if (aj[j] == i) {
1455:         row  = i*bs;
1456:         aa_j = aa+j*bs2;
1457:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1458:         break;
1459:       }
1460:     }
1461:   }
1462:   VecRestoreArray(v,&x);
1463:   return(0);
1464: }

1468: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1469: {
1470:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1471:   PetscScalar    *l,*r,x,*li,*ri;
1472:   MatScalar      *aa,*v;
1474:   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1477:   ai  = a->i;
1478:   aj  = a->j;
1479:   aa  = a->a;
1480:   m   = A->rmap.n;
1481:   n   = A->cmap.n;
1482:   bs  = A->rmap.bs;
1483:   mbs = a->mbs;
1484:   bs2 = a->bs2;
1485:   if (ll) {
1486:     VecGetArray(ll,&l);
1487:     VecGetLocalSize(ll,&lm);
1488:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1489:     for (i=0; i<mbs; i++) { /* for each block row */
1490:       M  = ai[i+1] - ai[i];
1491:       li = l + i*bs;
1492:       v  = aa + bs2*ai[i];
1493:       for (j=0; j<M; j++) { /* for each block */
1494:         for (k=0; k<bs2; k++) {
1495:           (*v++) *= li[k%bs];
1496:         }
1497:       }
1498:     }
1499:     VecRestoreArray(ll,&l);
1500:     PetscLogFlops(a->nz);
1501:   }
1502: 
1503:   if (rr) {
1504:     VecGetArray(rr,&r);
1505:     VecGetLocalSize(rr,&rn);
1506:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1507:     for (i=0; i<mbs; i++) { /* for each block row */
1508:       M  = ai[i+1] - ai[i];
1509:       v  = aa + bs2*ai[i];
1510:       for (j=0; j<M; j++) { /* for each block */
1511:         ri = r + bs*aj[ai[i]+j];
1512:         for (k=0; k<bs; k++) {
1513:           x = ri[k];
1514:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1515:         }
1516:       }
1517:     }
1518:     VecRestoreArray(rr,&r);
1519:     PetscLogFlops(a->nz);
1520:   }
1521:   return(0);
1522: }


1527: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1528: {
1529:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1532:   info->rows_global    = (double)A->rmap.N;
1533:   info->columns_global = (double)A->cmap.n;
1534:   info->rows_local     = (double)A->rmap.n;
1535:   info->columns_local  = (double)A->cmap.n;
1536:   info->block_size     = a->bs2;
1537:   info->nz_allocated   = a->maxnz;
1538:   info->nz_used        = a->bs2*a->nz;
1539:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1540:   info->assemblies   = A->num_ass;
1541:   info->mallocs      = a->reallocs;
1542:   info->memory       = A->mem;
1543:   if (A->factor) {
1544:     info->fill_ratio_given  = A->info.fill_ratio_given;
1545:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1546:     info->factor_mallocs    = A->info.factor_mallocs;
1547:   } else {
1548:     info->fill_ratio_given  = 0;
1549:     info->fill_ratio_needed = 0;
1550:     info->factor_mallocs    = 0;
1551:   }
1552:   return(0);
1553: }


1558: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1559: {
1560:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1564:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1565:   return(0);
1566: }