Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3:   and to extract submatrices that are distributed across processors.
4: */
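/*
   Illustrative usage (a sketch, not part of this file): these routines back the public
   MatIncreaseOverlap() and MatCreateSubMatrices() entry points for MATMPIAIJ matrices.
   Assuming an existing parallel AIJ matrix A and an array of index sets is[] describing
   local blocks (as PCASM builds them), a caller does roughly

     MatIncreaseOverlap(A,nloc,is,overlap);                           // dispatches to MatIncreaseOverlap_MPIAIJ()
     MatCreateSubMatrices(A,nloc,is,is,MAT_INITIAL_MATRIX,&submats);  // extracts the (overlapped) blocks

   where nloc, overlap and submats are placeholder names used only in this sketch.
*/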
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
22: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23: {
25: PetscInt i;
28: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: for (i=0; i<ov; ++i) {
30: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
31: }
32: return(0);
33: }
35: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36: {
38: PetscInt i;
41: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42: for (i=0; i<ov; ++i) {
43: MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
44: }
45: return(0);
46: }
49: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50: {
52: MPI_Comm comm;
53: PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
54: PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55: PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL;
56: const PetscInt *indices_i,**indices;
57: PetscLayout rmap;
58: PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner;
59: PetscSF sf;
60: PetscSFNode *remote;
63: PetscObjectGetComm((PetscObject)mat,&comm);
64: MPI_Comm_rank(comm,&rank);
65: MPI_Comm_size(comm,&size);
66: /* get row map to determine where rows should be going */
67: MatGetLayouts(mat,&rmap,NULL);
68:   /* retrieve IS data and put it all together so that we
69:    * can optimize communication
70:    */
71: PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
72: for (i=0,tlength=0; i<nidx; i++){
73: ISGetLocalSize(is[i],&length[i]);
74: tlength += length[i];
75: ISGetIndices(is[i],&indices[i]);
76: }
77: /* find these rows on remote processors */
78: PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
79: PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
80: nrrows = 0;
81: for (i=0; i<nidx; i++) {
82: length_i = length[i];
83: indices_i = indices[i];
84: for (j=0; j<length_i; j++) {
85: owner = -1;
86: PetscLayoutFindOwner(rmap,indices_i[j],&owner);
87: /* remote processors */
88: if (owner != rank) {
89: tosizes_temp[owner]++; /* number of rows to owner */
90: rrow_ranks[nrrows] = owner; /* processor */
91: rrow_isids[nrrows] = i; /* is id */
92: remoterows[nrrows++] = indices_i[j]; /* row */
93: }
94: }
95: ISRestoreIndices(is[i],&indices[i]);
96: }
97: PetscFree2(*(PetscInt***)&indices,length);
98: /* test if we need to exchange messages
99: * generally speaking, we do not need to exchange
100: * data when overlap is 1
101: * */
102: MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
103: /* we do not have any messages
104: * It usually corresponds to overlap 1
105: * */
106: if (!reducednrrows) {
107: PetscFree3(toranks,tosizes,tosizes_temp);
108: PetscFree3(remoterows,rrow_ranks,rrow_isids);
109: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
110: return(0);
111: }
112: nto = 0;
113:   /* send sizes and ranks for building a two-sided communication */
114: for (i=0; i<size; i++) {
115: if (tosizes_temp[i]) {
116: tosizes[nto*2] = tosizes_temp[i]*2; /* size */
117: tosizes_temp[i] = nto; /* a map from processor to index */
118: toranks[nto++] = i; /* processor */
119: }
120: }
121: PetscMalloc1(nto+1,&toffsets);
122: toffsets[0] = 0;
123: for (i=0; i<nto; i++) {
124: toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */
125: tosizes[2*i+1] = toffsets[i]; /* offsets to send */
126: }
127: /* send information to other processors */
128: PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
129: nrecvrows = 0;
130: for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
131: PetscMalloc1(nrecvrows,&remote);
132: nrecvrows = 0;
133: for (i=0; i<nfrom; i++) {
134: for (j=0; j<fromsizes[2*i]; j++) {
135: remote[nrecvrows].rank = fromranks[i];
136: remote[nrecvrows++].index = fromsizes[2*i+1]+j;
137: }
138: }
139: PetscSFCreate(comm,&sf);
140: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
141:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
142: PetscSFSetType(sf,PETSCSFBASIC);
143: PetscSFSetFromOptions(sf);
144:   /* message pair <index set id, row> */
145: PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
146: for (i=0; i<nrrows; i++) {
147: owner = rrow_ranks[i]; /* processor */
148: j = tosizes_temp[owner]; /* index */
149: todata[toffsets[j]++] = rrow_isids[i];
150: todata[toffsets[j]++] = remoterows[i];
151: }
152: PetscFree3(toranks,tosizes,tosizes_temp);
153: PetscFree3(remoterows,rrow_ranks,rrow_isids);
154: PetscFree(toffsets);
155: PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
156: PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
157: PetscSFDestroy(&sf);
158:   /* compute the overlap for the locally owned rows requested by remote processors so the data can be sent back */
159: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
160: PetscFree2(todata,fromdata);
161: PetscFree(fromsizes);
162: PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
163: PetscFree(fromranks);
164: nrecvrows = 0;
165: for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
166: PetscCalloc1(nrecvrows,&todata);
167: PetscMalloc1(nrecvrows,&remote);
168: nrecvrows = 0;
169: for (i=0; i<nto; i++) {
170: for (j=0; j<tosizes[2*i]; j++) {
171: remote[nrecvrows].rank = toranks[i];
172: remote[nrecvrows++].index = tosizes[2*i+1]+j;
173: }
174: }
175: PetscSFCreate(comm,&sf);
176: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
177:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
178: PetscSFSetType(sf,PETSCSFBASIC);
179: PetscSFSetFromOptions(sf);
180: /* overlap communication and computation */
181: PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
182: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
183: PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
184: PetscSFDestroy(&sf);
185: PetscFree2(sbdata,sbsizes);
186: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
187: PetscFree(toranks);
188: PetscFree(tosizes);
189: PetscFree(todata);
190: return(0);
191: }
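/*
   Communication pattern used by MatIncreaseOverlap_MPIAIJ_Once_Scalable() above, as a
   minimal sketch: ranks first discover who sends to whom (and how much) with
   PetscCommBuildTwoSided(), then move the payload with a PetscSF broadcast. Sizes are
   packed as pairs <length,offset> so the receiving side can build its star-forest graph:

     PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
     PetscSFCreate(comm,&sf);
     PetscSFSetGraph(sf,nleaves,nleaves,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
     PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFDestroy(&sf);

   Here nleaves, remote, rootdata and leafdata stand for the buffers built in the routine.
*/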
193: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
194: {
195: PetscInt *isz,isz_i,i,j,is_id, data_size;
196: PetscInt col,lsize,max_lsize,*indices_temp, *indices_i;
197: const PetscInt *indices_i_temp;
198: MPI_Comm *iscomms;
199: PetscErrorCode ierr;
202: max_lsize = 0;
203: PetscMalloc1(nidx,&isz);
204: for (i=0; i<nidx; i++){
205: ISGetLocalSize(is[i],&lsize);
206: max_lsize = lsize>max_lsize ? lsize:max_lsize;
207: isz[i] = lsize;
208: }
209: PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);
210: for (i=0; i<nidx; i++){
211: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
212: ISGetIndices(is[i],&indices_i_temp);
213: PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);
214: ISRestoreIndices(is[i],&indices_i_temp);
215: ISDestroy(&is[i]);
216: }
217: /* retrieve information to get row id and its overlap */
218: for (i=0; i<nrecvs;){
219: is_id = recvdata[i++];
220: data_size = recvdata[i++];
221: indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
222: isz_i = isz[is_id];
223: for (j=0; j< data_size; j++){
224: col = recvdata[i++];
225: indices_i[isz_i++] = col;
226: }
227: isz[is_id] = isz_i;
228: }
229:   /* remove duplicate entries */
230: for (i=0; i<nidx; i++){
231: indices_i = indices_temp+(max_lsize+nrecvs)*i;
232: isz_i = isz[i];
233: PetscSortRemoveDupsInt(&isz_i,indices_i);
234: ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
235: PetscCommDestroy(&iscomms[i]);
236: }
237: PetscFree(isz);
238: PetscFree2(indices_temp,iscomms);
239: return(0);
240: }
242: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
243: {
244: PetscLayout rmap,cmap;
245: PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
246: PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
247: PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
248: const PetscInt *gcols,*ai,*aj,*bi,*bj;
249: Mat amat,bmat;
250: PetscMPIInt rank;
251: PetscBool done;
252: MPI_Comm comm;
253: PetscErrorCode ierr;
256: PetscObjectGetComm((PetscObject)mat,&comm);
257: MPI_Comm_rank(comm,&rank);
258: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
259:   /* Even if the matrix is symmetric, we treat it as nonsymmetric here */
260: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
261:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
263:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
264: /* total number of nonzero values is used to estimate the memory usage in the next step */
265: tnz = ai[an]+bi[bn];
266: MatGetLayouts(mat,&rmap,&cmap);
267: PetscLayoutGetRange(rmap,&rstart,NULL);
268: PetscLayoutGetRange(cmap,&cstart,NULL);
269: /* to find the longest message */
270: max_fszs = 0;
271: for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
272:   /* is there a better way to estimate the number of nonzeros in the matrix? */
273: PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
274: for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
275: rows_pos = 0;
276: totalrows = 0;
277: for (i=0; i<nfrom; i++){
278: PetscArrayzero(rows_pos_i,nidx);
279: /* group data together */
280: for (j=0; j<fromsizes[2*i]; j+=2){
281: is_id = fromrows[rows_pos++];/* no of is */
282: rows_i = rows_data[is_id];
283: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
284: }
285: /* estimate a space to avoid multiple allocations */
286: for (j=0; j<nidx; j++){
287: indvc_ij = 0;
288: rows_i = rows_data[j];
289: for (l=0; l<rows_pos_i[j]; l++){
290: row = rows_i[l]-rstart;
291: start = ai[row];
292: end = ai[row+1];
293: for (k=start; k<end; k++){ /* Amat */
294: col = aj[k] + cstart;
295: indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
296: }
297: start = bi[row];
298: end = bi[row+1];
299: for (k=start; k<end; k++) { /* Bmat */
300: col = gcols[bj[k]];
301: indices_tmp[indvc_ij++] = col;
302: }
303: }
304: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
305: indv_counts[i*nidx+j] = indvc_ij;
306: totalrows += indvc_ij;
307: }
308: }
309:   /* message triple <index set id, number of rows, rows> */
310: PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
311: totalrows = 0;
312: rows_pos = 0;
313:   /* repeat the grouping pass above, this time filling the send buffer */
314: for (i=0;i<nfrom;i++){
315: PetscArrayzero(rows_pos_i,nidx);
316: for (j=0; j<fromsizes[2*i]; j+=2){
317: is_id = fromrows[rows_pos++];
318: rows_i = rows_data[is_id];
319: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
320: }
321: /* add data */
322: for (j=0; j<nidx; j++){
323: if (!indv_counts[i*nidx+j]) continue;
324: indvc_ij = 0;
325: sbdata[totalrows++] = j;
326: sbdata[totalrows++] = indv_counts[i*nidx+j];
327: sbsizes[2*i] += 2;
328: rows_i = rows_data[j];
329: for (l=0; l<rows_pos_i[j]; l++){
330: row = rows_i[l]-rstart;
331: start = ai[row];
332: end = ai[row+1];
333: for (k=start; k<end; k++){ /* Amat */
334: col = aj[k] + cstart;
335: indices_tmp[indvc_ij++] = col;
336: }
337: start = bi[row];
338: end = bi[row+1];
339: for (k=start; k<end; k++) { /* Bmat */
340: col = gcols[bj[k]];
341: indices_tmp[indvc_ij++] = col;
342: }
343: }
344: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
345: sbsizes[2*i] += indvc_ij;
346: PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);
347: totalrows += indvc_ij;
348: }
349: }
350: PetscMalloc1(nfrom+1,&offsets);
351: offsets[0] = 0;
352: for (i=0; i<nfrom; i++){
353: offsets[i+1] = offsets[i] + sbsizes[2*i];
354: sbsizes[2*i+1] = offsets[i];
355: }
356: PetscFree(offsets);
357: if (sbrowsizes) *sbrowsizes = sbsizes;
358: if (sbrows) *sbrows = sbdata;
359: PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
360: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
361: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
362: return(0);
363: }
365: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
366: {
367: const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
368: PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
369: PetscInt lsize,lsize_tmp;
370: PetscMPIInt rank,owner;
371: Mat amat,bmat;
372: PetscBool done;
373: PetscLayout cmap,rmap;
374: MPI_Comm comm;
375: PetscErrorCode ierr;
378: PetscObjectGetComm((PetscObject)mat,&comm);
379: MPI_Comm_rank(comm,&rank);
380: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
381: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
382:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
384:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
385:   /* is this a safe way to compute the number of nonzero values? */
386: tnz = ai[an]+bi[bn];
387: MatGetLayouts(mat,&rmap,&cmap);
388: PetscLayoutGetRange(rmap,&rstart,NULL);
389: PetscLayoutGetRange(cmap,&cstart,NULL);
390:   /* this estimates memory better than the old implementation,
391:    * which used the global size of the matrix
392:    */
393: PetscMalloc1(tnz,&indices_temp);
394: for (i=0; i<nidx; i++) {
395: MPI_Comm iscomm;
397: ISGetLocalSize(is[i],&lsize);
398: ISGetIndices(is[i],&indices);
399: lsize_tmp = 0;
400: for (j=0; j<lsize; j++) {
401: owner = -1;
402: row = indices[j];
403: PetscLayoutFindOwner(rmap,row,&owner);
404: if (owner != rank) continue;
405: /* local number */
406: row -= rstart;
407: start = ai[row];
408: end = ai[row+1];
409: for (k=start; k<end; k++) { /* Amat */
410: col = aj[k] + cstart;
411: indices_temp[lsize_tmp++] = col;
412: }
413: start = bi[row];
414: end = bi[row+1];
415: for (k=start; k<end; k++) { /* Bmat */
416: col = gcols[bj[k]];
417: indices_temp[lsize_tmp++] = col;
418: }
419: }
420: ISRestoreIndices(is[i],&indices);
421: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);
422: ISDestroy(&is[i]);
423: PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
424: ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
425: PetscCommDestroy(&iscomm);
426: }
427: PetscFree(indices_temp);
428: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
429: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
430: return(0);
431: }
434: /*
435: Sample message format:
436: If a processor A wants processor B to process some elements corresponding
437: to index sets is[1],is[5]
438: mesg [0] = 2 (no of index sets in the mesg)
439: -----------
440: mesg [1] = 1 => is[1]
441: mesg [2] = sizeof(is[1]);
442: -----------
443: mesg [3] = 5 => is[5]
444: mesg [4] = sizeof(is[5]);
445: -----------
446:   mesg [5]
447:   mesg [n]   data(is[1])
448:   -----------
449:   mesg [n+1]
450:   mesg [m]   data(is[5])
451:   -----------
453: Notes:
454: nrqs - no of requests sent (or to be sent out)
455: nrqr - no of requests received (which have to be or which have been processed)
456: */
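/*
   Concrete instance of the format above (illustrative only): if processor A asks B to
   process 3 rows for is[1] and 2 rows for is[5], the message is

     mesg = { 2,  1, 3,  5, 2,  r10, r11, r12,  r50, r51 }

   where r10,r11,r12 are the row indices for is[1] and r50,r51 those for is[5], so the
   header occupies 1 + 2*(no of index sets) entries followed by the data.
*/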
457: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
458: {
459: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
460: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
461: const PetscInt **idx,*idx_i;
462: PetscInt *n,**data,len;
463: #if defined(PETSC_USE_CTABLE)
464: PetscTable *table_data,table_data_i;
465: PetscInt *tdata,tcount,tcount_max;
466: #else
467: PetscInt *data_i,*d_p;
468: #endif
470: PetscMPIInt size,rank,tag1,tag2,proc = 0;
471: PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
472: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
473: PetscBT *table;
474: MPI_Comm comm;
475: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476: MPI_Status *s_status,*recv_status;
477: MPI_Comm *iscomms;
478: char *t_p;
481: PetscObjectGetComm((PetscObject)C,&comm);
482: size = c->size;
483: rank = c->rank;
484: M = C->rmap->N;
486: PetscObjectGetNewTag((PetscObject)C,&tag1);
487: PetscObjectGetNewTag((PetscObject)C,&tag2);
489: PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);
491: for (i=0; i<imax; i++) {
492: ISGetIndices(is[i],&idx[i]);
493: ISGetLocalSize(is[i],&n[i]);
494: }
496:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
497: required. Based on this, buffers are allocated, and data copied into them */
498: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
499: for (i=0; i<imax; i++) {
500: PetscArrayzero(w4,size); /* initialise work vector*/
501: idx_i = idx[i];
502: len = n[i];
503: for (j=0; j<len; j++) {
504: row = idx_i[j];
505: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
506: PetscLayoutFindOwner(C->rmap,row,&proc);
507: w4[proc]++;
508: }
509: for (j=0; j<size; j++) {
510: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
511: }
512: }
514: nrqs = 0; /* no of outgoing messages */
515:   msz  = 0;              /* total mesg length (for all procs) */
516:   w1[rank] = 0;              /* no mesg sent to itself */
517: w3[rank] = 0;
518: for (i=0; i<size; i++) {
519: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
520: }
521:   /* pa - the list of processors to communicate with */
522: PetscMalloc1(nrqs+1,&pa);
523: for (i=0,j=0; i<size; i++) {
524: if (w1[i]) {pa[j] = i; j++;}
525: }
527: /* Each message would have a header = 1 + 2*(no of IS) + data */
528: for (i=0; i<nrqs; i++) {
529: j = pa[i];
530: w1[j] += w2[j] + 2*w3[j];
531: msz += w1[j];
532: }
534:   /* Determine the number of messages to expect, their lengths, and their from-ids */
535: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
536: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
538: /* Now post the Irecvs corresponding to these messages */
539: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
541: /* Allocate Memory for outgoing messages */
542: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
543: PetscArrayzero(outdat,size);
544: PetscArrayzero(ptr,size);
546: {
547: PetscInt *iptr = tmp,ict = 0;
548: for (i=0; i<nrqs; i++) {
549: j = pa[i];
550: iptr += ict;
551: outdat[j] = iptr;
552: ict = w1[j];
553: }
554: }
556: /* Form the outgoing messages */
557: /* plug in the headers */
558: for (i=0; i<nrqs; i++) {
559: j = pa[i];
560: outdat[j][0] = 0;
561: PetscArrayzero(outdat[j]+1,2*w3[j]);
562: ptr[j] = outdat[j] + 2*w3[j] + 1;
563: }
565: /* Memory for doing local proc's work */
566: {
567: PetscInt M_BPB_imax = 0;
568: #if defined(PETSC_USE_CTABLE)
569: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
570: PetscMalloc1(imax,&table_data);
571: for (i=0; i<imax; i++) {
572: PetscTableCreate(n[i]+1,M+1,&table_data[i]);
573: }
574: PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
575: for (i=0; i<imax; i++) {
576: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
577: }
578: #else
579: PetscInt Mimax = 0;
580: PetscIntMultError(M,imax, &Mimax);
581: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
582: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
583: for (i=0; i<imax; i++) {
584: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
585: data[i] = d_p + M*i;
586: }
587: #endif
588: }
590: /* Parse the IS and update local tables and the outgoing buf with the data */
591: {
592: PetscInt n_i,isz_i,*outdat_j,ctr_j;
593: PetscBT table_i;
595: for (i=0; i<imax; i++) {
596: PetscArrayzero(ctr,size);
597: n_i = n[i];
598: table_i = table[i];
599: idx_i = idx[i];
600: #if defined(PETSC_USE_CTABLE)
601: table_data_i = table_data[i];
602: #else
603: data_i = data[i];
604: #endif
605: isz_i = isz[i];
606: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
607: row = idx_i[j];
608: PetscLayoutFindOwner(C->rmap,row,&proc);
609: if (proc != rank) { /* copy to the outgoing buffer */
610: ctr[proc]++;
611: *ptr[proc] = row;
612: ptr[proc]++;
613: } else if (!PetscBTLookupSet(table_i,row)) {
614: #if defined(PETSC_USE_CTABLE)
615: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
616: #else
617: data_i[isz_i] = row; /* Update the local table */
618: #endif
619: isz_i++;
620: }
621: }
622: /* Update the headers for the current IS */
623: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
624: if ((ctr_j = ctr[j])) {
625: outdat_j = outdat[j];
626: k = ++outdat_j[0];
627: outdat_j[2*k] = ctr_j;
628: outdat_j[2*k-1] = i;
629: }
630: }
631: isz[i] = isz_i;
632: }
633: }
635: /* Now post the sends */
636: PetscMalloc1(nrqs+1,&s_waits1);
637: for (i=0; i<nrqs; ++i) {
638: j = pa[i];
639: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
640: }
642: /* No longer need the original indices */
643: PetscMalloc1(imax,&iscomms);
644: for (i=0; i<imax; ++i) {
645: ISRestoreIndices(is[i],idx+i);
646: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
647: }
648: PetscFree2(*(PetscInt***)&idx,n);
650: for (i=0; i<imax; ++i) {
651: ISDestroy(&is[i]);
652: }
654: /* Do Local work */
655: #if defined(PETSC_USE_CTABLE)
656: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
657: #else
658: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
659: #endif
661: /* Receive messages */
662: PetscMalloc1(nrqr+1,&recv_status);
663: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
665: PetscMalloc1(nrqs+1,&s_status);
666: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
668: /* Phase 1 sends are complete - deallocate buffers */
669: PetscFree4(outdat,ptr,tmp,ctr);
670: PetscFree4(w1,w2,w3,w4);
672: PetscMalloc1(nrqr+1,&xdata);
673: PetscMalloc1(nrqr+1,&isz1);
674: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
675: PetscFree(rbuf[0]);
676: PetscFree(rbuf);
679: /* Send the data back */
680: /* Do a global reduction to know the buffer space req for incoming messages */
681: {
682: PetscMPIInt *rw1;
684: PetscCalloc1(size,&rw1);
686: for (i=0; i<nrqr; ++i) {
687: proc = recv_status[i].MPI_SOURCE;
689: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
690: rw1[proc] = isz1[i];
691: }
692: PetscFree(onodes1);
693: PetscFree(olengths1);
695:     /* Determine the number of messages to expect, their lengths, and their from-ids */
696: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
697: PetscFree(rw1);
698: }
699: /* Now post the Irecvs corresponding to these messages */
700: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
702: /* Now post the sends */
703: PetscMalloc1(nrqr+1,&s_waits2);
704: for (i=0; i<nrqr; ++i) {
705: j = recv_status[i].MPI_SOURCE;
706: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
707: }
709: /* receive work done on other processors */
710: {
711: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax;
712: PetscMPIInt idex;
713: PetscBT table_i;
714: MPI_Status *status2;
716: PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
717: for (i=0; i<nrqs; ++i) {
718: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
719: /* Process the message */
720: rbuf2_i = rbuf2[idex];
721: ct1 = 2*rbuf2_i[0]+1;
722: jmax = rbuf2[idex][0];
723: for (j=1; j<=jmax; j++) {
724: max = rbuf2_i[2*j];
725: is_no = rbuf2_i[2*j-1];
726: isz_i = isz[is_no];
727: table_i = table[is_no];
728: #if defined(PETSC_USE_CTABLE)
729: table_data_i = table_data[is_no];
730: #else
731: data_i = data[is_no];
732: #endif
733: for (k=0; k<max; k++,ct1++) {
734: row = rbuf2_i[ct1];
735: if (!PetscBTLookupSet(table_i,row)) {
736: #if defined(PETSC_USE_CTABLE)
737: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
738: #else
739: data_i[isz_i] = row;
740: #endif
741: isz_i++;
742: }
743: }
744: isz[is_no] = isz_i;
745: }
746: }
748: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
749: PetscFree(status2);
750: }
752: #if defined(PETSC_USE_CTABLE)
753: tcount_max = 0;
754: for (i=0; i<imax; ++i) {
755: table_data_i = table_data[i];
756: PetscTableGetCount(table_data_i,&tcount);
757: if (tcount_max < tcount) tcount_max = tcount;
758: }
759: PetscMalloc1(tcount_max+1,&tdata);
760: #endif
762: for (i=0; i<imax; ++i) {
763: #if defined(PETSC_USE_CTABLE)
764: PetscTablePosition tpos;
765: table_data_i = table_data[i];
767: PetscTableGetHeadPosition(table_data_i,&tpos);
768: while (tpos) {
769: PetscTableGetNext(table_data_i,&tpos,&k,&j);
770: tdata[--j] = --k;
771: }
772: ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);
773: #else
774: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
775: #endif
776: PetscCommDestroy(&iscomms[i]);
777: }
779: PetscFree(iscomms);
780: PetscFree(onodes2);
781: PetscFree(olengths2);
783: PetscFree(pa);
784: PetscFree(rbuf2[0]);
785: PetscFree(rbuf2);
786: PetscFree(s_waits1);
787: PetscFree(r_waits1);
788: PetscFree(s_waits2);
789: PetscFree(r_waits2);
790: PetscFree(s_status);
791: PetscFree(recv_status);
792: PetscFree(xdata[0]);
793: PetscFree(xdata);
794: PetscFree(isz1);
795: #if defined(PETSC_USE_CTABLE)
796: for (i=0; i<imax; i++) {
797: PetscTableDestroy((PetscTable*)&table_data[i]);
798: }
799: PetscFree(table_data);
800: PetscFree(tdata);
801: PetscFree4(table,data,isz,t_p);
802: #else
803: PetscFree5(table,data,isz,d_p,t_p);
804: #endif
805: return(0);
806: }
808: /*
809:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once() to do
810:    the work on the local processor.
812: Inputs:
813: C - MAT_MPIAIJ;
814: imax - total no of index sets processed at a time;
815: table - an array of char - size = m bits.
817: Output:
818: isz - array containing the count of the solution elements corresponding
819: to each index set;
820: data or table_data - pointer to the solutions
821: */
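/*
   The core bookkeeping idiom used below, as a sketch: a PetscBT bit table records which
   global columns are already in the overlap set for an index set, so each candidate is
   appended at most once. PetscBTLookupSet() returns the previous value of the bit and
   then sets it:

     if (!PetscBTLookupSet(table_i,val)) {   // first time val is seen for this IS
       data_i[isz_i] = val;                  // append it to the solution list
       isz_i++;
     }
*/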
822: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
823: {
824: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
825: Mat A = c->A,B = c->B;
826: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
827: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
828: PetscInt *bi,*bj,*garray,i,j,k,row,isz_i;
829: PetscBT table_i;
830: #if defined(PETSC_USE_CTABLE)
831: PetscTable table_data_i;
832: PetscErrorCode ierr;
833: PetscTablePosition tpos;
834: PetscInt tcount,*tdata;
835: #else
836: PetscInt *data_i;
837: #endif
840: rstart = C->rmap->rstart;
841: cstart = C->cmap->rstart;
842: ai = a->i;
843: aj = a->j;
844: bi = b->i;
845: bj = b->j;
846: garray = c->garray;
848: for (i=0; i<imax; i++) {
849: #if defined(PETSC_USE_CTABLE)
850: /* copy existing entries of table_data_i into tdata[] */
851: table_data_i = table_data[i];
852: PetscTableGetCount(table_data_i,&tcount);
853: if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
855: PetscMalloc1(tcount,&tdata);
856: PetscTableGetHeadPosition(table_data_i,&tpos);
857: while (tpos) {
858: PetscTableGetNext(table_data_i,&tpos,&row,&j);
859: tdata[--j] = --row;
860: if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
861: }
862: #else
863: data_i = data[i];
864: #endif
865: table_i = table[i];
866: isz_i = isz[i];
867: max = isz[i];
869: for (j=0; j<max; j++) {
870: #if defined(PETSC_USE_CTABLE)
871: row = tdata[j] - rstart;
872: #else
873: row = data_i[j] - rstart;
874: #endif
875: start = ai[row];
876: end = ai[row+1];
877: for (k=start; k<end; k++) { /* Amat */
878: val = aj[k] + cstart;
879: if (!PetscBTLookupSet(table_i,val)) {
880: #if defined(PETSC_USE_CTABLE)
881: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
882: #else
883: data_i[isz_i] = val;
884: #endif
885: isz_i++;
886: }
887: }
888: start = bi[row];
889: end = bi[row+1];
890: for (k=start; k<end; k++) { /* Bmat */
891: val = garray[bj[k]];
892: if (!PetscBTLookupSet(table_i,val)) {
893: #if defined(PETSC_USE_CTABLE)
894: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
895: #else
896: data_i[isz_i] = val;
897: #endif
898: isz_i++;
899: }
900: }
901: }
902: isz[i] = isz_i;
904: #if defined(PETSC_USE_CTABLE)
905: PetscFree(tdata);
906: #endif
907: }
908: return(0);
909: }
911: /*
912: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
913: and return the output
915: Input:
916: C - the matrix
917: nrqr - no of messages being processed.
918: rbuf - an array of pointers to the received requests
920: Output:
921: xdata - array of messages to be sent back
922: isz1 - size of each message
924:   For better efficiency perhaps we should malloc each xdata[i] separately;
925:   then if a realloc is required we would only need to copy the data for that one row
926:   rather than all previous rows, as happens now where a single large chunk of
927:   memory is used.
929: */
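/*
   Memory strategy used below, sketched: xdata[0] is one large buffer carved into slices
   xdata[i]; when the estimate runs out, the whole buffer is grown geometrically and the
   slice pointers are rebased:

     new_estimate = (PetscInt)(1.5*mem_estimate)+1;
     PetscMalloc1(new_estimate,&tmp);
     PetscArraycpy(tmp,xdata[0],mem_estimate);
     PetscFree(xdata[0]);
     xdata[0] = tmp;
     mem_estimate = new_estimate;
     for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
*/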
930: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
931: {
932: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
933: Mat A = c->A,B = c->B;
934: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
936: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
937: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
938: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
939: PetscInt *rbuf_i,kmax,rbuf_0;
940: PetscBT xtable;
943: m = C->rmap->N;
944: rstart = C->rmap->rstart;
945: cstart = C->cmap->rstart;
946: ai = a->i;
947: aj = a->j;
948: bi = b->i;
949: bj = b->j;
950: garray = c->garray;
953: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
954: rbuf_i = rbuf[i];
955: rbuf_0 = rbuf_i[0];
956: ct += rbuf_0;
957: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
958: }
960: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
961: else max1 = 1;
962: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
963: PetscMalloc1(mem_estimate,&xdata[0]);
964: ++no_malloc;
965: PetscBTCreate(m,&xtable);
966: PetscArrayzero(isz1,nrqr);
968: ct3 = 0;
969:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
970: rbuf_i = rbuf[i];
971: rbuf_0 = rbuf_i[0];
972: ct1 = 2*rbuf_0+1;
973: ct2 = ct1;
974: ct3 += ct1;
975: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
976: PetscBTMemzero(m,xtable);
977: oct2 = ct2;
978: kmax = rbuf_i[2*j];
979: for (k=0; k<kmax; k++,ct1++) {
980: row = rbuf_i[ct1];
981: if (!PetscBTLookupSet(xtable,row)) {
982: if (!(ct3 < mem_estimate)) {
983: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
984: PetscMalloc1(new_estimate,&tmp);
985: PetscArraycpy(tmp,xdata[0],mem_estimate);
986: PetscFree(xdata[0]);
987: xdata[0] = tmp;
988: mem_estimate = new_estimate; ++no_malloc;
989: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
990: }
991: xdata[i][ct2++] = row;
992: ct3++;
993: }
994: }
995: for (k=oct2,max2=ct2; k<max2; k++) {
996: row = xdata[i][k] - rstart;
997: start = ai[row];
998: end = ai[row+1];
999: for (l=start; l<end; l++) {
1000: val = aj[l] + cstart;
1001: if (!PetscBTLookupSet(xtable,val)) {
1002: if (!(ct3 < mem_estimate)) {
1003: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1004: PetscMalloc1(new_estimate,&tmp);
1005: PetscArraycpy(tmp,xdata[0],mem_estimate);
1006: PetscFree(xdata[0]);
1007: xdata[0] = tmp;
1008: mem_estimate = new_estimate; ++no_malloc;
1009: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1010: }
1011: xdata[i][ct2++] = val;
1012: ct3++;
1013: }
1014: }
1015: start = bi[row];
1016: end = bi[row+1];
1017: for (l=start; l<end; l++) {
1018: val = garray[bj[l]];
1019: if (!PetscBTLookupSet(xtable,val)) {
1020: if (!(ct3 < mem_estimate)) {
1021: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1022: PetscMalloc1(new_estimate,&tmp);
1023: PetscArraycpy(tmp,xdata[0],mem_estimate);
1024: PetscFree(xdata[0]);
1025: xdata[0] = tmp;
1026: mem_estimate = new_estimate; ++no_malloc;
1027: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1028: }
1029: xdata[i][ct2++] = val;
1030: ct3++;
1031: }
1032: }
1033: }
1034:       /* Update the header */
1035:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
1036: xdata[i][2*j-1] = rbuf_i[2*j-1];
1037: }
1038: xdata[i][0] = rbuf_0;
1039: xdata[i+1] = xdata[i] + ct2;
1040: isz1[i] = ct2; /* size of each message */
1041: }
1042: PetscBTDestroy(&xtable);
1043: PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
1044: return(0);
1045: }
1046: /* -------------------------------------------------------------------------*/
1047: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1048: /*
1049: Every processor gets the entire matrix
1050: */
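/*
   This routine gives every process a sequential copy of the whole parallel matrix. At the
   user level the same effect can be requested with MatCreateSubMatrices() over full index
   sets (a sketch under that assumption; the library decides which internal path is taken):

     ISCreateStride(PETSC_COMM_SELF,M,0,1,&isrow);   // M,N obtained from MatGetSize(A,&M,&N)
     ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);
     MatCreateSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
*/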
1051: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1052: {
1053: Mat B;
1054: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1055: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1057: PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL;
1058: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1059: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1062: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1063: MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1064: if (scall == MAT_INITIAL_MATRIX) {
1065: /* ----------------------------------------------------------------
1066: Tell every processor the number of nonzeros per row
1067: */
1068: PetscMalloc1(A->rmap->N,&lens);
1069: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1070: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1071: }
1072: PetscMalloc2(size,&recvcounts,size,&displs);
1073: for (i=0; i<size; i++) {
1074: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1075: displs[i] = A->rmap->range[i];
1076: }
1077: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1078: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1079: #else
1080: sendcount = A->rmap->rend - A->rmap->rstart;
1081: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1082: #endif
1083: /* ---------------------------------------------------------------
1084: Create the sequential matrix of the same type as the local block diagonal
1085: */
1086: MatCreate(PETSC_COMM_SELF,&B);
1087: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1088: MatSetBlockSizesFromMats(B,A,A);
1089: MatSetType(B,((PetscObject)a->A)->type_name);
1090: MatSeqAIJSetPreallocation(B,0,lens);
1091: PetscCalloc1(2,Bin);
1092: **Bin = B;
1093: b = (Mat_SeqAIJ*)B->data;
1095: /*--------------------------------------------------------------------
1096: Copy my part of matrix column indices over
1097: */
1098: sendcount = ad->nz + bd->nz;
1099: jsendbuf = b->j + b->i[rstarts[rank]];
1100: a_jsendbuf = ad->j;
1101: b_jsendbuf = bd->j;
1102: n = A->rmap->rend - A->rmap->rstart;
1103: cnt = 0;
1104: for (i=0; i<n; i++) {
1105: /* put in lower diagonal portion */
1106: m = bd->i[i+1] - bd->i[i];
1107: while (m > 0) {
1108: /* is it above diagonal (in bd (compressed) numbering) */
1109: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1110: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1111: m--;
1112: }
1114: /* put in diagonal portion */
1115: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1116: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1117: }
1119: /* put in upper diagonal portion */
1120: while (m-- > 0) {
1121: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1122: }
1123: }
1124: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1126: /*--------------------------------------------------------------------
1127: Gather all column indices to all processors
1128: */
1129: for (i=0; i<size; i++) {
1130: recvcounts[i] = 0;
1131: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1132: recvcounts[i] += lens[j];
1133: }
1134: }
1135: displs[0] = 0;
1136: for (i=1; i<size; i++) {
1137: displs[i] = displs[i-1] + recvcounts[i-1];
1138: }
1139: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1140: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1141: #else
1142: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1143: #endif
1144: /*--------------------------------------------------------------------
1145:        Assemble the matrix into usable form (note numerical values not yet set)
1146: */
1147: /* set the b->ilen (length of each row) values */
1148: PetscArraycpy(b->ilen,lens,A->rmap->N);
1149: /* set the b->i indices */
1150: b->i[0] = 0;
1151: for (i=1; i<=A->rmap->N; i++) {
1152: b->i[i] = b->i[i-1] + lens[i-1];
1153: }
1154: PetscFree(lens);
1155: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1156: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
1158: } else {
1159: B = **Bin;
1160: b = (Mat_SeqAIJ*)B->data;
1161: }
1163: /*--------------------------------------------------------------------
1164: Copy my part of matrix numerical values into the values location
1165: */
1166: if (flag == MAT_GET_VALUES) {
1167: const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1168: MatScalar *sendbuf,*recvbuf;
1170: MatSeqAIJGetArrayRead(a->A,&ada);
1171: MatSeqAIJGetArrayRead(a->B,&bda);
1172: sendcount = ad->nz + bd->nz;
1173: sendbuf = b->a + b->i[rstarts[rank]];
1174: a_sendbuf = ada;
1175: b_sendbuf = bda;
1176: b_sendj = bd->j;
1177: n = A->rmap->rend - A->rmap->rstart;
1178: cnt = 0;
1179: for (i=0; i<n; i++) {
1180: /* put in lower diagonal portion */
1181: m = bd->i[i+1] - bd->i[i];
1182: while (m > 0) {
1183: /* is it above diagonal (in bd (compressed) numbering) */
1184: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1185: sendbuf[cnt++] = *b_sendbuf++;
1186: m--;
1187: b_sendj++;
1188: }
1190: /* put in diagonal portion */
1191: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1192: sendbuf[cnt++] = *a_sendbuf++;
1193: }
1195: /* put in upper diagonal portion */
1196: while (m-- > 0) {
1197: sendbuf[cnt++] = *b_sendbuf++;
1198: b_sendj++;
1199: }
1200: }
1201: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1203: /* -----------------------------------------------------------------
1204: Gather all numerical values to all processors
1205: */
1206: if (!recvcounts) {
1207: PetscMalloc2(size,&recvcounts,size,&displs);
1208: }
1209: for (i=0; i<size; i++) {
1210: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1211: }
1212: displs[0] = 0;
1213: for (i=1; i<size; i++) {
1214: displs[i] = displs[i-1] + recvcounts[i-1];
1215: }
1216: recvbuf = b->a;
1217: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1218: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1219: #else
1220: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1221: #endif
1222: MatSeqAIJRestoreArrayRead(a->A,&ada);
1223: MatSeqAIJRestoreArrayRead(a->B,&bda);
1224:   } /* end of (flag == MAT_GET_VALUES) */
1225: PetscFree2(recvcounts,displs);
1227: if (A->symmetric) {
1228: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1229: } else if (A->hermitian) {
1230: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1231: } else if (A->structurally_symmetric) {
1232: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1233: }
1234: return(0);
1235: }
1237: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1238: {
1239: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
1240: Mat submat,A = c->A,B = c->B;
1241: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1242: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1243: PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1244: const PetscInt *icol,*irow;
1245: PetscInt nrow,ncol,start;
1247: PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1248: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1249: PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1250: PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1251: PetscInt *lens,rmax,ncols,*cols,Crow;
1252: #if defined(PETSC_USE_CTABLE)
1253: PetscTable cmap,rmap;
1254: PetscInt *cmap_loc,*rmap_loc;
1255: #else
1256: PetscInt *cmap,*rmap;
1257: #endif
1258: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1259: PetscInt *cworkB,lwrite,*subcols,*row2proc;
1260: PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1261: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1262: MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
1263: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1264: MPI_Status *r_status3 = NULL,*r_status4,*s_status4;
1265: MPI_Comm comm;
1266: PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1267: PetscMPIInt *onodes1,*olengths1,idex,end;
1268: Mat_SubSppt *smatis1;
1269: PetscBool isrowsorted,iscolsorted;
1274: if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1275: MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
1276: MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);
1277: PetscObjectGetComm((PetscObject)C,&comm);
1278: size = c->size;
1279: rank = c->rank;
1281: ISSorted(iscol[0],&iscolsorted);
1282: ISSorted(isrow[0],&isrowsorted);
1283: ISGetIndices(isrow[0],&irow);
1284: ISGetLocalSize(isrow[0],&nrow);
1285: if (allcolumns) {
1286: icol = NULL;
1287: ncol = C->cmap->N;
1288: } else {
1289: ISGetIndices(iscol[0],&icol);
1290: ISGetLocalSize(iscol[0],&ncol);
1291: }
1293: if (scall == MAT_INITIAL_MATRIX) {
1294: PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1296: /* Get some new tags to keep the communication clean */
1297: tag1 = ((PetscObject)C)->tag;
1298: PetscObjectGetNewTag((PetscObject)C,&tag2);
1299: PetscObjectGetNewTag((PetscObject)C,&tag3);
1301:     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1302: required. Based on this, buffers are allocated, and data copied into them */
1303: PetscCalloc2(size,&w1,size,&w2);
1304: PetscMalloc1(nrow,&row2proc);
1306: /* w1[proc] = num of rows owned by proc -- to be requested */
1307: proc = 0;
1308: nrqs = 0; /* num of outgoing messages */
1309: for (j=0; j<nrow; j++) {
1310: row = irow[j];
1311: if (!isrowsorted) proc = 0;
1312: while (row >= C->rmap->range[proc+1]) proc++;
1313: w1[proc]++;
1314: row2proc[j] = proc; /* map row index to proc */
1316: if (proc != rank && !w2[proc]) {
1317: w2[proc] = 1; nrqs++;
1318: }
1319: }
1320: w1[rank] = 0; /* rows owned by self will not be requested */
1322: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
1323: for (proc=0,j=0; proc<size; proc++) {
1324: if (w1[proc]) { pa[j++] = proc;}
1325: }
1327: /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1328: msz = 0; /* total mesg length (for all procs) */
1329: for (i=0; i<nrqs; i++) {
1330: proc = pa[i];
1331: w1[proc] += 3;
1332: msz += w1[proc];
1333: }
1334: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
1336:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1337: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1338: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
1340: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1341: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1342: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
1344: /* Now post the Irecvs corresponding to these messages */
1345: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
1347: PetscFree(onodes1);
1348: PetscFree(olengths1);
1350: /* Allocate Memory for outgoing messages */
1351: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1352: PetscArrayzero(sbuf1,size);
1353: PetscArrayzero(ptr,size);
1355:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1356: iptr = tmp;
1357: for (i=0; i<nrqs; i++) {
1358: proc = pa[i];
1359: sbuf1[proc] = iptr;
1360: iptr += w1[proc];
1361: }
1363: /* Form the outgoing messages */
1364: /* Initialize the header space */
1365: for (i=0; i<nrqs; i++) {
1366: proc = pa[i];
1367: PetscArrayzero(sbuf1[proc],3);
1368: ptr[proc] = sbuf1[proc] + 3;
1369: }
1371: /* Parse the isrow and copy data into outbuf */
1372: PetscArrayzero(ctr,size);
1373: for (j=0; j<nrow; j++) { /* parse the indices of each IS */
1374: proc = row2proc[j];
1375: if (proc != rank) { /* copy to the outgoing buf*/
1376: *ptr[proc] = irow[j];
1377: ctr[proc]++; ptr[proc]++;
1378: }
1379: }
1381: /* Update the headers for the current IS */
1382: for (j=0; j<size; j++) { /* Can Optimise this loop too */
1383: if ((ctr_j = ctr[j])) {
1384: sbuf1_j = sbuf1[j];
1385: k = ++sbuf1_j[0];
1386: sbuf1_j[2*k] = ctr_j;
1387: sbuf1_j[2*k-1] = 0;
1388: }
1389: }
1391: /* Now post the sends */
1392: PetscMalloc1(nrqs+1,&s_waits1);
1393: for (i=0; i<nrqs; ++i) {
1394: proc = pa[i];
1395: MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1396: }
1398: /* Post Receives to capture the buffer size */
1399: PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);
1400: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
1402: rbuf2[0] = tmp + msz;
1403: for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1405: for (i=0; i<nrqs; ++i) {
1406: proc = pa[i];
1407: MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1408: }
1410: PetscFree2(w1,w2);
1412: /* Send to other procs the buf size they should allocate */
1413: /* Receive messages*/
1414: PetscMalloc1(nrqr+1,&r_status1);
1415: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
1417: MPI_Waitall(nrqr,r_waits1,r_status1);
1418: for (i=0; i<nrqr; ++i) {
1419: req_size[i] = 0;
1420: rbuf1_i = rbuf1[i];
1421: start = 2*rbuf1_i[0] + 1;
1422: MPI_Get_count(r_status1+i,MPIU_INT,&end);
1423: PetscMalloc1(end+1,&sbuf2[i]);
1424: sbuf2_i = sbuf2[i];
1425: for (j=start; j<end; j++) {
1426: k = rbuf1_i[j] - rstart;
1427: ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1428: sbuf2_i[j] = ncols;
1429: req_size[i] += ncols;
1430: }
1431: req_source1[i] = r_status1[i].MPI_SOURCE;
1433: /* form the header */
1434: sbuf2_i[0] = req_size[i];
1435: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1437: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1438: }
1440: PetscFree(r_status1);
1441: PetscFree(r_waits1);
1443:     /* rbuf2 is received; post receives for the column indices a->j */
1444: MPI_Waitall(nrqs,r_waits2,r_status2);
1446: PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);
1447: for (i=0; i<nrqs; ++i) {
1448: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
1449: req_source2[i] = r_status2[i].MPI_SOURCE;
1450: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1451: }
1453: /* Wait on sends1 and sends2 */
1454: PetscMalloc1(nrqs+1,&s_status1);
1455: MPI_Waitall(nrqs,s_waits1,s_status1);
1456: PetscFree(s_waits1);
1457: PetscFree(s_status1);
1459: MPI_Waitall(nrqr,s_waits2,s_status2);
1460: PetscFree4(r_status2,s_waits2,r_waits2,s_status2);
1462: /* Now allocate sending buffers for a->j, and send them off */
1463: PetscMalloc1(nrqr+1,&sbuf_aj);
1464: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1465: PetscMalloc1(j+1,&sbuf_aj[0]);
1466: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1468: for (i=0; i<nrqr; i++) { /* for each requested message */
1469: rbuf1_i = rbuf1[i];
1470: sbuf_aj_i = sbuf_aj[i];
1471: ct1 = 2*rbuf1_i[0] + 1;
1472: ct2 = 0;
1473: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1475: kmax = rbuf1[i][2];
1476: for (k=0; k<kmax; k++,ct1++) { /* for each row */
1477: row = rbuf1_i[ct1] - rstart;
1478: nzA = ai[row+1] - ai[row];
1479: nzB = bi[row+1] - bi[row];
1480: ncols = nzA + nzB;
1481: cworkA = aj + ai[row]; cworkB = bj + bi[row];
1483: /* load the column indices for this row into cols*/
1484: cols = sbuf_aj_i + ct2;
1486: lwrite = 0;
1487: for (l=0; l<nzB; l++) {
1488: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1489: }
1490: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1491: for (l=0; l<nzB; l++) {
1492: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1493: }
1495: ct2 += ncols;
1496: }
1497: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1498: }
1500: /* create column map (cmap): global col of C -> local col of submat */
1501: #if defined(PETSC_USE_CTABLE)
1502: if (!allcolumns) {
1503: PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);
1504: PetscCalloc1(C->cmap->n,&cmap_loc);
1505: for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1506: if (icol[j] >= cstart && icol[j] <cend) {
1507: cmap_loc[icol[j] - cstart] = j+1;
1508: } else { /* use PetscTable for non-local col indices */
1509: PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1510: }
1511: }
1512: } else {
1513: cmap = NULL;
1514: cmap_loc = NULL;
1515: }
1516: PetscCalloc1(C->rmap->n,&rmap_loc);
1517: #else
1518: if (!allcolumns) {
1519: PetscCalloc1(C->cmap->N,&cmap);
1520: for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1521: } else {
1522: cmap = NULL;
1523: }
1524: #endif
1526: /* Create lens for MatSeqAIJSetPreallocation() */
1527: PetscCalloc1(nrow,&lens);
1529: /* Compute lens from local part of C */
1530: for (j=0; j<nrow; j++) {
1531: row = irow[j];
1532: proc = row2proc[j];
1533: if (proc == rank) {
1534: /* diagonal part A = c->A */
1535: ncols = ai[row-rstart+1] - ai[row-rstart];
1536: cols = aj + ai[row-rstart];
1537: if (!allcolumns) {
1538: for (k=0; k<ncols; k++) {
1539: #if defined(PETSC_USE_CTABLE)
1540: tcol = cmap_loc[cols[k]];
1541: #else
1542: tcol = cmap[cols[k]+cstart];
1543: #endif
1544: if (tcol) lens[j]++;
1545: }
1546: } else { /* allcolumns */
1547: lens[j] = ncols;
1548: }
1550: /* off-diagonal part B = c->B */
1551: ncols = bi[row-rstart+1] - bi[row-rstart];
1552: cols = bj + bi[row-rstart];
1553: if (!allcolumns) {
1554: for (k=0; k<ncols; k++) {
1555: #if defined(PETSC_USE_CTABLE)
1556: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1557: #else
1558: tcol = cmap[bmap[cols[k]]];
1559: #endif
1560: if (tcol) lens[j]++;
1561: }
1562: } else { /* allcolumns */
1563: lens[j] += ncols;
1564: }
1565: }
1566: }
1568: /* Create row map (rmap): global row of C -> local row of submat */
1569: #if defined(PETSC_USE_CTABLE)
1570: PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);
1571: for (j=0; j<nrow; j++) {
1572: row = irow[j];
1573: proc = row2proc[j];
1574: if (proc == rank) { /* a local row */
1575: rmap_loc[row - rstart] = j;
1576: } else {
1577: PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1578: }
1579: }
1580: #else
1581: PetscCalloc1(C->rmap->N,&rmap);
1582: for (j=0; j<nrow; j++) {
1583: rmap[irow[j]] = j;
1584: }
1585: #endif
1587: /* Update lens from offproc data */
1588: /* recv a->j is done */
1589: MPI_Waitall(nrqs,r_waits3,r_status3);
1590: for (i=0; i<nrqs; i++) {
1591: proc = pa[i];
1592: sbuf1_i = sbuf1[proc];
1593: /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1594: ct1 = 2 + 1;
1595: ct2 = 0;
1596: rbuf2_i = rbuf2[i]; /* received length of C->j */
1597: rbuf3_i = rbuf3[i]; /* received C->j */
1599: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1600: max1 = sbuf1_i[2];
1601: for (k=0; k<max1; k++,ct1++) {
1602: #if defined(PETSC_USE_CTABLE)
1603: PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1604: row--;
1605: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1606: #else
1607: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1608: #endif
1609: /* Now, store row index of submat in sbuf1_i[ct1] */
1610: sbuf1_i[ct1] = row;
1612: nnz = rbuf2_i[ct1];
1613: if (!allcolumns) {
1614: for (l=0; l<nnz; l++,ct2++) {
1615: #if defined(PETSC_USE_CTABLE)
1616: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1617: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1618: } else {
1619: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1620: }
1621: #else
1622: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1623: #endif
1624: if (tcol) lens[row]++;
1625: }
1626: } else { /* allcolumns */
1627: lens[row] += nnz;
1628: }
1629: }
1630: }
1631: MPI_Waitall(nrqr,s_waits3,s_status3);
1632: PetscFree4(r_waits3,s_waits3,r_status3,s_status3);
1634: /* Create the submatrices */
1635: MatCreate(PETSC_COMM_SELF,&submat);
1636: MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);
1638: ISGetBlockSize(isrow[0],&i);
1639: ISGetBlockSize(iscol[0],&j);
1640: MatSetBlockSizes(submat,i,j);
1641: MatSetType(submat,((PetscObject)A)->type_name);
1642: MatSeqAIJSetPreallocation(submat,0,lens);
1644:     /* create struct Mat_SubSppt and attach it to submat */
1645: PetscNew(&smatis1);
1646: subc = (Mat_SeqAIJ*)submat->data;
1647: subc->submatis1 = smatis1;
1649: smatis1->id = 0;
1650: smatis1->nrqs = nrqs;
1651: smatis1->nrqr = nrqr;
1652: smatis1->rbuf1 = rbuf1;
1653: smatis1->rbuf2 = rbuf2;
1654: smatis1->rbuf3 = rbuf3;
1655: smatis1->sbuf2 = sbuf2;
1656: smatis1->req_source2 = req_source2;
1658: smatis1->sbuf1 = sbuf1;
1659: smatis1->ptr = ptr;
1660: smatis1->tmp = tmp;
1661: smatis1->ctr = ctr;
1663: smatis1->pa = pa;
1664: smatis1->req_size = req_size;
1665: smatis1->req_source1 = req_source1;
1667: smatis1->allcolumns = allcolumns;
1668: smatis1->singleis = PETSC_TRUE;
1669: smatis1->row2proc = row2proc;
1670: smatis1->rmap = rmap;
1671: smatis1->cmap = cmap;
1672: #if defined(PETSC_USE_CTABLE)
1673: smatis1->rmap_loc = rmap_loc;
1674: smatis1->cmap_loc = cmap_loc;
1675: #endif
1677: smatis1->destroy = submat->ops->destroy;
1678: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1679: submat->factortype = C->factortype;
1681: /* compute rmax */
1682: rmax = 0;
1683: for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1685: } else { /* scall == MAT_REUSE_MATRIX */
1686: submat = submats[0];
1687:     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix, wrong size");
1689: subc = (Mat_SeqAIJ*)submat->data;
1690: rmax = subc->rmax;
1691: smatis1 = subc->submatis1;
1692: nrqs = smatis1->nrqs;
1693: nrqr = smatis1->nrqr;
1694: rbuf1 = smatis1->rbuf1;
1695: rbuf2 = smatis1->rbuf2;
1696: rbuf3 = smatis1->rbuf3;
1697: req_source2 = smatis1->req_source2;
1699: sbuf1 = smatis1->sbuf1;
1700: sbuf2 = smatis1->sbuf2;
1701: ptr = smatis1->ptr;
1702: tmp = smatis1->tmp;
1703: ctr = smatis1->ctr;
1705: pa = smatis1->pa;
1706: req_size = smatis1->req_size;
1707: req_source1 = smatis1->req_source1;
1709: allcolumns = smatis1->allcolumns;
1710: row2proc = smatis1->row2proc;
1711: rmap = smatis1->rmap;
1712: cmap = smatis1->cmap;
1713: #if defined(PETSC_USE_CTABLE)
1714: rmap_loc = smatis1->rmap_loc;
1715: cmap_loc = smatis1->cmap_loc;
1716: #endif
1717: }
1719: /* Post recv matrix values */
1720: PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);
1721: PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);
1722: PetscObjectGetNewTag((PetscObject)C,&tag4);
1723: for (i=0; i<nrqs; ++i) {
1724: PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
1725: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1726: }
1728: /* Allocate sending buffers for a->a, and send them off */
1729: PetscMalloc1(nrqr+1,&sbuf_aa);
1730: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1731: PetscMalloc1(j+1,&sbuf_aa[0]);
1732: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1734: for (i=0; i<nrqr; i++) {
1735: rbuf1_i = rbuf1[i];
1736: sbuf_aa_i = sbuf_aa[i];
1737: ct1 = 2*rbuf1_i[0]+1;
1738: ct2 = 0;
1739: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1741: kmax = rbuf1_i[2];
1742: for (k=0; k<kmax; k++,ct1++) {
1743: row = rbuf1_i[ct1] - rstart;
1744: nzA = ai[row+1] - ai[row];
1745: nzB = bi[row+1] - bi[row];
1746: ncols = nzA + nzB;
1747: cworkB = bj + bi[row];
1748: vworkA = a_a + ai[row];
1749: vworkB = b_a + bi[row];
1751: /* load the column values for this row into vals*/
1752: vals = sbuf_aa_i + ct2;
1754: lwrite = 0;
1755: for (l=0; l<nzB; l++) {
1756: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1757: }
1758: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1759: for (l=0; l<nzB; l++) {
1760: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1761: }
1763: ct2 += ncols;
1764: }
1765: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1766: }
1768: /* Assemble submat */
1769: /* First assemble the local rows */
1770: for (j=0; j<nrow; j++) {
1771: row = irow[j];
1772: proc = row2proc[j];
1773: if (proc == rank) {
1774: Crow = row - rstart; /* local row index of C */
1775: #if defined(PETSC_USE_CTABLE)
1776: row = rmap_loc[Crow]; /* row index of submat */
1777: #else
1778: row = rmap[row];
1779: #endif
1781: if (allcolumns) {
1782: /* diagonal part A = c->A */
1783: ncols = ai[Crow+1] - ai[Crow];
1784: cols = aj + ai[Crow];
1785: vals = a_a + ai[Crow];
1786: i = 0;
1787: for (k=0; k<ncols; k++) {
1788: subcols[i] = cols[k] + cstart;
1789: subvals[i++] = vals[k];
1790: }
1792: /* off-diagonal part B = c->B */
1793: ncols = bi[Crow+1] - bi[Crow];
1794: cols = bj + bi[Crow];
1795: vals = b_a + bi[Crow];
1796: for (k=0; k<ncols; k++) {
1797: subcols[i] = bmap[cols[k]];
1798: subvals[i++] = vals[k];
1799: }
1801: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1803: } else { /* !allcolumns */
1804: #if defined(PETSC_USE_CTABLE)
1805: /* diagonal part A = c->A */
1806: ncols = ai[Crow+1] - ai[Crow];
1807: cols = aj + ai[Crow];
1808: vals = a_a + ai[Crow];
1809: i = 0;
1810: for (k=0; k<ncols; k++) {
1811: tcol = cmap_loc[cols[k]];
1812: if (tcol) {
1813: subcols[i] = --tcol;
1814: subvals[i++] = vals[k];
1815: }
1816: }
1818: /* off-diagonal part B = c->B */
1819: ncols = bi[Crow+1] - bi[Crow];
1820: cols = bj + bi[Crow];
1821: vals = b_a + bi[Crow];
1822: for (k=0; k<ncols; k++) {
1823: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1824: if (tcol) {
1825: subcols[i] = --tcol;
1826: subvals[i++] = vals[k];
1827: }
1828: }
1829: #else
1830: /* diagonal part A = c->A */
1831: ncols = ai[Crow+1] - ai[Crow];
1832: cols = aj + ai[Crow];
1833: vals = a_a + ai[Crow];
1834: i = 0;
1835: for (k=0; k<ncols; k++) {
1836: tcol = cmap[cols[k]+cstart];
1837: if (tcol) {
1838: subcols[i] = --tcol;
1839: subvals[i++] = vals[k];
1840: }
1841: }
1843: /* off-diagonal part B = c->B */
1844: ncols = bi[Crow+1] - bi[Crow];
1845: cols = bj + bi[Crow];
1846: vals = b_a + bi[Crow];
1847: for (k=0; k<ncols; k++) {
1848: tcol = cmap[bmap[cols[k]]];
1849: if (tcol) {
1850: subcols[i] = --tcol;
1851: subvals[i++] = vals[k];
1852: }
1853: }
1854: #endif
1855: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1856: }
1857: }
1858: }
1860: /* Now assemble the off-proc rows */
1861: for (i=0; i<nrqs; i++) { /* for each requested message */
1862: /* recv values from other processes */
1863: MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1864: proc = pa[idex];
1865: sbuf1_i = sbuf1[proc];
1866: /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1867: ct1 = 2 + 1;
1868: ct2 = 0; /* count of received C->j */
1869: ct3 = 0; /* count of received C->j that will be inserted into submat */
1870: rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1871: rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1872: rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1874: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1875: max1 = sbuf1_i[2]; /* num of rows */
1876: for (k=0; k<max1; k++,ct1++) { /* for each received row */
1877: row = sbuf1_i[ct1]; /* row index of submat */
1878: if (!allcolumns) {
1879: idex = 0;
1880: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1881: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1882: for (l=0; l<nnz; l++,ct2++) { /* for each received column */
1883: #if defined(PETSC_USE_CTABLE)
1884: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1885: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1886: } else {
1887: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1888: }
1889: #else
1890: tcol = cmap[rbuf3_i[ct2]];
1891: #endif
1892: if (tcol) {
1893: subcols[idex] = --tcol; /* may not be sorted */
1894: subvals[idex++] = rbuf4_i[ct2];
1896: /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1897: For reuse, we replace the received C->j with the index that should be inserted into submat */
1898: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1899: }
1900: }
1901: MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1902: } else { /* scall == MAT_REUSE_MATRIX */
1903: submat = submats[0];
1904: subc = (Mat_SeqAIJ*)submat->data;
1906: nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1907: for (l=0; l<nnz; l++) {
1908: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1909: subvals[idex++] = rbuf4_i[ct2];
1910: }
1912: bj = subc->j + subc->i[row]; /* sorted column indices */
1913: MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1914: }
1915: } else { /* allcolumns */
1916: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1917: MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1918: ct2 += nnz;
1919: }
1920: }
1921: }
1923: /* sends of a->a are done */
1924: MPI_Waitall(nrqr,s_waits4,s_status4);
1925: PetscFree4(r_waits4,s_waits4,r_status4,s_status4);
1927: MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1928: MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1929: submats[0] = submat;
1931: /* Restore the indices */
1932: ISRestoreIndices(isrow[0],&irow);
1933: if (!allcolumns) {
1934: ISRestoreIndices(iscol[0],&icol);
1935: }
1937: /* Destroy allocated memory */
1938: for (i=0; i<nrqs; ++i) {
1939: PetscFree(rbuf4[i]); /* rbuf4[i] was obtained with PetscMalloc1(); subcols and subvals are freed once below via PetscFree3() */
1940: }
1941: PetscFree3(rbuf4,subcols,subvals);
1942: PetscFree(sbuf_aa[0]);
1943: PetscFree(sbuf_aa);
1945: if (scall == MAT_INITIAL_MATRIX) {
1946: PetscFree(lens);
1947: PetscFree(sbuf_aj[0]);
1948: PetscFree(sbuf_aj);
1949: }
1950: MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
1951: MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);
1952: return(0);
1953: }
1955: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1956: {
1958: PetscInt ncol;
1959: PetscBool colflag,allcolumns=PETSC_FALSE;
1962: /* Allocate memory to hold all the submatrices */
1963: if (scall == MAT_INITIAL_MATRIX) {
1964: PetscCalloc1(2,submat);
1965: }
1967: /* Check for special case: each processor requests the entire set of matrix columns */
1968: ISIdentity(iscol[0],&colflag);
1969: ISGetLocalSize(iscol[0],&ncol);
1970: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1972: MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1973: return(0);
1974: }
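/*
   Illustrative usage sketch (not part of mpiov.c; variable names such as rowis, colis, nloc,
   first and submats are hypothetical). It shows a call through the generic interface with one
   IS pair per process; the routine above handles this case when C->submat_singleis has been set
   (e.g. by PCSetUp_ASM, see MatCreateSubMatrices_MPIAIJ below). The column IS chosen here is
   the identity over all of C's columns, which triggers the allcolumns special case:

     IS  rowis,colis;
     Mat *submats;

     ISCreateStride(PETSC_COMM_SELF,nloc,first,1,&rowis);      // the locally wanted rows
     ISCreateStride(PETSC_COMM_SELF,C->cmap->N,0,1,&colis);    // identity over all columns => allcolumns
     MatCreateSubMatrices(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submats);
     // ... use submats[0] (a SeqAIJ matrix) ...
     MatDestroySubMatrices(1,&submats);
*/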
1976: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1977: {
1979: PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1980: PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE;
1981: Mat_SeqAIJ *subc;
1982: Mat_SubSppt *smat;
1985: /* Check for special case: each processor has a single IS */
1986: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1987: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1988: C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with a non-single IS */
1989: return(0);
1990: }
1992: /* Collect global wantallmatrix and nstages */
1993: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1994: else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1995: if (!nmax) nmax = 1;
1997: if (scall == MAT_INITIAL_MATRIX) {
1998: /* Collect global wantallmatrix and nstages */
1999: if (ismax == 1 && C->rmap->N == C->cmap->N) {
2000: ISIdentity(*isrow,&rowflag);
2001: ISIdentity(*iscol,&colflag);
2002: ISGetLocalSize(*isrow,&nrow);
2003: ISGetLocalSize(*iscol,&ncol);
2004: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2005: wantallmatrix = PETSC_TRUE;
2007: PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
2008: }
2009: }
2011: /* Determine the number of stages in which the submatrices are extracted.
2012: Each stage will extract nmax submatrices.
2013: nmax is determined by the matrix column dimension.
2014: If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2015: */
2016: nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
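      /* Worked example (hypothetical numbers, for illustration only): with
         C->cmap->N = 1,000,000 columns and sizeof(PetscInt) = 8,
           nmax    = 20*1000000/(1000000*8)    = 2   submatrices per stage,
         so a request for ismax = 5 submatrices gives
           nstages = 5/2 + ((5 % 2) ? 1 : 0)   = 3   local stages. */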
2018: in[0] = -1*(PetscInt)wantallmatrix;
2019: in[1] = nstages;
2020: MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2021: wantallmatrix = (PetscBool)(-out[0]);
2022: nstages = out[1]; /* Make sure every processor loops through the global nstages */
2024: } else { /* MAT_REUSE_MATRIX */
2025: if (ismax) {
2026: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2027: smat = subc->submatis1;
2028: } else { /* (*submat)[0] is a dummy matrix */
2029: smat = (Mat_SubSppt*)(*submat)[0]->data;
2030: }
2031: if (!smat) {
2032: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2033: wantallmatrix = PETSC_TRUE;
2034: } else if (smat->singleis) {
2035: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
2036: return(0);
2037: } else {
2038: nstages = smat->nstages;
2039: }
2040: }
2042: if (wantallmatrix) {
2043: MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2044: return(0);
2045: }
2047: /* Allocate memory to hold all the submatrices and dummy submatrices */
2048: if (scall == MAT_INITIAL_MATRIX) {
2049: PetscCalloc1(ismax+nstages,submat);
2050: }
2052: for (i=0,pos=0; i<nstages; i++) {
2053: if (pos+nmax <= ismax) max_no = nmax;
2054: else if (pos >= ismax) max_no = 0;
2055: else max_no = ismax-pos;
2057: MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2058: if (!max_no) {
2059: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2060: smat = (Mat_SubSppt*)(*submat)[pos]->data;
2061: smat->nstages = nstages;
2062: }
2063: pos++; /* advance to next dummy matrix if any */
2064: } else pos += max_no;
2065: }
2067: if (ismax && scall == MAT_INITIAL_MATRIX) {
2068: /* save nstages for reuse */
2069: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2070: smat = subc->submatis1;
2071: smat->nstages = nstages;
2072: }
2073: return(0);
2074: }
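/*
   Illustrative reuse pattern (a sketch, not from mpiov.c; isrow, iscol and ismax are assumed to
   be set up by the caller). On the first call the communication plan and the submatrix structure
   are built and cached in the Mat_SubSppt attached to the first submatrix; on later calls with
   MAT_REUSE_MATRIX only the numerical values are re-communicated:

     Mat *submats;
     MatCreateSubMatrices(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&submats);
     // ... change the numerical values of C, keeping the same nonzero pattern ...
     MatCreateSubMatrices(C,ismax,isrow,iscol,MAT_REUSE_MATRIX,&submats);
*/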
2076: /* -------------------------------------------------------------------------*/
2077: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2078: {
2079: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
2080: Mat A = c->A;
2081: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2082: const PetscInt **icol,**irow;
2083: PetscInt *nrow,*ncol,start;
2085: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2086: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2087: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2088: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2089: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2090: #if defined(PETSC_USE_CTABLE)
2091: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2092: #else
2093: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2094: #endif
2095: const PetscInt *irow_i;
2096: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2097: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2098: MPI_Request *r_waits4,*s_waits3,*s_waits4;
2099: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2100: MPI_Status *r_status3,*r_status4,*s_status4;
2101: MPI_Comm comm;
2102: PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2103: PetscMPIInt *onodes1,*olengths1,end;
2104: PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2105: Mat_SubSppt *smat_i;
2106: PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2107: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2110: PetscObjectGetComm((PetscObject)C,&comm);
2111: size = c->size;
2112: rank = c->rank;
2114: PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2115: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
2117: for (i=0; i<ismax; i++) {
2118: ISSorted(iscol[i],&issorted[i]);
2119: if (!issorted[i]) iscsorted = issorted[i];
2121: ISSorted(isrow[i],&issorted[i]);
2123: ISGetIndices(isrow[i],&irow[i]);
2124: ISGetLocalSize(isrow[i],&nrow[i]);
2126: /* Check for special case: allcolumn */
2127: ISIdentity(iscol[i],&colflag);
2128: ISGetLocalSize(iscol[i],&ncol[i]);
2129: if (colflag && ncol[i] == C->cmap->N) {
2130: allcolumns[i] = PETSC_TRUE;
2131: icol[i] = NULL;
2132: } else {
2133: allcolumns[i] = PETSC_FALSE;
2134: ISGetIndices(iscol[i],&icol[i]);
2135: }
2136: }
2138: if (scall == MAT_REUSE_MATRIX) {
2139: /* Assumes new rows are same length as the old rows */
2140: for (i=0; i<ismax; i++) {
2141: if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2142: subc = (Mat_SeqAIJ*)submats[i]->data;
2143: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2145: /* Initialize the matrix as if it were empty */
2146: PetscArrayzero(subc->ilen,submats[i]->rmap->n);
2148: smat_i = subc->submatis1;
2150: nrqs = smat_i->nrqs;
2151: nrqr = smat_i->nrqr;
2152: rbuf1 = smat_i->rbuf1;
2153: rbuf2 = smat_i->rbuf2;
2154: rbuf3 = smat_i->rbuf3;
2155: req_source2 = smat_i->req_source2;
2157: sbuf1 = smat_i->sbuf1;
2158: sbuf2 = smat_i->sbuf2;
2159: ptr = smat_i->ptr;
2160: tmp = smat_i->tmp;
2161: ctr = smat_i->ctr;
2163: pa = smat_i->pa;
2164: req_size = smat_i->req_size;
2165: req_source1 = smat_i->req_source1;
2167: allcolumns[i] = smat_i->allcolumns;
2168: row2proc[i] = smat_i->row2proc;
2169: rmap[i] = smat_i->rmap;
2170: cmap[i] = smat_i->cmap;
2171: }
2173: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2174: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2175: smat_i = (Mat_SubSppt*)submats[0]->data;
2177: nrqs = smat_i->nrqs;
2178: nrqr = smat_i->nrqr;
2179: rbuf1 = smat_i->rbuf1;
2180: rbuf2 = smat_i->rbuf2;
2181: rbuf3 = smat_i->rbuf3;
2182: req_source2 = smat_i->req_source2;
2184: sbuf1 = smat_i->sbuf1;
2185: sbuf2 = smat_i->sbuf2;
2186: ptr = smat_i->ptr;
2187: tmp = smat_i->tmp;
2188: ctr = smat_i->ctr;
2190: pa = smat_i->pa;
2191: req_size = smat_i->req_size;
2192: req_source1 = smat_i->req_source1;
2194: allcolumns[0] = PETSC_FALSE;
2195: }
2196: } else { /* scall == MAT_INITIAL_MATRIX */
2197: /* Get some new tags to keep the communication clean */
2198: PetscObjectGetNewTag((PetscObject)C,&tag2);
2199: PetscObjectGetNewTag((PetscObject)C,&tag3);
2201: /* evaluate communication - messages to whom, message lengths, and buffer space
2202: required. Based on this, buffers are allocated, and data copied into them */
2203: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
2205: for (i=0; i<ismax; i++) {
2206: jmax = nrow[i];
2207: irow_i = irow[i];
2209: PetscMalloc1(jmax,&row2proc_i);
2210: row2proc[i] = row2proc_i;
2212: if (issorted[i]) proc = 0;
2213: for (j=0; j<jmax; j++) {
2214: if (!issorted[i]) proc = 0;
2215: row = irow_i[j];
2216: while (row >= C->rmap->range[proc+1]) proc++;
2217: w4[proc]++;
2218: row2proc_i[j] = proc; /* map row index to proc */
2219: }
2220: for (j=0; j<size; j++) {
2221: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
2222: }
2223: }
2225: nrqs = 0; /* no of outgoing messages */
2226: msz = 0; /* total mesg length (for all procs) */
2227: w1[rank] = 0; /* no mesg sent to self */
2228: w3[rank] = 0;
2229: for (i=0; i<size; i++) {
2230: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2231: }
2232: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
2233: for (i=0,j=0; i<size; i++) {
2234: if (w1[i]) { pa[j] = i; j++; }
2235: }
2237: /* Each message would have a header = 1 + 2*(no of IS) + data */
2238: for (i=0; i<nrqs; i++) {
2239: j = pa[i];
2240: w1[j] += w2[j] + 2* w3[j];
2241: msz += w1[j];
2242: }
2243: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
2245: /* Determine the number of messages to expect, their lengths, and their from-ids */
2246: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2247: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
2249: /* Now post the Irecvs corresponding to these messages */
2250: tag0 = ((PetscObject)C)->tag;
2251: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
2253: PetscFree(onodes1);
2254: PetscFree(olengths1);
2256: /* Allocate Memory for outgoing messages */
2257: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2258: PetscArrayzero(sbuf1,size);
2259: PetscArrayzero(ptr,size);
2261: {
2262: PetscInt *iptr = tmp;
2263: k = 0;
2264: for (i=0; i<nrqs; i++) {
2265: j = pa[i];
2266: iptr += k;
2267: sbuf1[j] = iptr;
2268: k = w1[j];
2269: }
2270: }
2272: /* Form the outgoing messages. Initialize the header space */
2273: for (i=0; i<nrqs; i++) {
2274: j = pa[i];
2275: sbuf1[j][0] = 0;
2276: PetscArrayzero(sbuf1[j]+1,2*w3[j]);
2277: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
2278: }
2280: /* Parse the isrow and copy data into outbuf */
2281: for (i=0; i<ismax; i++) {
2282: row2proc_i = row2proc[i];
2283: PetscArrayzero(ctr,size);
2284: irow_i = irow[i];
2285: jmax = nrow[i];
2286: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
2287: proc = row2proc_i[j];
2288: if (proc != rank) { /* copy to the outgoing buf*/
2289: ctr[proc]++;
2290: *ptr[proc] = irow_i[j];
2291: ptr[proc]++;
2292: }
2293: }
2294: /* Update the headers for the current IS */
2295: for (j=0; j<size; j++) { /* Can Optimise this loop too */
2296: if ((ctr_j = ctr[j])) {
2297: sbuf1_j = sbuf1[j];
2298: k = ++sbuf1_j[0];
2299: sbuf1_j[2*k] = ctr_j;
2300: sbuf1_j[2*k-1] = i;
2301: }
2302: }
2303: }
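    /* Layout illustration (hypothetical counts, not from the source): a message to one remote
       process that carries 3 rows of IS 0 and 2 rows of IS 4 is packed as
         [ 2,  0,3,  4,2,  r00,r01,r02,  r40,r41 ]
       i.e. sbuf1[j][0] = number of contributing ISs, followed by (IS id, row count) pairs,
       followed by the requested global row indices grouped by IS. */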
2305: /* Now post the sends */
2306: PetscMalloc1(nrqs+1,&s_waits1);
2307: for (i=0; i<nrqs; ++i) {
2308: j = pa[i];
2309: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2310: }
2312: /* Post Receives to capture the buffer size */
2313: PetscMalloc1(nrqs+1,&r_waits2);
2314: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
2315: rbuf2[0] = tmp + msz;
2316: for (i=1; i<nrqs; ++i) {
2317: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2318: }
2319: for (i=0; i<nrqs; ++i) {
2320: j = pa[i];
2321: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2322: }
2324: /* Send to other procs the buf size they should allocate */
2325: /* Receive messages*/
2326: PetscMalloc1(nrqr+1,&s_waits2);
2327: PetscMalloc1(nrqr+1,&r_status1);
2328: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2329: {
2330: PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2331: PetscInt *sbuf2_i;
2333: MPI_Waitall(nrqr,r_waits1,r_status1);
2334: for (i=0; i<nrqr; ++i) {
2335: req_size[i] = 0;
2336: rbuf1_i = rbuf1[i];
2337: start = 2*rbuf1_i[0] + 1;
2338: MPI_Get_count(r_status1+i,MPIU_INT,&end);
2339: PetscMalloc1(end+1,&sbuf2[i]);
2340: sbuf2_i = sbuf2[i];
2341: for (j=start; j<end; j++) {
2342: id = rbuf1_i[j] - rstart;
2343: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2344: sbuf2_i[j] = ncols;
2345: req_size[i] += ncols;
2346: }
2347: req_source1[i] = r_status1[i].MPI_SOURCE;
2348: /* form the header */
2349: sbuf2_i[0] = req_size[i];
2350: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2352: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2353: }
2354: }
2355: PetscFree(r_status1);
2356: PetscFree(r_waits1);
2357: PetscFree4(w1,w2,w3,w4);
2359: /* Receive messages*/
2360: PetscMalloc1(nrqs+1,&r_waits3);
2361: PetscMalloc1(nrqs+1,&r_status2);
2363: MPI_Waitall(nrqs,r_waits2,r_status2);
2364: for (i=0; i<nrqs; ++i) {
2365: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
2366: req_source2[i] = r_status2[i].MPI_SOURCE;
2367: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2368: }
2369: PetscFree(r_status2);
2370: PetscFree(r_waits2);
2372: /* Wait on sends1 and sends2 */
2373: PetscMalloc1(nrqs+1,&s_status1);
2374: PetscMalloc1(nrqr+1,&s_status2);
2376: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
2377: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
2378: PetscFree(s_status1);
2379: PetscFree(s_status2);
2380: PetscFree(s_waits1);
2381: PetscFree(s_waits2);
2383: /* Now allocate sending buffers for a->j, and send them off */
2384: PetscMalloc1(nrqr+1,&sbuf_aj);
2385: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2386: PetscMalloc1(j+1,&sbuf_aj[0]);
2387: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2389: PetscMalloc1(nrqr+1,&s_waits3);
2390: {
2391: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2392: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2393: PetscInt cend = C->cmap->rend;
2394: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2396: for (i=0; i<nrqr; i++) {
2397: rbuf1_i = rbuf1[i];
2398: sbuf_aj_i = sbuf_aj[i];
2399: ct1 = 2*rbuf1_i[0] + 1;
2400: ct2 = 0;
2401: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2402: kmax = rbuf1[i][2*j];
2403: for (k=0; k<kmax; k++,ct1++) {
2404: row = rbuf1_i[ct1] - rstart;
2405: nzA = a_i[row+1] - a_i[row];
2406: nzB = b_i[row+1] - b_i[row];
2407: ncols = nzA + nzB;
2408: cworkA = a_j + a_i[row];
2409: cworkB = b_j + b_i[row];
2411: /* load the column indices for this row into cols */
2412: cols = sbuf_aj_i + ct2;
2414: lwrite = 0;
2415: for (l=0; l<nzB; l++) {
2416: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2417: }
2418: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2419: for (l=0; l<nzB; l++) {
2420: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2421: }
2423: ct2 += ncols;
2424: }
2425: }
2426: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2427: }
2428: }
2429: PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);
2431: /* create col map: global col of C -> local col of submatrices */
2432: {
2433: const PetscInt *icol_i;
2434: #if defined(PETSC_USE_CTABLE)
2435: for (i=0; i<ismax; i++) {
2436: if (!allcolumns[i]) {
2437: PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);
2439: jmax = ncol[i];
2440: icol_i = icol[i];
2441: cmap_i = cmap[i];
2442: for (j=0; j<jmax; j++) {
2443: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2444: }
2445: } else cmap[i] = NULL;
2446: }
2447: #else
2448: for (i=0; i<ismax; i++) {
2449: if (!allcolumns[i]) {
2450: PetscCalloc1(C->cmap->N,&cmap[i]);
2451: jmax = ncol[i];
2452: icol_i = icol[i];
2453: cmap_i = cmap[i];
2454: for (j=0; j<jmax; j++) {
2455: cmap_i[icol_i[j]] = j+1;
2456: }
2457: } else cmap[i] = NULL;
2458: }
2459: #endif
2460: }
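    /* Convention note (illustration): both the PetscTable and the plain-array versions of
       cmap[i] store localcol+1, so a lookup result of 0 means "this global column is not in
       iscol[i]". That is why the code below tests `if (tcol)` and then uses tcol-1 (or --tcol)
       as the local column index of the submatrix. */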
2462: /* Create lens which is required for MatCreate... */
2463: for (i=0,j=0; i<ismax; i++) j += nrow[i];
2464: PetscMalloc1(ismax,&lens);
2466: if (ismax) {
2467: PetscCalloc1(j,&lens[0]);
2468: }
2469: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2471: /* Update lens from local data */
2472: for (i=0; i<ismax; i++) {
2473: row2proc_i = row2proc[i];
2474: jmax = nrow[i];
2475: if (!allcolumns[i]) cmap_i = cmap[i];
2476: irow_i = irow[i];
2477: lens_i = lens[i];
2478: for (j=0; j<jmax; j++) {
2479: row = irow_i[j];
2480: proc = row2proc_i[j];
2481: if (proc == rank) {
2482: MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2483: if (!allcolumns[i]) {
2484: for (k=0; k<ncols; k++) {
2485: #if defined(PETSC_USE_CTABLE)
2486: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2487: #else
2488: tcol = cmap_i[cols[k]];
2489: #endif
2490: if (tcol) lens_i[j]++;
2491: }
2492: } else { /* allcolumns */
2493: lens_i[j] = ncols;
2494: }
2495: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2496: }
2497: }
2498: }
2500: /* Create row map: global row of C -> local row of submatrices */
2501: #if defined(PETSC_USE_CTABLE)
2502: for (i=0; i<ismax; i++) {
2503: PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
2504: irow_i = irow[i];
2505: jmax = nrow[i];
2506: for (j=0; j<jmax; j++) {
2507: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2508: }
2509: }
2510: #else
2511: for (i=0; i<ismax; i++) {
2512: PetscCalloc1(C->rmap->N,&rmap[i]);
2513: rmap_i = rmap[i];
2514: irow_i = irow[i];
2515: jmax = nrow[i];
2516: for (j=0; j<jmax; j++) {
2517: rmap_i[irow_i[j]] = j;
2518: }
2519: }
2520: #endif
2522: /* Update lens from offproc data */
2523: {
2524: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2526: MPI_Waitall(nrqs,r_waits3,r_status3);
2527: for (tmp2=0; tmp2<nrqs; tmp2++) {
2528: sbuf1_i = sbuf1[pa[tmp2]];
2529: jmax = sbuf1_i[0];
2530: ct1 = 2*jmax+1;
2531: ct2 = 0;
2532: rbuf2_i = rbuf2[tmp2];
2533: rbuf3_i = rbuf3[tmp2];
2534: for (j=1; j<=jmax; j++) {
2535: is_no = sbuf1_i[2*j-1];
2536: max1 = sbuf1_i[2*j];
2537: lens_i = lens[is_no];
2538: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2539: rmap_i = rmap[is_no];
2540: for (k=0; k<max1; k++,ct1++) {
2541: #if defined(PETSC_USE_CTABLE)
2542: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2543: row--;
2544: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2545: #else
2546: row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2547: #endif
2548: max2 = rbuf2_i[ct1];
2549: for (l=0; l<max2; l++,ct2++) {
2550: if (!allcolumns[is_no]) {
2551: #if defined(PETSC_USE_CTABLE)
2552: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2553: #else
2554: tcol = cmap_i[rbuf3_i[ct2]];
2555: #endif
2556: if (tcol) lens_i[row]++;
2557: } else { /* allcolumns */
2558: lens_i[row]++; /* lens_i[row] += max2 ? */
2559: }
2560: }
2561: }
2562: }
2563: }
2564: }
2565: PetscFree(r_waits3);
2566: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
2567: PetscFree2(r_status3,s_status3);
2568: PetscFree(s_waits3);
2570: /* Create the submatrices */
2571: for (i=0; i<ismax; i++) {
2572: PetscInt rbs,cbs;
2574: ISGetBlockSize(isrow[i],&rbs);
2575: ISGetBlockSize(iscol[i],&cbs);
2577: MatCreate(PETSC_COMM_SELF,submats+i);
2578: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
2580: MatSetBlockSizes(submats[i],rbs,cbs);
2581: MatSetType(submats[i],((PetscObject)A)->type_name);
2582: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
2584: /* create struct Mat_SubSppt and attach it to submat */
2585: PetscNew(&smat_i);
2586: subc = (Mat_SeqAIJ*)submats[i]->data;
2587: subc->submatis1 = smat_i;
2589: smat_i->destroy = submats[i]->ops->destroy;
2590: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2591: submats[i]->factortype = C->factortype;
2593: smat_i->id = i;
2594: smat_i->nrqs = nrqs;
2595: smat_i->nrqr = nrqr;
2596: smat_i->rbuf1 = rbuf1;
2597: smat_i->rbuf2 = rbuf2;
2598: smat_i->rbuf3 = rbuf3;
2599: smat_i->sbuf2 = sbuf2;
2600: smat_i->req_source2 = req_source2;
2602: smat_i->sbuf1 = sbuf1;
2603: smat_i->ptr = ptr;
2604: smat_i->tmp = tmp;
2605: smat_i->ctr = ctr;
2607: smat_i->pa = pa;
2608: smat_i->req_size = req_size;
2609: smat_i->req_source1 = req_source1;
2611: smat_i->allcolumns = allcolumns[i];
2612: smat_i->singleis = PETSC_FALSE;
2613: smat_i->row2proc = row2proc[i];
2614: smat_i->rmap = rmap[i];
2615: smat_i->cmap = cmap[i];
2616: }
2618: if (!ismax) { /* Create dummy submats[0] to carry the struct Mat_SubSppt needed for reuse */
2619: MatCreate(PETSC_COMM_SELF,&submats[0]);
2620: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2621: MatSetType(submats[0],MATDUMMY);
2623: /* create struct Mat_SubSppt and attach it to submat */
2624: PetscNewLog(submats[0],&smat_i);
2625: submats[0]->data = (void*)smat_i;
2627: smat_i->destroy = submats[0]->ops->destroy;
2628: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2629: submats[0]->factortype = C->factortype;
2631: smat_i->id = 0;
2632: smat_i->nrqs = nrqs;
2633: smat_i->nrqr = nrqr;
2634: smat_i->rbuf1 = rbuf1;
2635: smat_i->rbuf2 = rbuf2;
2636: smat_i->rbuf3 = rbuf3;
2637: smat_i->sbuf2 = sbuf2;
2638: smat_i->req_source2 = req_source2;
2640: smat_i->sbuf1 = sbuf1;
2641: smat_i->ptr = ptr;
2642: smat_i->tmp = tmp;
2643: smat_i->ctr = ctr;
2645: smat_i->pa = pa;
2646: smat_i->req_size = req_size;
2647: smat_i->req_source1 = req_source1;
2649: smat_i->allcolumns = PETSC_FALSE;
2650: smat_i->singleis = PETSC_FALSE;
2651: smat_i->row2proc = NULL;
2652: smat_i->rmap = NULL;
2653: smat_i->cmap = NULL;
2654: }
2656: if (ismax) {PetscFree(lens[0]);}
2657: PetscFree(lens);
2658: PetscFree(sbuf_aj[0]);
2659: PetscFree(sbuf_aj);
2661: } /* end of scall == MAT_INITIAL_MATRIX */
2663: /* Post recv matrix values */
2664: PetscObjectGetNewTag((PetscObject)C,&tag4);
2665: PetscMalloc1(nrqs+1,&rbuf4);
2666: PetscMalloc1(nrqs+1,&r_waits4);
2667: PetscMalloc1(nrqs+1,&r_status4);
2668: PetscMalloc1(nrqr+1,&s_status4);
2669: for (i=0; i<nrqs; ++i) {
2670: PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
2671: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2672: }
2674: /* Allocate sending buffers for a->a, and send them off */
2675: PetscMalloc1(nrqr+1,&sbuf_aa);
2676: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2677: PetscMalloc1(j+1,&sbuf_aa[0]);
2678: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2680: PetscMalloc1(nrqr+1,&s_waits4);
2681: {
2682: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2683: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2684: PetscInt cend = C->cmap->rend;
2685: PetscInt *b_j = b->j;
2686: PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2688: MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
2689: MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);
2690: for (i=0; i<nrqr; i++) {
2691: rbuf1_i = rbuf1[i];
2692: sbuf_aa_i = sbuf_aa[i];
2693: ct1 = 2*rbuf1_i[0]+1;
2694: ct2 = 0;
2695: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2696: kmax = rbuf1_i[2*j];
2697: for (k=0; k<kmax; k++,ct1++) {
2698: row = rbuf1_i[ct1] - rstart;
2699: nzA = a_i[row+1] - a_i[row];
2700: nzB = b_i[row+1] - b_i[row];
2701: ncols = nzA + nzB;
2702: cworkB = b_j + b_i[row];
2703: vworkA = a_a + a_i[row];
2704: vworkB = b_a + b_i[row];
2706: /* load the column values for this row into vals*/
2707: vals = sbuf_aa_i+ct2;
2709: lwrite = 0;
2710: for (l=0; l<nzB; l++) {
2711: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2712: }
2713: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2714: for (l=0; l<nzB; l++) {
2715: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2716: }
2718: ct2 += ncols;
2719: }
2720: }
2721: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2722: }
2723: MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
2724: MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);
2725: }
2727: /* Assemble the matrices */
2728: /* First assemble the local rows */
2729: for (i=0; i<ismax; i++) {
2730: row2proc_i = row2proc[i];
2731: subc = (Mat_SeqAIJ*)submats[i]->data;
2732: imat_ilen = subc->ilen;
2733: imat_j = subc->j;
2734: imat_i = subc->i;
2735: imat_a = subc->a;
2737: if (!allcolumns[i]) cmap_i = cmap[i];
2738: rmap_i = rmap[i];
2739: irow_i = irow[i];
2740: jmax = nrow[i];
2741: for (j=0; j<jmax; j++) {
2742: row = irow_i[j];
2743: proc = row2proc_i[j];
2744: if (proc == rank) {
2745: old_row = row;
2746: #if defined(PETSC_USE_CTABLE)
2747: PetscTableFind(rmap_i,row+1,&row);
2748: row--;
2749: #else
2750: row = rmap_i[row];
2751: #endif
2752: ilen_row = imat_ilen[row];
2753: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2754: mat_i = imat_i[row];
2755: mat_a = imat_a + mat_i;
2756: mat_j = imat_j + mat_i;
2757: if (!allcolumns[i]) {
2758: for (k=0; k<ncols; k++) {
2759: #if defined(PETSC_USE_CTABLE)
2760: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2761: #else
2762: tcol = cmap_i[cols[k]];
2763: #endif
2764: if (tcol) {
2765: *mat_j++ = tcol - 1;
2766: *mat_a++ = vals[k];
2767: ilen_row++;
2768: }
2769: }
2770: } else { /* allcolumns */
2771: for (k=0; k<ncols; k++) {
2772: *mat_j++ = cols[k]; /* global col index! */
2773: *mat_a++ = vals[k];
2774: ilen_row++;
2775: }
2776: }
2777: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2779: imat_ilen[row] = ilen_row;
2780: }
2781: }
2782: }
2784: /* Now assemble the off-proc rows */
2785: MPI_Waitall(nrqs,r_waits4,r_status4);
2786: for (tmp2=0; tmp2<nrqs; tmp2++) {
2787: sbuf1_i = sbuf1[pa[tmp2]];
2788: jmax = sbuf1_i[0];
2789: ct1 = 2*jmax + 1;
2790: ct2 = 0;
2791: rbuf2_i = rbuf2[tmp2];
2792: rbuf3_i = rbuf3[tmp2];
2793: rbuf4_i = rbuf4[tmp2];
2794: for (j=1; j<=jmax; j++) {
2795: is_no = sbuf1_i[2*j-1];
2796: rmap_i = rmap[is_no];
2797: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2798: subc = (Mat_SeqAIJ*)submats[is_no]->data;
2799: imat_ilen = subc->ilen;
2800: imat_j = subc->j;
2801: imat_i = subc->i;
2802: imat_a = subc->a;
2803: max1 = sbuf1_i[2*j];
2804: for (k=0; k<max1; k++,ct1++) {
2805: row = sbuf1_i[ct1];
2806: #if defined(PETSC_USE_CTABLE)
2807: PetscTableFind(rmap_i,row+1,&row);
2808: row--;
2809: #else
2810: row = rmap_i[row];
2811: #endif
2812: ilen = imat_ilen[row];
2813: mat_i = imat_i[row];
2814: mat_a = imat_a + mat_i;
2815: mat_j = imat_j + mat_i;
2816: max2 = rbuf2_i[ct1];
2817: if (!allcolumns[is_no]) {
2818: for (l=0; l<max2; l++,ct2++) {
2819: #if defined(PETSC_USE_CTABLE)
2820: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2821: #else
2822: tcol = cmap_i[rbuf3_i[ct2]];
2823: #endif
2824: if (tcol) {
2825: *mat_j++ = tcol - 1;
2826: *mat_a++ = rbuf4_i[ct2];
2827: ilen++;
2828: }
2829: }
2830: } else { /* allcolumns */
2831: for (l=0; l<max2; l++,ct2++) {
2832: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2833: *mat_a++ = rbuf4_i[ct2];
2834: ilen++;
2835: }
2836: }
2837: imat_ilen[row] = ilen;
2838: }
2839: }
2840: }
2842: if (!iscsorted) { /* sort column indices of the rows */
2843: for (i=0; i<ismax; i++) {
2844: subc = (Mat_SeqAIJ*)submats[i]->data;
2845: imat_j = subc->j;
2846: imat_i = subc->i;
2847: imat_a = subc->a;
2848: imat_ilen = subc->ilen;
2850: if (allcolumns[i]) continue;
2851: jmax = nrow[i];
2852: for (j=0; j<jmax; j++) {
2853: mat_i = imat_i[j];
2854: mat_a = imat_a + mat_i;
2855: mat_j = imat_j + mat_i;
2856: PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2857: }
2858: }
2859: }
2861: PetscFree(r_status4);
2862: PetscFree(r_waits4);
2863: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
2864: PetscFree(s_waits4);
2865: PetscFree(s_status4);
2867: /* Restore the indices */
2868: for (i=0; i<ismax; i++) {
2869: ISRestoreIndices(isrow[i],irow+i);
2870: if (!allcolumns[i]) {
2871: ISRestoreIndices(iscol[i],icol+i);
2872: }
2873: }
2875: for (i=0; i<ismax; i++) {
2876: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2877: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2878: }
2880: /* Destroy allocated memory */
2881: PetscFree(sbuf_aa[0]);
2882: PetscFree(sbuf_aa);
2883: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
2885: for (i=0; i<nrqs; ++i) {
2886: PetscFree(rbuf4[i]);
2887: }
2888: PetscFree(rbuf4);
2890: PetscFree4(row2proc,cmap,rmap,allcolumns);
2891: return(0);
2892: }
2894: /*
2895: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2896: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2897: of [0,m), dcolemb is in [0,n), and ocolemb is in [0,N-n).
2898: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2899: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2900: state, and needs to be "assembled" later by compressing B's column space.
2902: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2903: Following this call, C->A & C->B have been created, even if empty.
2904: */
2905: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2906: {
2907: /* If making this function public, change the error returned in this function away from _PLIB. */
2909: Mat_MPIAIJ *aij;
2910: Mat_SeqAIJ *Baij;
2911: PetscBool seqaij,Bdisassembled;
2912: PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2913: PetscScalar v;
2914: const PetscInt *rowindices,*colindices;
2917: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2918: if (A) {
2919: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2920: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2921: if (rowemb) {
2922: ISGetLocalSize(rowemb,&m);
2923: if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2924: } else {
2925: if (C->rmap->n != A->rmap->n) {
2926: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2927: }
2928: }
2929: if (dcolemb) {
2930: ISGetLocalSize(dcolemb,&n);
2931: if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2932: } else {
2933: if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2934: }
2935: }
2936: if (B) {
2937: PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2938: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2939: if (rowemb) {
2940: ISGetLocalSize(rowemb,&m);
2941: if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2942: } else {
2943: if (C->rmap->n != B->rmap->n) {
2944: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2945: }
2946: }
2947: if (ocolemb) {
2948: ISGetLocalSize(ocolemb,&n);
2949: if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2950: } else {
2951: if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2952: }
2953: }
2955: aij = (Mat_MPIAIJ*)C->data;
2956: if (!aij->A) {
2957: /* Mimic parts of MatMPIAIJSetPreallocation() */
2958: MatCreate(PETSC_COMM_SELF,&aij->A);
2959: MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2960: MatSetBlockSizesFromMats(aij->A,C,C);
2961: MatSetType(aij->A,MATSEQAIJ);
2962: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2963: }
2964: if (A) {
2965: MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2966: } else {
2967: MatSetUp(aij->A);
2968: }
2969: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2970: /*
2971: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2972: need to "disassemble" B -- convert it to using C's global indices.
2973: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2975: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2976: we call MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2978: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2979: At least avoid calling MatSetValues() and the implied searches?
2980: */
2982: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2983: #if defined(PETSC_USE_CTABLE)
2984: PetscTableDestroy(&aij->colmap);
2985: #else
2986: PetscFree(aij->colmap);
2987: /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2988: if (aij->B) {
2989: PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2990: }
2991: #endif
2992: ngcol = 0;
2993: if (aij->lvec) {
2994: VecGetSize(aij->lvec,&ngcol);
2995: }
2996: if (aij->garray) {
2997: PetscFree(aij->garray);
2998: PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2999: }
3000: VecDestroy(&aij->lvec);
3001: VecScatterDestroy(&aij->Mvctx);
3002: }
3003: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3004: MatDestroy(&aij->B);
3005: }
3006: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3007: MatZeroEntries(aij->B);
3008: }
3009: }
3010: Bdisassembled = PETSC_FALSE;
3011: if (!aij->B) {
3012: MatCreate(PETSC_COMM_SELF,&aij->B);
3013: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
3014: MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
3015: MatSetBlockSizesFromMats(aij->B,B,B);
3016: MatSetType(aij->B,MATSEQAIJ);
3017: Bdisassembled = PETSC_TRUE;
3018: }
3019: if (B) {
3020: Baij = (Mat_SeqAIJ*)B->data;
3021: if (pattern == DIFFERENT_NONZERO_PATTERN) {
3022: PetscMalloc1(B->rmap->n,&nz);
3023: for (i=0; i<B->rmap->n; i++) {
3024: nz[i] = Baij->i[i+1] - Baij->i[i];
3025: }
3026: MatSeqAIJSetPreallocation(aij->B,0,nz);
3027: PetscFree(nz);
3028: }
3030: PetscLayoutGetRange(C->rmap,&rstart,&rend);
3031: shift = rend-rstart;
3032: count = 0;
3033: rowindices = NULL;
3034: colindices = NULL;
3035: if (rowemb) {
3036: ISGetIndices(rowemb,&rowindices);
3037: }
3038: if (ocolemb) {
3039: ISGetIndices(ocolemb,&colindices);
3040: }
3041: for (i=0; i<B->rmap->n; i++) {
3042: PetscInt row;
3043: row = i;
3044: if (rowindices) row = rowindices[i];
3045: for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3046: col = Baij->j[count];
3047: if (colindices) col = colindices[col];
3048: if (Bdisassembled && col>=rstart) col += shift;
3049: v = Baij->a[count];
3050: MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
3051: ++count;
3052: }
3053: }
3054: /* No assembly for aij->B is necessary. */
3055: /* FIXME: set aij->B's nonzerostate correctly. */
3056: } else {
3057: MatSetUp(aij->B);
3058: }
3059: C->preallocated = PETSC_TRUE;
3060: C->was_assembled = PETSC_FALSE;
3061: C->assembled = PETSC_FALSE;
3062: /*
3063: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3064: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3065: */
3066: return(0);
3067: }
3069: /*
3070: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3071: */
3072: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3073: {
3074: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3079: /* FIXME: make sure C is assembled */
3080: *A = aij->A;
3081: *B = aij->B;
3082: /* Note that we don't incref *A and *B, so be careful! */
3083: return(0);
3084: }
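/*
   Illustrative sketch (not from mpiov.c) of interpreting B's compressed column indices through
   garray, as the comment above notes; k is a hypothetical position into the CSR arrays of the
   off-diagonal block of an assembled MPIAIJ matrix C:

     Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)C->data;
     Mat_SeqAIJ     *Bseq   = (Mat_SeqAIJ*)aij->B->data;
     const PetscInt *garray = aij->garray;
     PetscInt        loc    = Bseq->j[k];    // local (compressed) column index of B
     PetscInt        glob   = garray[loc];   // corresponding global column index of C
*/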
3086: /*
3087: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3088: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3089: */
3090: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3091: PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3092: PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3093: PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3094: PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3095: {
3097: PetscMPIInt size,flag;
3098: PetscInt i,ii,cismax,ispar;
3099: Mat *A,*B;
3100: IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3103: if (!ismax) return(0);
3105: for (i = 0, cismax = 0; i < ismax; ++i) {
3106: MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3107: if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3108: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3109: if (size > 1) ++cismax;
3110: }
3112: /*
3113: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3114: ispar is the maximum over C's ranks of the number of parallel ISs; it is nonzero iff some rank has one.
3115: */
3116: MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3117: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3118: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3119: return(0);
3120: }
3122: /* if (ispar) */
3123: /*
3124: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3125: These are used to extract the off-diag portion of the resulting parallel matrix.
3126: The row IS for the off-diag portion is the same as for the diag portion,
3127: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3128: */
3129: PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3130: PetscMalloc1(cismax,&ciscol_p);
3131: for (i = 0, ii = 0; i < ismax; ++i) {
3132: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3133: if (size > 1) {
3134: /*
3135: TODO: This is the part that's ***NOT SCALABLE***.
3136: To fix this we need to extract just the indices of C's nonzero columns
3137: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3138: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3139: to be done without serializing on the IS list, so, most likely, it is best
3140: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3141: */
3142: ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3143: /* Now we have to
3144: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3145: were sorted on each rank, concatenated they might no longer be sorted;
3146: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3147: indices in the nondecreasing order to the original index positions.
3148: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3149: */
3150: ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3151: ISSort(ciscol[ii]);
3152: ++ii;
3153: }
3154: }
3155: PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3156: for (i = 0, ii = 0; i < ismax; ++i) {
3157: PetscInt j,issize;
3158: const PetscInt *indices;
3160: /*
3161: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3162: */
3163: ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3164: ISSort(isrow[i]);
3165: ISGetLocalSize(isrow[i],&issize);
3166: ISGetIndices(isrow[i],&indices);
3167: for (j = 1; j < issize; ++j) {
3168: if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3169: }
3170: ISRestoreIndices(isrow[i],&indices);
3171: ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3172: ISSort(iscol[i]);
3173: ISGetLocalSize(iscol[i],&issize);
3174: ISGetIndices(iscol[i],&indices);
3175: for (j = 1; j < issize; ++j) {
3176: if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3177: }
3178: ISRestoreIndices(iscol[i],&indices);
3179: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3180: if (size > 1) {
3181: cisrow[ii] = isrow[i];
3182: ++ii;
3183: }
3184: }
3185: /*
3186: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3187: array of sequential matrices underlying the resulting parallel matrices.
3188: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3189: contain duplicates.
3191: There are as many diag matrices as there are original index sets. There are only as many parallel
3192: and off-diag matrices as there are parallel (comm size > 1) index sets.
3194: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3195: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3196: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3197: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3198: with A[i] and B[ii] extracted from the corresponding MPI submat.
3199: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3200: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3201: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3202: (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3203: values into A[i] and B[ii] sitting inside the corresponding submat.
3204: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3205: A[i], B[ii], AA[i] or BB[ii] matrices.
3206: */
3207: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3208: if (scall == MAT_INITIAL_MATRIX) {
3209: PetscMalloc1(ismax,submat);
3210: }
3212: /* Now obtain the sequential A and B submatrices separately. */
3213: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3214: (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3215: (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);
3217: /*
3218: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3219: matrices A & B have been extracted directly into the parallel matrices containing them, or
3220: simply into the sequential matrix identical with the corresponding A (if size == 1).
3221: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3222: to have the same sparsity pattern.
3223: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3224: must be constructed for C. This is done by setseqmat(s).
3225: */
3226: for (i = 0, ii = 0; i < ismax; ++i) {
3227: /*
3228: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3229: That way we can avoid sorting and computing permutations when reusing.
3230: To this end:
3231: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3232: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3233: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3234: */
3235: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3237: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3238: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3239: if (size > 1) {
3240: if (scall == MAT_INITIAL_MATRIX) {
3241: MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3242: MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3243: MatSetType((*submat)[i],MATMPIAIJ);
3244: PetscLayoutSetUp((*submat)[i]->rmap);
3245: PetscLayoutSetUp((*submat)[i]->cmap);
3246: }
3247: /*
3248: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3249: */
3250: {
3251: Mat AA = A[i],BB = B[ii];
3253: if (AA || BB) {
3254: setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3255: MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3256: MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3257: }
3258: MatDestroy(&AA);
3259: }
3260: ISDestroy(ciscol+ii);
3261: ISDestroy(ciscol_p+ii);
3262: ++ii;
3263: } else { /* if (size == 1) */
3264: if (scall == MAT_REUSE_MATRIX) {
3265: MatDestroy(&(*submat)[i]);
3266: }
3267: if (isrow_p[i] || iscol_p[i]) {
3268: MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3269: setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3270: /* Otherwise A is extracted straight into (*submat)[i]. */
3271: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3272: MatDestroy(A+i);
3273: } else (*submat)[i] = A[i];
3274: }
3275: ISDestroy(&isrow_p[i]);
3276: ISDestroy(&iscol_p[i]);
3277: }
3278: PetscFree2(cisrow,ciscol);
3279: PetscFree2(isrow_p,iscol_p);
3280: PetscFree(ciscol_p);
3281: PetscFree(A);
3282: MatDestroySubMatrices(cismax,&B);
3283: return(0);
3284: }
3286: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3287: {
3291: MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3292: return(0);
3293: }
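/*
   Illustrative usage (a sketch, not from mpiov.c; isrow/iscol here are parallel ISs that may
   live on subcommunicators of C, and ismax is the number of requested submatrices):

     Mat *msub;
     MatCreateSubMatricesMPI(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&msub);
     // msub[i] lives on the communicator of isrow[i]: an MPIAIJ matrix when that
     // communicator has more than one rank, a SeqAIJ matrix otherwise
*/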