Actual source code: mpiov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:    and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);

 21: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
 22: {
 23:   PetscInt i;

 26:   for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once(C, imax, is);
 27:   return 0;
 28: }
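
/*
   Illustrative usage sketch (added for exposition, not part of mpiov.c):
   MatIncreaseOverlap_MPIAIJ() is reached through the public MatIncreaseOverlap()
   interface, for example when building overlapping subdomains for a Schwarz-type
   preconditioner. The matrix A and the index set below are hypothetical; only the
   PETSc calls shown are assumed.

     IS       is[1];
     PetscInt rstart, rend, ov = 2;

     MatGetOwnershipRange(A, &rstart, &rend);
     ISCreateStride(PETSC_COMM_SELF, rend - rstart, rstart, 1, &is[0]); // start from the locally owned rows
     MatIncreaseOverlap(A, 1, is, ov);                                  // grow the index set by ov levels of overlap
     ISDestroy(&is[0]);
*/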

 30: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
 31: {
 32:   PetscInt i;

 35:   for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is);
 36:   return 0;
 37: }

 39: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
 40: {
 41:   MPI_Comm        comm;
 42:   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
 43:   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
 44:   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
 45:   const PetscInt *indices_i, **indices;
 46:   PetscLayout     rmap;
 47:   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
 48:   PetscSF         sf;
 49:   PetscSFNode    *remote;

 51:   PetscObjectGetComm((PetscObject)mat, &comm);
 52:   MPI_Comm_rank(comm, &rank);
 53:   MPI_Comm_size(comm, &size);
 54:   /* get row map to determine where rows should be going */
 55:   MatGetLayouts(mat, &rmap, NULL);
 56:   /* retrieve the IS data and put it all together so that we
 57:    * can optimize communication
 58:    */
 59:   PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length);
 60:   for (i = 0, tlength = 0; i < nidx; i++) {
 61:     ISGetLocalSize(is[i], &length[i]);
 62:     tlength += length[i];
 63:     ISGetIndices(is[i], &indices[i]);
 64:   }
 65:   /* find these rows on remote processors */
 66:   PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids);
 67:   PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp);
 68:   nrrows = 0;
 69:   for (i = 0; i < nidx; i++) {
 70:     length_i  = length[i];
 71:     indices_i = indices[i];
 72:     for (j = 0; j < length_i; j++) {
 73:       owner = -1;
 74:       PetscLayoutFindOwner(rmap, indices_i[j], &owner);
 75:       /* remote processors */
 76:       if (owner != rank) {
 77:         tosizes_temp[owner]++;               /* number of rows to owner */
 78:         rrow_ranks[nrrows]   = owner;        /* processor */
 79:         rrow_isids[nrrows]   = i;            /* is id */
 80:         remoterows[nrrows++] = indices_i[j]; /* row */
 81:       }
 82:     }
 83:     ISRestoreIndices(is[i], &indices[i]);
 84:   }
 85:   PetscFree2(*(PetscInt ***)&indices, length);
 86:   /* test if we need to exchange messages;
 87:    * generally speaking, we do not need to exchange
 88:    * data when overlap is 1
 89:    */
 90:   MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm);
 91:   /* we do not have any messages;
 92:    * this usually corresponds to overlap 1
 93:    */
 94:   if (!reducednrrows) {
 95:     PetscFree3(toranks, tosizes, tosizes_temp);
 96:     PetscFree3(remoterows, rrow_ranks, rrow_isids);
 97:     MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
 98:     return 0;
 99:   }
100:   nto = 0;
101:   /* send sizes and ranks for building a two-sided communication */
102:   for (i = 0; i < size; i++) {
103:     if (tosizes_temp[i]) {
104:       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
105:       tosizes_temp[i]  = nto;                 /* a map from processor to index */
106:       toranks[nto++]   = i;                   /* processor */
107:     }
108:   }
109:   PetscMalloc1(nto + 1, &toffsets);
110:   toffsets[0] = 0;
111:   for (i = 0; i < nto; i++) {
112:     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
113:     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
114:   }
115:   /* send information to other processors */
116:   PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes);
117:   nrecvrows = 0;
118:   for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
119:   PetscMalloc1(nrecvrows, &remote);
120:   nrecvrows = 0;
121:   for (i = 0; i < nfrom; i++) {
122:     for (j = 0; j < fromsizes[2 * i]; j++) {
123:       remote[nrecvrows].rank    = fromranks[i];
124:       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
125:     }
126:   }
127:   PetscSFCreate(comm, &sf);
128:   PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
129:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
130:   PetscSFSetType(sf, PETSCSFBASIC);
131:   PetscSFSetFromOptions(sf);
132:   /* message pair <IS id, row> */
133:   PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata);
134:   for (i = 0; i < nrrows; i++) {
135:     owner                 = rrow_ranks[i];       /* processor */
136:     j                     = tosizes_temp[owner]; /* index */
137:     todata[toffsets[j]++] = rrow_isids[i];
138:     todata[toffsets[j]++] = remoterows[i];
139:   }
140:   PetscFree3(toranks, tosizes, tosizes_temp);
141:   PetscFree3(remoterows, rrow_ranks, rrow_isids);
142:   PetscFree(toffsets);
143:   PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
144:   PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
145:   PetscSFDestroy(&sf);
146:   /* send rows belonging to remote processors so that we can get the overlapping data back */
147:   MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata);
148:   PetscFree2(todata, fromdata);
149:   PetscFree(fromsizes);
150:   PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes);
151:   PetscFree(fromranks);
152:   nrecvrows = 0;
153:   for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
154:   PetscCalloc1(nrecvrows, &todata);
155:   PetscMalloc1(nrecvrows, &remote);
156:   nrecvrows = 0;
157:   for (i = 0; i < nto; i++) {
158:     for (j = 0; j < tosizes[2 * i]; j++) {
159:       remote[nrecvrows].rank    = toranks[i];
160:       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
161:     }
162:   }
163:   PetscSFCreate(comm, &sf);
164:   PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
165:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
166:   PetscSFSetType(sf, PETSCSFBASIC);
167:   PetscSFSetFromOptions(sf);
168:   /* overlap communication and computation */
169:   PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
170:   MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
171:   PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
172:   PetscSFDestroy(&sf);
173:   PetscFree2(sbdata, sbsizes);
174:   MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata);
175:   PetscFree(toranks);
176:   PetscFree(tosizes);
177:   PetscFree(todata);
178:   return 0;
179: }
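
/*
   Minimal sketch (added for exposition, not part of mpiov.c) of the PetscSF
   exchange pattern used above, assuming each rank already knows the remote
   (rank, index) pairs it must read. The variable names are hypothetical; the
   PetscSF calls are the ones used in this file.

     PetscSF      sf;
     PetscSFNode *remote;                 // one entry per leaf: (rank, index) of the root it mirrors
     PetscInt    *rootdata, *leafdata, nroots, nleaves;

     PetscSFCreate(comm, &sf);
     PetscSFSetGraph(sf, nroots, nleaves, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
     PetscSFSetFromOptions(sf);
     PetscSFBcastBegin(sf, MPIU_INT, rootdata, leafdata, MPI_REPLACE); // root values -> leaves
     PetscSFBcastEnd(sf, MPIU_INT, rootdata, leafdata, MPI_REPLACE);
     PetscSFDestroy(&sf);
*/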

181: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
182: {
183:   PetscInt       *isz, isz_i, i, j, is_id, data_size;
184:   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
185:   const PetscInt *indices_i_temp;
186:   MPI_Comm       *iscomms;

188:   max_lsize = 0;
189:   PetscMalloc1(nidx, &isz);
190:   for (i = 0; i < nidx; i++) {
191:     ISGetLocalSize(is[i], &lsize);
192:     max_lsize = lsize > max_lsize ? lsize : max_lsize;
193:     isz[i]    = lsize;
194:   }
195:   PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms);
196:   for (i = 0; i < nidx; i++) {
197:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
198:     ISGetIndices(is[i], &indices_i_temp);
199:     PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]);
200:     ISRestoreIndices(is[i], &indices_i_temp);
201:     ISDestroy(&is[i]);
202:   }
203:   /* retrieve information to get row id and its overlap */
204:   for (i = 0; i < nrecvs;) {
205:     is_id     = recvdata[i++];
206:     data_size = recvdata[i++];
207:     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
208:     isz_i     = isz[is_id];
209:     for (j = 0; j < data_size; j++) {
210:       col                = recvdata[i++];
211:       indices_i[isz_i++] = col;
212:     }
213:     isz[is_id] = isz_i;
214:   }
215:   /* remove duplicate entries */
216:   for (i = 0; i < nidx; i++) {
217:     indices_i = indices_temp + (max_lsize + nrecvs) * i;
218:     isz_i     = isz[i];
219:     PetscSortRemoveDupsInt(&isz_i, indices_i);
220:     ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]);
221:     PetscCommDestroy(&iscomms[i]);
222:   }
223:   PetscFree(isz);
224:   PetscFree2(indices_temp, iscomms);
225:   return 0;
226: }

228: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
229: {
230:   PetscLayout     rmap, cmap;
231:   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
232:   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
233:   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
234:   const PetscInt *gcols, *ai, *aj, *bi, *bj;
235:   Mat             amat, bmat;
236:   PetscMPIInt     rank;
237:   PetscBool       done;
238:   MPI_Comm        comm;

240:   PetscObjectGetComm((PetscObject)mat, &comm);
241:   MPI_Comm_rank(comm, &rank);
242:   MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
243:   /* Even if the mat is symmetric, we still treat it as nonsymmetric */
244:   MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
246:   MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
248:   /* total number of nonzero values is used to estimate the memory usage in the next step */
249:   tnz = ai[an] + bi[bn];
250:   MatGetLayouts(mat, &rmap, &cmap);
251:   PetscLayoutGetRange(rmap, &rstart, NULL);
252:   PetscLayoutGetRange(cmap, &cstart, NULL);
253:   /* to find the longest message */
254:   max_fszs = 0;
255:   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
256:   /* is there a better way to estimate the number of nonzeros in the mat? */
257:   PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp);
258:   for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
259:   rows_pos  = 0;
260:   totalrows = 0;
261:   for (i = 0; i < nfrom; i++) {
262:     PetscArrayzero(rows_pos_i, nidx);
263:     /* group data together */
264:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
265:       is_id                       = fromrows[rows_pos++]; /* no of is */
266:       rows_i                      = rows_data[is_id];
267:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
268:     }
269:     /* estimate space to avoid multiple allocations */
270:     for (j = 0; j < nidx; j++) {
271:       indvc_ij = 0;
272:       rows_i   = rows_data[j];
273:       for (l = 0; l < rows_pos_i[j]; l++) {
274:         row   = rows_i[l] - rstart;
275:         start = ai[row];
276:         end   = ai[row + 1];
277:         for (k = start; k < end; k++) { /* Amat */
278:           col                     = aj[k] + cstart;
279:           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
280:         }
281:         start = bi[row];
282:         end   = bi[row + 1];
283:         for (k = start; k < end; k++) { /* Bmat */
284:           col                     = gcols[bj[k]];
285:           indices_tmp[indvc_ij++] = col;
286:         }
287:       }
288:       PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
289:       indv_counts[i * nidx + j] = indvc_ij;
290:       totalrows += indvc_ij;
291:     }
292:   }
293:   /* message triple <IS id, number of rows, rows> */
294:   PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes);
295:   totalrows = 0;
296:   rows_pos  = 0;
297:   /* reuse the grouping code from above */
298:   for (i = 0; i < nfrom; i++) {
299:     PetscArrayzero(rows_pos_i, nidx);
300:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
301:       is_id                       = fromrows[rows_pos++];
302:       rows_i                      = rows_data[is_id];
303:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
304:     }
305:     /* add data  */
306:     for (j = 0; j < nidx; j++) {
307:       if (!indv_counts[i * nidx + j]) continue;
308:       indvc_ij            = 0;
309:       sbdata[totalrows++] = j;
310:       sbdata[totalrows++] = indv_counts[i * nidx + j];
311:       sbsizes[2 * i] += 2;
312:       rows_i = rows_data[j];
313:       for (l = 0; l < rows_pos_i[j]; l++) {
314:         row   = rows_i[l] - rstart;
315:         start = ai[row];
316:         end   = ai[row + 1];
317:         for (k = start; k < end; k++) { /* Amat */
318:           col                     = aj[k] + cstart;
319:           indices_tmp[indvc_ij++] = col;
320:         }
321:         start = bi[row];
322:         end   = bi[row + 1];
323:         for (k = start; k < end; k++) { /* Bmat */
324:           col                     = gcols[bj[k]];
325:           indices_tmp[indvc_ij++] = col;
326:         }
327:       }
328:       PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
329:       sbsizes[2 * i] += indvc_ij;
330:       PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij);
331:       totalrows += indvc_ij;
332:     }
333:   }
334:   PetscMalloc1(nfrom + 1, &offsets);
335:   offsets[0] = 0;
336:   for (i = 0; i < nfrom; i++) {
337:     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
338:     sbsizes[2 * i + 1] = offsets[i];
339:   }
340:   PetscFree(offsets);
341:   if (sbrowsizes) *sbrowsizes = sbsizes;
342:   if (sbrows) *sbrows = sbdata;
343:   PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp);
344:   MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
345:   MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
346:   return 0;
347: }

349: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
350: {
351:   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
352:   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
353:   PetscInt        lsize, lsize_tmp;
354:   PetscMPIInt     rank, owner;
355:   Mat             amat, bmat;
356:   PetscBool       done;
357:   PetscLayout     cmap, rmap;
358:   MPI_Comm        comm;

360:   PetscObjectGetComm((PetscObject)mat, &comm);
361:   MPI_Comm_rank(comm, &rank);
362:   MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
363:   MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
365:   MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
367:   /* is this a safe way to compute the number of nonzero values? */
368:   tnz = ai[an] + bi[bn];
369:   MatGetLayouts(mat, &rmap, &cmap);
370:   PetscLayoutGetRange(rmap, &rstart, NULL);
371:   PetscLayoutGetRange(cmap, &cstart, NULL);
372:   /* this is a better way to estimate memory than the old implementation,
373:    * which used the global size of the matrix
374:    */
375:   PetscMalloc1(tnz, &indices_temp);
376:   for (i = 0; i < nidx; i++) {
377:     MPI_Comm iscomm;

379:     ISGetLocalSize(is[i], &lsize);
380:     ISGetIndices(is[i], &indices);
381:     lsize_tmp = 0;
382:     for (j = 0; j < lsize; j++) {
383:       owner = -1;
384:       row   = indices[j];
385:       PetscLayoutFindOwner(rmap, row, &owner);
386:       if (owner != rank) continue;
387:       /* local number */
388:       row -= rstart;
389:       start = ai[row];
390:       end   = ai[row + 1];
391:       for (k = start; k < end; k++) { /* Amat */
392:         col                       = aj[k] + cstart;
393:         indices_temp[lsize_tmp++] = col;
394:       }
395:       start = bi[row];
396:       end   = bi[row + 1];
397:       for (k = start; k < end; k++) { /* Bmat */
398:         col                       = gcols[bj[k]];
399:         indices_temp[lsize_tmp++] = col;
400:       }
401:     }
402:     ISRestoreIndices(is[i], &indices);
403:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL);
404:     ISDestroy(&is[i]);
405:     PetscSortRemoveDupsInt(&lsize_tmp, indices_temp);
406:     ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]);
407:     PetscCommDestroy(&iscomm);
408:   }
409:   PetscFree(indices_temp);
410:   MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
411:   MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
412:   return 0;
413: }

415: /*
416:   Sample message format:
417:   If a processor A wants processor B to process some elements corresponding
418:   to index sets is[1],is[5]
419:   mesg [0] = 2   (no of index sets in the mesg)
420:   -----------
421:   mesg [1] = 1 => is[1]
422:   mesg [2] = sizeof(is[1]);
423:   -----------
424:   mesg [3] = 5  => is[5]
425:   mesg [4] = sizeof(is[5]);
426:   -----------
427:   mesg [5]
428:   mesg [n]  data(is[1])
429:   -----------
430:   mesg[n+1]
431:   mesg[m]  data(is[5])
432:   -----------

434:   Notes:
435:   nrqs - no of requests sent (or to be sent out)
436:   nrqr - no of requests received (which have to be or which have been processed)
437: */
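/*
   Worked example of the format above (added for exposition): suppose A asks B to
   process 3 rows for is[1] and 2 rows for is[5]. The outgoing buffer is then

     mesg = { 2,           (two index sets in this message)
              1, 3,        (is[1], 3 rows)
              5, 2,        (is[5], 2 rows)
              r0, r1, r2,  (the 3 rows for is[1])
              s0, s1 }     (the 2 rows for is[5])

   where r* and s* are global row numbers, and the header occupies the first
   1 + 2*(no of index sets) entries.
*/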
438: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
439: {
440:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
441:   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
442:   const PetscInt **idx, *idx_i;
443:   PetscInt        *n, **data, len;
444: #if defined(PETSC_USE_CTABLE)
445:   PetscTable *table_data, table_data_i;
446:   PetscInt   *tdata, tcount, tcount_max;
447: #else
448:   PetscInt *data_i, *d_p;
449: #endif
450:   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
451:   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
452:   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
453:   PetscBT     *table;
454:   MPI_Comm     comm;
455:   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
456:   MPI_Status  *recv_status;
457:   MPI_Comm    *iscomms;
458:   char        *t_p;

460:   PetscObjectGetComm((PetscObject)C, &comm);
461:   size = c->size;
462:   rank = c->rank;
463:   M    = C->rmap->N;

465:   PetscObjectGetNewTag((PetscObject)C, &tag1);
466:   PetscObjectGetNewTag((PetscObject)C, &tag2);

468:   PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n);

470:   for (i = 0; i < imax; i++) {
471:     ISGetIndices(is[i], &idx[i]);
472:     ISGetLocalSize(is[i], &n[i]);
473:   }

475:   /* evaluate communication - mesg to who, length of mesg, and buffer space
476:      required. Based on this, buffers are allocated, and data copied into them */
477:   PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4);
478:   for (i = 0; i < imax; i++) {
479:     PetscArrayzero(w4, size); /* initialise work vector */
480:     idx_i = idx[i];
481:     len   = n[i];
482:     for (j = 0; j < len; j++) {
483:       row = idx_i[j];
485:       PetscLayoutFindOwner(C->rmap, row, &proc);
486:       w4[proc]++;
487:     }
488:     for (j = 0; j < size; j++) {
489:       if (w4[j]) {
490:         w1[j] += w4[j];
491:         w3[j]++;
492:       }
493:     }
494:   }

496:   nrqs     = 0; /* no of outgoing messages */
497:   msz      = 0; /* total mesg length (for all procs) */
498:   w1[rank] = 0; /* no mesg sent to itself */
499:   w3[rank] = 0;
500:   for (i = 0; i < size; i++) {
501:     if (w1[i]) {
502:       w2[i] = 1;
503:       nrqs++;
504:     } /* there exists a message to proc i */
505:   }
506:   /* pa - is list of processors to communicate with */
507:   PetscMalloc1(nrqs, &pa);
508:   for (i = 0, j = 0; i < size; i++) {
509:     if (w1[i]) {
510:       pa[j] = i;
511:       j++;
512:     }
513:   }

515:   /* Each message would have a header = 1 + 2*(no of IS) + data */
516:   for (i = 0; i < nrqs; i++) {
517:     j = pa[i];
518:     w1[j] += w2[j] + 2 * w3[j];
519:     msz += w1[j];
520:   }

522:   /* Determine the number of messages to expect, their lengths, and their from-ids */
523:   PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
524:   PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);

526:   /* Now post the Irecvs corresponding to these messages */
527:   PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1);

529:   /* Allocate Memory for outgoing messages */
530:   PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr);
531:   PetscArrayzero(outdat, size);
532:   PetscArrayzero(ptr, size);

534:   {
535:     PetscInt *iptr = tmp, ict = 0;
536:     for (i = 0; i < nrqs; i++) {
537:       j = pa[i];
538:       iptr += ict;
539:       outdat[j] = iptr;
540:       ict       = w1[j];
541:     }
542:   }

544:   /* Form the outgoing messages */
545:   /* plug in the headers */
546:   for (i = 0; i < nrqs; i++) {
547:     j            = pa[i];
548:     outdat[j][0] = 0;
549:     PetscArrayzero(outdat[j] + 1, 2 * w3[j]);
550:     ptr[j] = outdat[j] + 2 * w3[j] + 1;
551:   }

553:   /* Memory for doing local proc's work */
554:   {
555:     PetscInt M_BPB_imax = 0;
556: #if defined(PETSC_USE_CTABLE)
557:     PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
558:     PetscMalloc1(imax, &table_data);
559:     for (i = 0; i < imax; i++) PetscTableCreate(n[i], M, &table_data[i]);
560:     PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p);
561:     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
562: #else
563:     PetscInt Mimax = 0;
564:     PetscIntMultError(M, imax, &Mimax);
565:     PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
566:     PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p);
567:     for (i = 0; i < imax; i++) {
568:       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
569:       data[i]  = d_p + M * i;
570:     }
571: #endif
572:   }

574:   /* Parse the IS and update local tables and the outgoing buf with the data */
575:   {
576:     PetscInt n_i, isz_i, *outdat_j, ctr_j;
577:     PetscBT  table_i;

579:     for (i = 0; i < imax; i++) {
580:       PetscArrayzero(ctr, size);
581:       n_i     = n[i];
582:       table_i = table[i];
583:       idx_i   = idx[i];
584: #if defined(PETSC_USE_CTABLE)
585:       table_data_i = table_data[i];
586: #else
587:       data_i = data[i];
588: #endif
589:       isz_i = isz[i];
590:       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
591:         row = idx_i[j];
592:         PetscLayoutFindOwner(C->rmap, row, &proc);
593:         if (proc != rank) { /* copy to the outgoing buffer */
594:           ctr[proc]++;
595:           *ptr[proc] = row;
596:           ptr[proc]++;
597:         } else if (!PetscBTLookupSet(table_i, row)) {
598: #if defined(PETSC_USE_CTABLE)
599:           PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
600: #else
601:           data_i[isz_i] = row; /* Update the local table */
602: #endif
603:           isz_i++;
604:         }
605:       }
606:       /* Update the headers for the current IS */
607:       for (j = 0; j < size; j++) { /* Could optimize this loop by using pa[] */
608:         if ((ctr_j = ctr[j])) {
609:           outdat_j            = outdat[j];
610:           k                   = ++outdat_j[0];
611:           outdat_j[2 * k]     = ctr_j;
612:           outdat_j[2 * k - 1] = i;
613:         }
614:       }
615:       isz[i] = isz_i;
616:     }
617:   }

619:   /*  Now  post the sends */
620:   PetscMalloc1(nrqs, &s_waits1);
621:   for (i = 0; i < nrqs; ++i) {
622:     j = pa[i];
623:     MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i);
624:   }

626:   /* No longer need the original indices */
627:   PetscMalloc1(imax, &iscomms);
628:   for (i = 0; i < imax; ++i) {
629:     ISRestoreIndices(is[i], idx + i);
630:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
631:   }
632:   PetscFree2(*(PetscInt ***)&idx, n);

634:   for (i = 0; i < imax; ++i) ISDestroy(&is[i]);

636:     /* Do Local work */
637: #if defined(PETSC_USE_CTABLE)
638:   MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data);
639: #else
640:   MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL);
641: #endif

643:   /* Receive messages */
644:   PetscMalloc1(nrqr, &recv_status);
645:   MPI_Waitall(nrqr, r_waits1, recv_status);
646:   MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);

648:   /* Phase 1 sends are complete - deallocate buffers */
649:   PetscFree4(outdat, ptr, tmp, ctr);
650:   PetscFree4(w1, w2, w3, w4);

652:   PetscMalloc1(nrqr, &xdata);
653:   PetscMalloc1(nrqr, &isz1);
654:   MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1);
655:   PetscFree(rbuf[0]);
656:   PetscFree(rbuf);

658:   /* Send the data back */
659:   /* Do a global reduction to know the buffer space req for incoming messages */
660:   {
661:     PetscMPIInt *rw1;

663:     PetscCalloc1(size, &rw1);

665:     for (i = 0; i < nrqr; ++i) {
666:       proc = recv_status[i].MPI_SOURCE;

669:       rw1[proc] = isz1[i];
670:     }
671:     PetscFree(onodes1);
672:     PetscFree(olengths1);

674:     /* Determine the number of messages to expect, their lengths, and their from-ids */
675:     PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2);
676:     PetscFree(rw1);
677:   }
678:   /* Now post the Irecvs corresponding to these messages */
679:   PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2);

681:   /* Now  post the sends */
682:   PetscMalloc1(nrqr, &s_waits2);
683:   for (i = 0; i < nrqr; ++i) {
684:     j = recv_status[i].MPI_SOURCE;
685:     MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i);
686:   }

688:   /* receive work done on other processors */
689:   {
690:     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
691:     PetscMPIInt idex;
692:     PetscBT     table_i;

694:     for (i = 0; i < nrqs; ++i) {
695:       MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE);
696:       /* Process the message */
697:       rbuf2_i = rbuf2[idex];
698:       ct1     = 2 * rbuf2_i[0] + 1;
699:       jmax    = rbuf2[idex][0];
700:       for (j = 1; j <= jmax; j++) {
701:         max     = rbuf2_i[2 * j];
702:         is_no   = rbuf2_i[2 * j - 1];
703:         isz_i   = isz[is_no];
704:         table_i = table[is_no];
705: #if defined(PETSC_USE_CTABLE)
706:         table_data_i = table_data[is_no];
707: #else
708:         data_i = data[is_no];
709: #endif
710:         for (k = 0; k < max; k++, ct1++) {
711:           row = rbuf2_i[ct1];
712:           if (!PetscBTLookupSet(table_i, row)) {
713: #if defined(PETSC_USE_CTABLE)
714:             PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
715: #else
716:             data_i[isz_i] = row;
717: #endif
718:             isz_i++;
719:           }
720:         }
721:         isz[is_no] = isz_i;
722:       }
723:     }

725:     MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
726:   }

728: #if defined(PETSC_USE_CTABLE)
729:   tcount_max = 0;
730:   for (i = 0; i < imax; ++i) {
731:     table_data_i = table_data[i];
732:     PetscTableGetCount(table_data_i, &tcount);
733:     if (tcount_max < tcount) tcount_max = tcount;
734:   }
735:   PetscMalloc1(tcount_max + 1, &tdata);
736: #endif

738:   for (i = 0; i < imax; ++i) {
739: #if defined(PETSC_USE_CTABLE)
740:     PetscTablePosition tpos;
741:     table_data_i = table_data[i];

743:     PetscTableGetHeadPosition(table_data_i, &tpos);
744:     while (tpos) {
745:       PetscTableGetNext(table_data_i, &tpos, &k, &j);
746:       tdata[--j] = --k;
747:     }
748:     ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i);
749: #else
750:     ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i);
751: #endif
752:     PetscCommDestroy(&iscomms[i]);
753:   }

755:   PetscFree(iscomms);
756:   PetscFree(onodes2);
757:   PetscFree(olengths2);

759:   PetscFree(pa);
760:   PetscFree(rbuf2[0]);
761:   PetscFree(rbuf2);
762:   PetscFree(s_waits1);
763:   PetscFree(r_waits1);
764:   PetscFree(s_waits2);
765:   PetscFree(r_waits2);
766:   PetscFree(recv_status);
767:   if (xdata) {
768:     PetscFree(xdata[0]);
769:     PetscFree(xdata);
770:   }
771:   PetscFree(isz1);
772: #if defined(PETSC_USE_CTABLE)
773:   for (i = 0; i < imax; i++) PetscTableDestroy((PetscTable *)&table_data[i]);
774:   PetscFree(table_data);
775:   PetscFree(tdata);
776:   PetscFree4(table, data, isz, t_p);
777: #else
778:   PetscFree5(table, data, isz, d_p, t_p);
779: #endif
780:   return 0;
781: }

783: /*
784:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
785:        the work on the local processor.

787:      Inputs:
788:       C      - MAT_MPIAIJ;
789:       imax - total no of index sets processed at a time;
790:       table  - an array of char - size = m bits.

792:      Output:
793:       isz    - array containing the count of the solution elements corresponding
794:                to each index set;
795:       data or table_data  - pointer to the solutions
796: */
797: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
798: {
799:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
800:   Mat         A = c->A, B = c->B;
801:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
802:   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
803:   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
804:   PetscBT     table_i;
805: #if defined(PETSC_USE_CTABLE)
806:   PetscTable         table_data_i;
807:   PetscTablePosition tpos;
808:   PetscInt           tcount, *tdata;
809: #else
810:   PetscInt *data_i;
811: #endif

813:   rstart = C->rmap->rstart;
814:   cstart = C->cmap->rstart;
815:   ai     = a->i;
816:   aj     = a->j;
817:   bi     = b->i;
818:   bj     = b->j;
819:   garray = c->garray;

821:   for (i = 0; i < imax; i++) {
822: #if defined(PETSC_USE_CTABLE)
823:     /* copy existing entries of table_data_i into tdata[] */
824:     table_data_i = table_data[i];
825:     PetscTableGetCount(table_data_i, &tcount);

828:     PetscMalloc1(tcount, &tdata);
829:     PetscTableGetHeadPosition(table_data_i, &tpos);
830:     while (tpos) {
831:       PetscTableGetNext(table_data_i, &tpos, &row, &j);
832:       tdata[--j] = --row;
834:     }
835: #else
836:     data_i = data[i];
837: #endif
838:     table_i = table[i];
839:     isz_i   = isz[i];
840:     max     = isz[i];

842:     for (j = 0; j < max; j++) {
843: #if defined(PETSC_USE_CTABLE)
844:       row = tdata[j] - rstart;
845: #else
846:       row = data_i[j] - rstart;
847: #endif
848:       start = ai[row];
849:       end   = ai[row + 1];
850:       for (k = start; k < end; k++) { /* Amat */
851:         val = aj[k] + cstart;
852:         if (!PetscBTLookupSet(table_i, val)) {
853: #if defined(PETSC_USE_CTABLE)
854:           PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
855: #else
856:           data_i[isz_i] = val;
857: #endif
858:           isz_i++;
859:         }
860:       }
861:       start = bi[row];
862:       end   = bi[row + 1];
863:       for (k = start; k < end; k++) { /* Bmat */
864:         val = garray[bj[k]];
865:         if (!PetscBTLookupSet(table_i, val)) {
866: #if defined(PETSC_USE_CTABLE)
867:           PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
868: #else
869:           data_i[isz_i] = val;
870: #endif
871:           isz_i++;
872:         }
873:       }
874:     }
875:     isz[i] = isz_i;

877: #if defined(PETSC_USE_CTABLE)
878:     PetscFree(tdata);
879: #endif
880:   }
881:   return 0;
882: }
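
/*
   Minimal sketch (added for exposition, not part of mpiov.c) of the PetscBT
   bookkeeping used above: a bit table answers "was this global column already
   recorded?" so each column enters the overlap list at most once. The variables
   below are hypothetical; PetscBTCreate/PetscBTLookupSet/PetscBTDestroy are the
   calls used in this file.

     PetscBT  seen;
     PetscInt list[16], nlist = 0, cols[] = {7, 3, 7, 12, 3}, k;

     PetscBTCreate(M, &seen);                // M = global number of columns
     for (k = 0; k < 5; k++) {
       if (!PetscBTLookupSet(seen, cols[k])) // returns the previous bit, then sets it
         list[nlist++] = cols[k];            // records 7, 3, 12 exactly once each
     }
     PetscBTDestroy(&seen);
*/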

884: /*
885:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
886:          and return the output

888:          Input:
889:            C    - the matrix
890:            nrqr - no of messages being processed.
891:            rbuf - an array of pointers to the received requests

893:          Output:
894:            xdata - array of messages to be sent back
895:            isz1  - size of each message

897:   For better efficiency perhaps we should malloc each xdata[i] separately;
898: then if a realloc is required we need only copy the data for that one row
899: rather than all previous rows, as happens now where a single large chunk of
900: memory is used.

902: */
903: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
904: {
905:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
906:   Mat         A = c->A, B = c->B;
907:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
908:   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
909:   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
910:   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
911:   PetscInt   *rbuf_i, kmax, rbuf_0;
912:   PetscBT     xtable;

914:   m      = C->rmap->N;
915:   rstart = C->rmap->rstart;
916:   cstart = C->cmap->rstart;
917:   ai     = a->i;
918:   aj     = a->j;
919:   bi     = b->i;
920:   bj     = b->j;
921:   garray = c->garray;

923:   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
924:     rbuf_i = rbuf[i];
925:     rbuf_0 = rbuf_i[0];
926:     ct += rbuf_0;
927:     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
928:   }

930:   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
931:   else max1 = 1;
932:   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
933:   if (nrqr) {
934:     PetscMalloc1(mem_estimate, &xdata[0]);
935:     ++no_malloc;
936:   }
937:   PetscBTCreate(m, &xtable);
938:   PetscArrayzero(isz1, nrqr);

940:   ct3 = 0;
941:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
942:     rbuf_i = rbuf[i];
943:     rbuf_0 = rbuf_i[0];
944:     ct1    = 2 * rbuf_0 + 1;
945:     ct2    = ct1;
946:     ct3 += ct1;
947:     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
948:       PetscBTMemzero(m, xtable);
949:       oct2 = ct2;
950:       kmax = rbuf_i[2 * j];
951:       for (k = 0; k < kmax; k++, ct1++) {
952:         row = rbuf_i[ct1];
953:         if (!PetscBTLookupSet(xtable, row)) {
954:           if (!(ct3 < mem_estimate)) {
955:             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
956:             PetscMalloc1(new_estimate, &tmp);
957:             PetscArraycpy(tmp, xdata[0], mem_estimate);
958:             PetscFree(xdata[0]);
959:             xdata[0]     = tmp;
960:             mem_estimate = new_estimate;
961:             ++no_malloc;
962:             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
963:           }
964:           xdata[i][ct2++] = row;
965:           ct3++;
966:         }
967:       }
968:       for (k = oct2, max2 = ct2; k < max2; k++) {
969:         row   = xdata[i][k] - rstart;
970:         start = ai[row];
971:         end   = ai[row + 1];
972:         for (l = start; l < end; l++) {
973:           val = aj[l] + cstart;
974:           if (!PetscBTLookupSet(xtable, val)) {
975:             if (!(ct3 < mem_estimate)) {
976:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
977:               PetscMalloc1(new_estimate, &tmp);
978:               PetscArraycpy(tmp, xdata[0], mem_estimate);
979:               PetscFree(xdata[0]);
980:               xdata[0]     = tmp;
981:               mem_estimate = new_estimate;
982:               ++no_malloc;
983:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
984:             }
985:             xdata[i][ct2++] = val;
986:             ct3++;
987:           }
988:         }
989:         start = bi[row];
990:         end   = bi[row + 1];
991:         for (l = start; l < end; l++) {
992:           val = garray[bj[l]];
993:           if (!PetscBTLookupSet(xtable, val)) {
994:             if (!(ct3 < mem_estimate)) {
995:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
996:               PetscMalloc1(new_estimate, &tmp);
997:               PetscArraycpy(tmp, xdata[0], mem_estimate);
998:               PetscFree(xdata[0]);
999:               xdata[0]     = tmp;
1000:               mem_estimate = new_estimate;
1001:               ++no_malloc;
1002:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1003:             }
1004:             xdata[i][ct2++] = val;
1005:             ct3++;
1006:           }
1007:         }
1008:       }
1009:       /* Update the header*/
1010:       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1011:       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1012:     }
1013:     xdata[i][0] = rbuf_0;
1014:     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1015:     isz1[i] = ct2; /* size of each message */
1016:   }
1017:   PetscBTDestroy(&xtable);
1018:   PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc);
1019:   return 0;
1020: }
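
/*
   Minimal sketch (added for exposition, not part of mpiov.c) of the buffer-growth
   pattern used above: when the running estimate is exhausted, the single large
   chunk behind xdata[0] is regrown by a factor of 1.5 and the per-message pointers
   xdata[ctr] are re-derived from the accumulated sizes isz1[]. The names used,
   capacity, and bigger are hypothetical.

     if (used >= capacity) {
       PetscInt  newcap = (PetscInt)(1.5 * capacity) + 1;
       PetscInt *bigger;

       PetscMalloc1(newcap, &bigger);
       PetscArraycpy(bigger, xdata[0], capacity); // copy everything written so far
       PetscFree(xdata[0]);
       xdata[0] = bigger;
       capacity = newcap;
       for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
     }
*/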
1021: /* -------------------------------------------------------------------------*/
1022: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1023: /*
1024:     Every processor gets the entire matrix
1025: */
1026: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1027: {
1028:   Mat         B;
1029:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1030:   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1031:   PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1032:   PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1033:   PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

1035:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1036:   MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank);
1037:   if (scall == MAT_INITIAL_MATRIX) {
1038:     /* ----------------------------------------------------------------
1039:          Tell every processor the number of nonzeros per row
1040:     */
1041:     PetscMalloc1(A->rmap->N, &lens);
1042:     for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1043:     PetscMalloc2(size, &recvcounts, size, &displs);
1044:     for (i = 0; i < size; i++) {
1045:       recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1046:       displs[i]     = A->rmap->range[i];
1047:     }
1048:     MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1049:     /* ---------------------------------------------------------------
1050:          Create the sequential matrix of the same type as the local block diagonal
1051:     */
1052:     MatCreate(PETSC_COMM_SELF, &B);
1053:     MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE);
1054:     MatSetBlockSizesFromMats(B, A, A);
1055:     MatSetType(B, ((PetscObject)a->A)->type_name);
1056:     MatSeqAIJSetPreallocation(B, 0, lens);
1057:     PetscCalloc1(2, Bin);
1058:     **Bin = B;
1059:     b     = (Mat_SeqAIJ *)B->data;

1061:     /*--------------------------------------------------------------------
1062:        Copy my part of matrix column indices over
1063:     */
1064:     sendcount  = ad->nz + bd->nz;
1065:     jsendbuf   = b->j + b->i[rstarts[rank]];
1066:     a_jsendbuf = ad->j;
1067:     b_jsendbuf = bd->j;
1068:     n          = A->rmap->rend - A->rmap->rstart;
1069:     cnt        = 0;
1070:     for (i = 0; i < n; i++) {
1071:       /* put in lower diagonal portion */
1072:       m = bd->i[i + 1] - bd->i[i];
1073:       while (m > 0) {
1074:         /* is it above diagonal (in bd (compressed) numbering) */
1075:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1076:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1077:         m--;
1078:       }

1080:       /* put in diagonal portion */
1081:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

1083:       /* put in upper diagonal portion */
1084:       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1085:     }

1088:     /*--------------------------------------------------------------------
1089:        Gather all column indices to all processors
1090:     */
1091:     for (i = 0; i < size; i++) {
1092:       recvcounts[i] = 0;
1093:       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1094:     }
1095:     displs[0] = 0;
1096:     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1097:     MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1098:     /*--------------------------------------------------------------------
1099:         Assemble the matrix into usable form (note numerical values not yet set)
1100:     */
1101:     /* set the b->ilen (length of each row) values */
1102:     PetscArraycpy(b->ilen, lens, A->rmap->N);
1103:     /* set the b->i indices */
1104:     b->i[0] = 0;
1105:     for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1106:     PetscFree(lens);
1107:     MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
1108:     MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);

1110:   } else {
1111:     B = **Bin;
1112:     b = (Mat_SeqAIJ *)B->data;
1113:   }

1115:   /*--------------------------------------------------------------------
1116:        Copy my part of matrix numerical values into the values location
1117:   */
1118:   if (flag == MAT_GET_VALUES) {
1119:     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1120:     MatScalar         *sendbuf, *recvbuf;

1122:     MatSeqAIJGetArrayRead(a->A, &ada);
1123:     MatSeqAIJGetArrayRead(a->B, &bda);
1124:     sendcount = ad->nz + bd->nz;
1125:     sendbuf   = b->a + b->i[rstarts[rank]];
1126:     a_sendbuf = ada;
1127:     b_sendbuf = bda;
1128:     b_sendj   = bd->j;
1129:     n         = A->rmap->rend - A->rmap->rstart;
1130:     cnt       = 0;
1131:     for (i = 0; i < n; i++) {
1132:       /* put in lower diagonal portion */
1133:       m = bd->i[i + 1] - bd->i[i];
1134:       while (m > 0) {
1135:         /* is it above diagonal (in bd (compressed) numbering) */
1136:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1137:         sendbuf[cnt++] = *b_sendbuf++;
1138:         m--;
1139:         b_sendj++;
1140:       }

1142:       /* put in diagonal portion */
1143:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

1145:       /* put in upper diagonal portion */
1146:       while (m-- > 0) {
1147:         sendbuf[cnt++] = *b_sendbuf++;
1148:         b_sendj++;
1149:       }
1150:     }

1153:     /* -----------------------------------------------------------------
1154:        Gather all numerical values to all processors
1155:     */
1156:     if (!recvcounts) PetscMalloc2(size, &recvcounts, size, &displs);
1157:     for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1158:     displs[0] = 0;
1159:     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1160:     recvbuf = b->a;
1161:     MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A));
1162:     MatSeqAIJRestoreArrayRead(a->A, &ada);
1163:     MatSeqAIJRestoreArrayRead(a->B, &bda);
1164:   } /* end of (flag == MAT_GET_VALUES) */
1165:   PetscFree2(recvcounts, displs);

1167:   MatPropagateSymmetryOptions(A, B);
1168:   return 0;
1169: }
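
/*
   Minimal sketch (added for exposition, not part of mpiov.c) of the in-place
   MPI_Allgatherv pattern used in the routine above: each rank first writes its
   own contribution into its slice of the shared receive buffer, then MPI_IN_PLACE
   gathers the remaining slices from the other ranks. Buffer names are hypothetical.

     // recvbuf already holds this rank's slice at offset displs[rank]
     MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
*/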

1171: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1172: {
1173:   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1174:   Mat             submat, A = c->A, B = c->B;
1175:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1176:   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1177:   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1178:   const PetscInt *icol, *irow;
1179:   PetscInt        nrow, ncol, start;
1180:   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1181:   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1182:   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1183:   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1184:   PetscInt       *lens, rmax, ncols, *cols, Crow;
1185: #if defined(PETSC_USE_CTABLE)
1186:   PetscTable cmap, rmap;
1187:   PetscInt  *cmap_loc, *rmap_loc;
1188: #else
1189:   PetscInt *cmap, *rmap;
1190: #endif
1191:   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1192:   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1193:   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1194:   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1195:   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1196:   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1197:   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1198:   MPI_Comm      comm;
1199:   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1200:   PetscMPIInt  *onodes1, *olengths1, idex, end;
1201:   Mat_SubSppt  *smatis1;
1202:   PetscBool     isrowsorted, iscolsorted;

1207:   MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
1208:   MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a);
1209:   PetscObjectGetComm((PetscObject)C, &comm);
1210:   size = c->size;
1211:   rank = c->rank;

1213:   ISSorted(iscol[0], &iscolsorted);
1214:   ISSorted(isrow[0], &isrowsorted);
1215:   ISGetIndices(isrow[0], &irow);
1216:   ISGetLocalSize(isrow[0], &nrow);
1217:   if (allcolumns) {
1218:     icol = NULL;
1219:     ncol = C->cmap->N;
1220:   } else {
1221:     ISGetIndices(iscol[0], &icol);
1222:     ISGetLocalSize(iscol[0], &ncol);
1223:   }

1225:   if (scall == MAT_INITIAL_MATRIX) {
1226:     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;

1228:     /* Get some new tags to keep the communication clean */
1229:     tag1 = ((PetscObject)C)->tag;
1230:     PetscObjectGetNewTag((PetscObject)C, &tag2);
1231:     PetscObjectGetNewTag((PetscObject)C, &tag3);

1233:     /* evaluate communication - mesg to who, length of mesg, and buffer space
1234:      required. Based on this, buffers are allocated, and data copied into them */
1235:     PetscCalloc2(size, &w1, size, &w2);
1236:     PetscMalloc1(nrow, &row2proc);

1238:     /* w1[proc] = num of rows owned by proc -- to be requested */
1239:     proc = 0;
1240:     nrqs = 0; /* num of outgoing messages */
1241:     for (j = 0; j < nrow; j++) {
1242:       row = irow[j];
1243:       if (!isrowsorted) proc = 0;
1244:       while (row >= C->rmap->range[proc + 1]) proc++;
1245:       w1[proc]++;
1246:       row2proc[j] = proc; /* map row index to proc */

1248:       if (proc != rank && !w2[proc]) {
1249:         w2[proc] = 1;
1250:         nrqs++;
1251:       }
1252:     }
1253:     w1[rank] = 0; /* rows owned by self will not be requested */

1255:     PetscMalloc1(nrqs, &pa); /* proc-array */
1256:     for (proc = 0, j = 0; proc < size; proc++) {
1257:       if (w1[proc]) pa[j++] = proc;
1258:     }

1260:     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1261:     msz = 0; /* total mesg length (for all procs) */
1262:     for (i = 0; i < nrqs; i++) {
1263:       proc = pa[i];
1264:       w1[proc] += 3;
1265:       msz += w1[proc];
1266:     }
1267:     PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);

1269:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1270:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1271:     PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);

1273:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1274:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1275:     PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);

1277:     /* Now post the Irecvs corresponding to these messages */
1278:     PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);

1280:     PetscFree(onodes1);
1281:     PetscFree(olengths1);

1283:     /* Allocate Memory for outgoing messages */
1284:     PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
1285:     PetscArrayzero(sbuf1, size);
1286:     PetscArrayzero(ptr, size);

1288:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1289:     iptr = tmp;
1290:     for (i = 0; i < nrqs; i++) {
1291:       proc        = pa[i];
1292:       sbuf1[proc] = iptr;
1293:       iptr += w1[proc];
1294:     }

1296:     /* Form the outgoing messages */
1297:     /* Initialize the header space */
1298:     for (i = 0; i < nrqs; i++) {
1299:       proc = pa[i];
1300:       PetscArrayzero(sbuf1[proc], 3);
1301:       ptr[proc] = sbuf1[proc] + 3;
1302:     }

1304:     /* Parse the isrow and copy data into outbuf */
1305:     PetscArrayzero(ctr, size);
1306:     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1307:       proc = row2proc[j];
1308:       if (proc != rank) { /* copy to the outgoing buf*/
1309:         *ptr[proc] = irow[j];
1310:         ctr[proc]++;
1311:         ptr[proc]++;
1312:       }
1313:     }

1315:     /* Update the headers for the current IS */
1316:     for (j = 0; j < size; j++) { /* Could optimize this loop too */
1317:       if ((ctr_j = ctr[j])) {
1318:         sbuf1_j            = sbuf1[j];
1319:         k                  = ++sbuf1_j[0];
1320:         sbuf1_j[2 * k]     = ctr_j;
1321:         sbuf1_j[2 * k - 1] = 0;
1322:       }
1323:     }

1325:     /* Now post the sends */
1326:     PetscMalloc1(nrqs, &s_waits1);
1327:     for (i = 0; i < nrqs; ++i) {
1328:       proc = pa[i];
1329:       MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i);
1330:     }

1332:     /* Post Receives to capture the buffer size */
1333:     PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2);
1334:     PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);

1336:     if (nrqs) rbuf2[0] = tmp + msz;
1337:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];

1339:     for (i = 0; i < nrqs; ++i) {
1340:       proc = pa[i];
1341:       MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i);
1342:     }

1344:     PetscFree2(w1, w2);

1346:     /* Send to other procs the buf size they should allocate */
1347:     /* Receive messages*/
1348:     PetscMalloc1(nrqr, &r_status1);
1349:     PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);

1351:     MPI_Waitall(nrqr, r_waits1, r_status1);
1352:     for (i = 0; i < nrqr; ++i) {
1353:       req_size[i] = 0;
1354:       rbuf1_i     = rbuf1[i];
1355:       start       = 2 * rbuf1_i[0] + 1;
1356:       MPI_Get_count(r_status1 + i, MPIU_INT, &end);
1357:       PetscMalloc1(end, &sbuf2[i]);
1358:       sbuf2_i = sbuf2[i];
1359:       for (j = start; j < end; j++) {
1360:         k          = rbuf1_i[j] - rstart;
1361:         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1362:         sbuf2_i[j] = ncols;
1363:         req_size[i] += ncols;
1364:       }
1365:       req_source1[i] = r_status1[i].MPI_SOURCE;

1367:       /* form the header */
1368:       sbuf2_i[0] = req_size[i];
1369:       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

1371:       MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
1372:     }

1374:     PetscFree(r_status1);
1375:     PetscFree(r_waits1);

1377:     /* rbuf2 is received, Post recv column indices a->j */
1378:     MPI_Waitall(nrqs, r_waits2, r_status2);

1380:     PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3);
1381:     for (i = 0; i < nrqs; ++i) {
1382:       PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
1383:       req_source2[i] = r_status2[i].MPI_SOURCE;
1384:       MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
1385:     }

1387:     /* Wait on sends1 and sends2 */
1388:     PetscMalloc1(nrqs, &s_status1);
1389:     MPI_Waitall(nrqs, s_waits1, s_status1);
1390:     PetscFree(s_waits1);
1391:     PetscFree(s_status1);

1393:     MPI_Waitall(nrqr, s_waits2, s_status2);
1394:     PetscFree4(r_status2, s_waits2, r_waits2, s_status2);

1396:     /* Now allocate sending buffers for a->j, and send them off */
1397:     PetscMalloc1(nrqr, &sbuf_aj);
1398:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1399:     if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
1400:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

1402:     for (i = 0; i < nrqr; i++) { /* for each requested message */
1403:       rbuf1_i   = rbuf1[i];
1404:       sbuf_aj_i = sbuf_aj[i];
1405:       ct1       = 2 * rbuf1_i[0] + 1;
1406:       ct2       = 0;

1409:       kmax = rbuf1[i][2];
1410:       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1411:         row    = rbuf1_i[ct1] - rstart;
1412:         nzA    = ai[row + 1] - ai[row];
1413:         nzB    = bi[row + 1] - bi[row];
1414:         ncols  = nzA + nzB;
1415:         cworkA = aj + ai[row];
1416:         cworkB = bj + bi[row];

1418:         /* load the column indices for this row into cols*/
1419:         cols = sbuf_aj_i + ct2;

1421:         lwrite = 0;
1422:         for (l = 0; l < nzB; l++) {
1423:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1424:         }
1425:         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1426:         for (l = 0; l < nzB; l++) {
1427:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1428:         }

1430:         ct2 += ncols;
1431:       }
1432:       MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
1433:     }

1435:     /* create column map (cmap): global col of C -> local col of submat */
1436: #if defined(PETSC_USE_CTABLE)
1437:     if (!allcolumns) {
1438:       PetscTableCreate(ncol, C->cmap->N, &cmap);
1439:       PetscCalloc1(C->cmap->n, &cmap_loc);
1440:       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1441:         if (icol[j] >= cstart && icol[j] < cend) {
1442:           cmap_loc[icol[j] - cstart] = j + 1;
1443:         } else { /* use PetscTable for non-local col indices */
1444:           PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES);
1445:         }
1446:       }
1447:     } else {
1448:       cmap     = NULL;
1449:       cmap_loc = NULL;
1450:     }
1451:     PetscCalloc1(C->rmap->n, &rmap_loc);
1452: #else
1453:     if (!allcolumns) {
1454:       PetscCalloc1(C->cmap->N, &cmap);
1455:       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1456:     } else {
1457:       cmap = NULL;
1458:     }
1459: #endif
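    /* Convention note: cmap_loc[] and the cmap table both store (submatrix column index + 1), so a
       lookup result of 0 below means "this global column of C was not selected by iscol[0]"; valid
       hits are decremented before they are used as submatrix column indices. */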

1461:     /* Create lens for MatSeqAIJSetPreallocation() */
1462:     PetscCalloc1(nrow, &lens);

1464:     /* Compute lens from local part of C */
1465:     for (j = 0; j < nrow; j++) {
1466:       row  = irow[j];
1467:       proc = row2proc[j];
1468:       if (proc == rank) {
1469:         /* diagonal part A = c->A */
1470:         ncols = ai[row - rstart + 1] - ai[row - rstart];
1471:         cols  = aj + ai[row - rstart];
1472:         if (!allcolumns) {
1473:           for (k = 0; k < ncols; k++) {
1474: #if defined(PETSC_USE_CTABLE)
1475:             tcol = cmap_loc[cols[k]];
1476: #else
1477:             tcol = cmap[cols[k] + cstart];
1478: #endif
1479:             if (tcol) lens[j]++;
1480:           }
1481:         } else { /* allcolumns */
1482:           lens[j] = ncols;
1483:         }

1485:         /* off-diagonal part B = c->B */
1486:         ncols = bi[row - rstart + 1] - bi[row - rstart];
1487:         cols  = bj + bi[row - rstart];
1488:         if (!allcolumns) {
1489:           for (k = 0; k < ncols; k++) {
1490: #if defined(PETSC_USE_CTABLE)
1491:             PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1492: #else
1493:             tcol = cmap[bmap[cols[k]]];
1494: #endif
1495:             if (tcol) lens[j]++;
1496:           }
1497:         } else { /* allcolumns */
1498:           lens[j] += ncols;
1499:         }
1500:       }
1501:     }

1503:     /* Create row map (rmap): global row of C -> local row of submat */
1504: #if defined(PETSC_USE_CTABLE)
1505:     PetscTableCreate(nrow, C->rmap->N, &rmap);
1506:     for (j = 0; j < nrow; j++) {
1507:       row  = irow[j];
1508:       proc = row2proc[j];
1509:       if (proc == rank) { /* a local row */
1510:         rmap_loc[row - rstart] = j;
1511:       } else {
1512:         PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES);
1513:       }
1514:     }
1515: #else
1516:     PetscCalloc1(C->rmap->N, &rmap);
1517:     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1518: #endif
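    /* Convention note: for rows owned by this rank, rmap_loc[] stores the 0-based submatrix row
       directly, while the rmap table (used for off-process rows) stores (submatrix row + 1), which is
       why results of PetscTableFind() on rmap are decremented below. */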

1520:     /* Update lens from offproc data */
1521:     /* recv a->j is done */
1522:     MPI_Waitall(nrqs, r_waits3, r_status3);
1523:     for (i = 0; i < nrqs; i++) {
1524:       proc    = pa[i];
1525:       sbuf1_i = sbuf1[proc];
1527:       ct1     = 2 + 1;
1528:       ct2     = 0;
1529:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1530:       rbuf3_i = rbuf3[i]; /* received C->j */

1533:       max1 = sbuf1_i[2];
1534:       for (k = 0; k < max1; k++, ct1++) {
1535: #if defined(PETSC_USE_CTABLE)
1536:         PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row);
1537:         row--;
1539: #else
1540:         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1541: #endif
1542:         /* Now, store row index of submat in sbuf1_i[ct1] */
1543:         sbuf1_i[ct1] = row;

1545:         nnz = rbuf2_i[ct1];
1546:         if (!allcolumns) {
1547:           for (l = 0; l < nnz; l++, ct2++) {
1548: #if defined(PETSC_USE_CTABLE)
1549:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1550:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1551:             } else {
1552:               PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1553:             }
1554: #else
1555:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1556: #endif
1557:             if (tcol) lens[row]++;
1558:           }
1559:         } else { /* allcolumns */
1560:           lens[row] += nnz;
1561:         }
1562:       }
1563:     }
1564:     MPI_Waitall(nrqr, s_waits3, s_status3);
1565:     PetscFree4(r_waits3, s_waits3, r_status3, s_status3);

1567:     /* Create the submatrices */
1568:     MatCreate(PETSC_COMM_SELF, &submat);
1569:     MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE);

1571:     ISGetBlockSize(isrow[0], &i);
1572:     ISGetBlockSize(iscol[0], &j);
1573:     MatSetBlockSizes(submat, i, j);
1574:     MatSetType(submat, ((PetscObject)A)->type_name);
1575:     MatSeqAIJSetPreallocation(submat, 0, lens);

1577:     /* create struct Mat_SubSppt and attach it to submat */
1578:     PetscNew(&smatis1);
1579:     subc            = (Mat_SeqAIJ *)submat->data;
1580:     subc->submatis1 = smatis1;

1582:     smatis1->id          = 0;
1583:     smatis1->nrqs        = nrqs;
1584:     smatis1->nrqr        = nrqr;
1585:     smatis1->rbuf1       = rbuf1;
1586:     smatis1->rbuf2       = rbuf2;
1587:     smatis1->rbuf3       = rbuf3;
1588:     smatis1->sbuf2       = sbuf2;
1589:     smatis1->req_source2 = req_source2;

1591:     smatis1->sbuf1 = sbuf1;
1592:     smatis1->ptr   = ptr;
1593:     smatis1->tmp   = tmp;
1594:     smatis1->ctr   = ctr;

1596:     smatis1->pa          = pa;
1597:     smatis1->req_size    = req_size;
1598:     smatis1->req_source1 = req_source1;

1600:     smatis1->allcolumns = allcolumns;
1601:     smatis1->singleis   = PETSC_TRUE;
1602:     smatis1->row2proc   = row2proc;
1603:     smatis1->rmap       = rmap;
1604:     smatis1->cmap       = cmap;
1605: #if defined(PETSC_USE_CTABLE)
1606:     smatis1->rmap_loc = rmap_loc;
1607:     smatis1->cmap_loc = cmap_loc;
1608: #endif

1610:     smatis1->destroy     = submat->ops->destroy;
1611:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1612:     submat->factortype   = C->factortype;

1614:     /* compute rmax */
1615:     rmax = 0;
1616:     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);

1618:   } else { /* scall == MAT_REUSE_MATRIX */
1619:     submat = submats[0];

1622:     subc        = (Mat_SeqAIJ *)submat->data;
1623:     rmax        = subc->rmax;
1624:     smatis1     = subc->submatis1;
1625:     nrqs        = smatis1->nrqs;
1626:     nrqr        = smatis1->nrqr;
1627:     rbuf1       = smatis1->rbuf1;
1628:     rbuf2       = smatis1->rbuf2;
1629:     rbuf3       = smatis1->rbuf3;
1630:     req_source2 = smatis1->req_source2;

1632:     sbuf1 = smatis1->sbuf1;
1633:     sbuf2 = smatis1->sbuf2;
1634:     ptr   = smatis1->ptr;
1635:     tmp   = smatis1->tmp;
1636:     ctr   = smatis1->ctr;

1638:     pa          = smatis1->pa;
1639:     req_size    = smatis1->req_size;
1640:     req_source1 = smatis1->req_source1;

1642:     allcolumns = smatis1->allcolumns;
1643:     row2proc   = smatis1->row2proc;
1644:     rmap       = smatis1->rmap;
1645:     cmap       = smatis1->cmap;
1646: #if defined(PETSC_USE_CTABLE)
1647:     rmap_loc = smatis1->rmap_loc;
1648:     cmap_loc = smatis1->cmap_loc;
1649: #endif
1650:   }

1652:   /* Post recv matrix values */
1653:   PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals);
1654:   PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4);
1655:   PetscObjectGetNewTag((PetscObject)C, &tag4);
1656:   for (i = 0; i < nrqs; ++i) {
1657:     PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
1658:     MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
1659:   }

1661:   /* Allocate sending buffers for a->a, and send them off */
1662:   PetscMalloc1(nrqr, &sbuf_aa);
1663:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1664:   if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
1665:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

1667:   for (i = 0; i < nrqr; i++) {
1668:     rbuf1_i   = rbuf1[i];
1669:     sbuf_aa_i = sbuf_aa[i];
1670:     ct1       = 2 * rbuf1_i[0] + 1;
1671:     ct2       = 0;

1674:     kmax = rbuf1_i[2];
1675:     for (k = 0; k < kmax; k++, ct1++) {
1676:       row    = rbuf1_i[ct1] - rstart;
1677:       nzA    = ai[row + 1] - ai[row];
1678:       nzB    = bi[row + 1] - bi[row];
1679:       ncols  = nzA + nzB;
1680:       cworkB = bj + bi[row];
1681:       vworkA = a_a + ai[row];
1682:       vworkB = b_a + bi[row];

1684:       /* load the column values for this row into vals */
1685:       vals = sbuf_aa_i + ct2;

1687:       lwrite = 0;
1688:       for (l = 0; l < nzB; l++) {
1689:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1690:       }
1691:       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1692:       for (l = 0; l < nzB; l++) {
1693:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1694:       }

1696:       ct2 += ncols;
1697:     }
1698:     MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
1699:   }

1701:   /* Assemble submat */
1702:   /* First assemble the local rows */
1703:   for (j = 0; j < nrow; j++) {
1704:     row  = irow[j];
1705:     proc = row2proc[j];
1706:     if (proc == rank) {
1707:       Crow = row - rstart; /* local row index of C */
1708: #if defined(PETSC_USE_CTABLE)
1709:       row = rmap_loc[Crow]; /* row index of submat */
1710: #else
1711:       row = rmap[row];
1712: #endif

1714:       if (allcolumns) {
1715:         /* diagonal part A = c->A */
1716:         ncols = ai[Crow + 1] - ai[Crow];
1717:         cols  = aj + ai[Crow];
1718:         vals  = a_a + ai[Crow];
1719:         i     = 0;
1720:         for (k = 0; k < ncols; k++) {
1721:           subcols[i]   = cols[k] + cstart;
1722:           subvals[i++] = vals[k];
1723:         }

1725:         /* off-diagonal part B = c->B */
1726:         ncols = bi[Crow + 1] - bi[Crow];
1727:         cols  = bj + bi[Crow];
1728:         vals  = b_a + bi[Crow];
1729:         for (k = 0; k < ncols; k++) {
1730:           subcols[i]   = bmap[cols[k]];
1731:           subvals[i++] = vals[k];
1732:         }

1734:         MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);

1736:       } else { /* !allcolumns */
1737: #if defined(PETSC_USE_CTABLE)
1738:         /* diagonal part A = c->A */
1739:         ncols = ai[Crow + 1] - ai[Crow];
1740:         cols  = aj + ai[Crow];
1741:         vals  = a_a + ai[Crow];
1742:         i     = 0;
1743:         for (k = 0; k < ncols; k++) {
1744:           tcol = cmap_loc[cols[k]];
1745:           if (tcol) {
1746:             subcols[i]   = --tcol;
1747:             subvals[i++] = vals[k];
1748:           }
1749:         }

1751:         /* off-diagonal part B = c->B */
1752:         ncols = bi[Crow + 1] - bi[Crow];
1753:         cols  = bj + bi[Crow];
1754:         vals  = b_a + bi[Crow];
1755:         for (k = 0; k < ncols; k++) {
1756:           PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1757:           if (tcol) {
1758:             subcols[i]   = --tcol;
1759:             subvals[i++] = vals[k];
1760:           }
1761:         }
1762: #else
1763:         /* diagonal part A = c->A */
1764:         ncols = ai[Crow + 1] - ai[Crow];
1765:         cols  = aj + ai[Crow];
1766:         vals  = a_a + ai[Crow];
1767:         i     = 0;
1768:         for (k = 0; k < ncols; k++) {
1769:           tcol = cmap[cols[k] + cstart];
1770:           if (tcol) {
1771:             subcols[i]   = --tcol;
1772:             subvals[i++] = vals[k];
1773:           }
1774:         }

1776:         /* off-diagonal part B = c->B */
1777:         ncols = bi[Crow + 1] - bi[Crow];
1778:         cols  = bj + bi[Crow];
1779:         vals  = b_a + bi[Crow];
1780:         for (k = 0; k < ncols; k++) {
1781:           tcol = cmap[bmap[cols[k]]];
1782:           if (tcol) {
1783:             subcols[i]   = --tcol;
1784:             subvals[i++] = vals[k];
1785:           }
1786:         }
1787: #endif
1788:         MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);
1789:       }
1790:     }
1791:   }

1793:   /* Now assemble the off-proc rows */
1794:   for (i = 0; i < nrqs; i++) { /* for each requested message */
1795:     /* recv values from other processes */
1796:     MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i);
1797:     proc    = pa[idex];
1798:     sbuf1_i = sbuf1[proc];
1800:     ct1     = 2 + 1;
1801:     ct2     = 0;           /* count of received C->j */
1802:     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1803:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from the sending process */
1804:     rbuf3_i = rbuf3[idex]; /* received C->j from the sending process */
1805:     rbuf4_i = rbuf4[idex]; /* received C->a from the sending process */

1808:     max1 = sbuf1_i[2];                  /* num of rows */
1809:     for (k = 0; k < max1; k++, ct1++) { /* for each received row */
1810:       row = sbuf1_i[ct1];               /* row index of submat */
1811:       if (!allcolumns) {
1812:         idex = 0;
1813:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1814:           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1815:           for (l = 0; l < nnz; l++, ct2++) { /* for each received column */
1816: #if defined(PETSC_USE_CTABLE)
1817:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1818:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1819:             } else {
1820:               PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1821:             }
1822: #else
1823:             tcol = cmap[rbuf3_i[ct2]];
1824: #endif
1825:             if (tcol) {
1826:               subcols[idex]   = --tcol; /* may not be sorted */
1827:               subvals[idex++] = rbuf4_i[ct2];

1829:               /* We receive an entire row of C, but only a subset of its entries needs to be inserted into submat.
1830:                  For reuse, we overwrite the received C->j in place with the offsets (into rbuf4) of the entries that get inserted into submat */
1831:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1832:             }
1833:           }
1834:           MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES);
1835:         } else { /* scall == MAT_REUSE_MATRIX */
1836:           submat = submats[0];
1837:           subc   = (Mat_SeqAIJ *)submat->data;

1839:           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1840:           for (l = 0; l < nnz; l++) {
1841:             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1842:             subvals[idex++] = rbuf4_i[ct2];
1843:           }

1845:           bj = subc->j + subc->i[row]; /* sorted column indices */
1846:           MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES);
1847:         }
1848:       } else {              /* allcolumns */
1849:         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1850:         MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES);
1851:         ct2 += nnz;
1852:       }
1853:     }
1854:   }

1856:   /* sends of a->a are done */
1857:   MPI_Waitall(nrqr, s_waits4, s_status4);
1858:   PetscFree4(r_waits4, s_waits4, r_status4, s_status4);

1860:   MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY);
1861:   MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY);
1862:   submats[0] = submat;

1864:   /* Restore the indices */
1865:   ISRestoreIndices(isrow[0], &irow);
1866:   if (!allcolumns) ISRestoreIndices(iscol[0], &icol);

1868:   /* Destroy allocated memory */
1869:   for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
1870:   PetscFree3(rbuf4, subcols, subvals);
1871:   if (sbuf_aa) {
1872:     PetscFree(sbuf_aa[0]);
1873:     PetscFree(sbuf_aa);
1874:   }

1876:   if (scall == MAT_INITIAL_MATRIX) {
1877:     PetscFree(lens);
1878:     if (sbuf_aj) {
1879:       PetscFree(sbuf_aj[0]);
1880:       PetscFree(sbuf_aj);
1881:     }
1882:   }
1883:   MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
1884:   MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a);
1885:   return 0;
1886: }

1888: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1889: {
1890:   PetscInt  ncol;
1891:   PetscBool colflag, allcolumns = PETSC_FALSE;

1893:   /* Allocate memory to hold all the submatrices */
1894:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(2, submat);

1896:   /* Check for special case: each processor gets entire matrix columns */
1897:   ISIdentity(iscol[0], &colflag);
1898:   ISGetLocalSize(iscol[0], &ncol);
1899:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1901:   MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat);
1902:   return 0;
1903: }
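/* Illustrative usage (a minimal sketch, not part of this implementation; nlocal, myrows and N are
   hypothetical caller-side variables): extract one sequential submatrix per rank, taking all columns
   so that the allcolumns special case detected above applies.

     IS  isrow, iscol;
     Mat *submats;

     ISCreateGeneral(PETSC_COMM_SELF, nlocal, myrows, PETSC_COPY_VALUES, &isrow);
     ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol);    (identity IS of size N, hence allcolumns)
     MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submats);
     ... work with the sequential matrix submats[0] ...
     MatDestroySubMatrices(1, &submats);
     ISDestroy(&isrow);
     ISDestroy(&iscol);
*/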

1905: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1906: {
1907:   PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1908:   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1909:   Mat_SeqAIJ  *subc;
1910:   Mat_SubSppt *smat;

1912:   /* Check for special case: each processor has a single IS */
1913:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1914:     MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
1915:     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1916:     return 0;
1917:   }

1919:   /* Determine nmax, the maximum number of submatrices to extract per stage */
1920:   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1921:   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1922:   if (!nmax) nmax = 1;

1924:   if (scall == MAT_INITIAL_MATRIX) {
1925:     /* Collect global wantallmatrix and nstages */
1926:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1927:       ISIdentity(*isrow, &rowflag);
1928:       ISIdentity(*iscol, &colflag);
1929:       ISGetLocalSize(*isrow, &nrow);
1930:       ISGetLocalSize(*iscol, &ncol);
1931:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1932:         wantallmatrix = PETSC_TRUE;

1934:         PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL);
1935:       }
1936:     }

1938:     /* Determine the number of stages in which the submatrices are extracted.
1939:        Each stage extracts at most nmax submatrices.
1940:        nmax is determined by the matrix column dimension:
1941:        if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1942:     */
1943:     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
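    /* Worked example (illustrative numbers, assuming 8-byte PetscInt): with C->cmap->N = 500,000,
       nmax = 20*1000000/(500000*8) = 5, so each stage extracts at most 5 submatrices; ismax = 12 then
       gives a local nstages of 12/5 + 1 = 3. */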

1945:     in[0] = -1 * (PetscInt)wantallmatrix;
1946:     in[1] = nstages;
1947:     MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
1948:     wantallmatrix = (PetscBool)(-out[0]);
1949:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */

1951:   } else { /* MAT_REUSE_MATRIX */
1952:     if (ismax) {
1953:       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
1954:       smat = subc->submatis1;
1955:     } else { /* (*submat)[0] is a dummy matrix */
1956:       smat = (Mat_SubSppt *)(*submat)[0]->data;
1957:     }
1958:     if (!smat) {
1959:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
1960:       wantallmatrix = PETSC_TRUE;
1961:     } else if (smat->singleis) {
1962:       MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
1963:       return 0;
1964:     } else {
1965:       nstages = smat->nstages;
1966:     }
1967:   }

1969:   if (wantallmatrix) {
1970:     MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat);
1971:     return 0;
1972:   }

1974:   /* Allocate memory to hold all the submatrices and dummy submatrices */
1975:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(ismax + nstages, submat);

1977:   for (i = 0, pos = 0; i < nstages; i++) {
1978:     if (pos + nmax <= ismax) max_no = nmax;
1979:     else if (pos >= ismax) max_no = 0;
1980:     else max_no = ismax - pos;

1982:     MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos);
1983:     if (!max_no) {
1984:       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
1985:         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
1986:         smat->nstages = nstages;
1987:       }
1988:       pos++; /* advance to next dummy matrix if any */
1989:     } else pos += max_no;
1990:   }

1992:   if (ismax && scall == MAT_INITIAL_MATRIX) {
1993:     /* save nstages for reuse */
1994:     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
1995:     smat          = subc->submatis1;
1996:     smat->nstages = nstages;
1997:   }
1998:   return 0;
1999: }
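/* Illustrative reuse pattern (a minimal sketch; nis, nits, isrow, iscol are hypothetical caller-side
   variables): the first call with MAT_INITIAL_MATRIX creates the submatrices and caches the
   communication pattern in struct Mat_SubSppt; later calls with MAT_REUSE_MATRIX refill the same
   matrices, assuming the index sets still select rows of unchanged length (see the MAT_REUSE_MATRIX
   branch below).

     Mat *submats;

     MatCreateSubMatrices(C, nis, isrow, iscol, MAT_INITIAL_MATRIX, &submats);
     for (PetscInt it = 0; it < nits; it++) {
       ... update the values of C ...
       MatCreateSubMatrices(C, nis, isrow, iscol, MAT_REUSE_MATRIX, &submats);
     }
     MatDestroySubMatrices(nis, &submats);
*/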

2001: /* -------------------------------------------------------------------------*/
2002: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2003: {
2004:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2005:   Mat              A = c->A;
2006:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2007:   const PetscInt **icol, **irow;
2008:   PetscInt        *nrow, *ncol, start;
2009:   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2010:   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2011:   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2012:   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2013:   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2014: #if defined(PETSC_USE_CTABLE)
2015:   PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
2016: #else
2017:   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2018: #endif
2019:   const PetscInt *irow_i;
2020:   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2021:   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2022:   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2023:   MPI_Comm        comm;
2024:   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2025:   PetscMPIInt    *onodes1, *olengths1, end;
2026:   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2027:   Mat_SubSppt    *smat_i;
2028:   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2029:   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
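  /* Communication outline (descriptive comment): four rounds of point-to-point messages are used.
     tag0 carries the requested global row indices (sbuf1 -> rbuf1), tag2 the per-row nonzero counts
     (sbuf2 -> rbuf2), tag3 the column indices of the requested rows (sbuf_aj -> rbuf3), and tag4 the
     corresponding values (sbuf_aa -> rbuf4).  The first three rounds and their buffers are cached in
     struct Mat_SubSppt, so a MAT_REUSE_MATRIX call only repeats the tag4 (values) exchange. */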

2031:   PetscObjectGetComm((PetscObject)C, &comm);
2032:   size = c->size;
2033:   rank = c->rank;

2035:   PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns);
2036:   PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted);

2038:   for (i = 0; i < ismax; i++) {
2039:     ISSorted(iscol[i], &issorted[i]);
2040:     if (!issorted[i]) iscsorted = issorted[i];

2042:     ISSorted(isrow[i], &issorted[i]);

2044:     ISGetIndices(isrow[i], &irow[i]);
2045:     ISGetLocalSize(isrow[i], &nrow[i]);

2047:     /* Check for special case: allcolumn */
2048:     ISIdentity(iscol[i], &colflag);
2049:     ISGetLocalSize(iscol[i], &ncol[i]);
2050:     if (colflag && ncol[i] == C->cmap->N) {
2051:       allcolumns[i] = PETSC_TRUE;
2052:       icol[i]       = NULL;
2053:     } else {
2054:       allcolumns[i] = PETSC_FALSE;
2055:       ISGetIndices(iscol[i], &icol[i]);
2056:     }
2057:   }

2059:   if (scall == MAT_REUSE_MATRIX) {
2060:     /* Assumes the new rows have the same lengths as the old rows */
2061:     for (i = 0; i < ismax; i++) {
2063:       subc = (Mat_SeqAIJ *)submats[i]->data;

2066:       /* Initialize the matrix as if empty */
2067:       PetscArrayzero(subc->ilen, submats[i]->rmap->n);

2069:       smat_i = subc->submatis1;

2071:       nrqs        = smat_i->nrqs;
2072:       nrqr        = smat_i->nrqr;
2073:       rbuf1       = smat_i->rbuf1;
2074:       rbuf2       = smat_i->rbuf2;
2075:       rbuf3       = smat_i->rbuf3;
2076:       req_source2 = smat_i->req_source2;

2078:       sbuf1 = smat_i->sbuf1;
2079:       sbuf2 = smat_i->sbuf2;
2080:       ptr   = smat_i->ptr;
2081:       tmp   = smat_i->tmp;
2082:       ctr   = smat_i->ctr;

2084:       pa          = smat_i->pa;
2085:       req_size    = smat_i->req_size;
2086:       req_source1 = smat_i->req_source1;

2088:       allcolumns[i] = smat_i->allcolumns;
2089:       row2proc[i]   = smat_i->row2proc;
2090:       rmap[i]       = smat_i->rmap;
2091:       cmap[i]       = smat_i->cmap;
2092:     }

2094:     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2096:       smat_i = (Mat_SubSppt *)submats[0]->data;

2098:       nrqs        = smat_i->nrqs;
2099:       nrqr        = smat_i->nrqr;
2100:       rbuf1       = smat_i->rbuf1;
2101:       rbuf2       = smat_i->rbuf2;
2102:       rbuf3       = smat_i->rbuf3;
2103:       req_source2 = smat_i->req_source2;

2105:       sbuf1 = smat_i->sbuf1;
2106:       sbuf2 = smat_i->sbuf2;
2107:       ptr   = smat_i->ptr;
2108:       tmp   = smat_i->tmp;
2109:       ctr   = smat_i->ctr;

2111:       pa          = smat_i->pa;
2112:       req_size    = smat_i->req_size;
2113:       req_source1 = smat_i->req_source1;

2115:       allcolumns[0] = PETSC_FALSE;
2116:     }
2117:   } else { /* scall == MAT_INITIAL_MATRIX */
2118:     /* Get some new tags to keep the communication clean */
2119:     PetscObjectGetNewTag((PetscObject)C, &tag2);
2120:     PetscObjectGetNewTag((PetscObject)C, &tag3);

2122:     /* evaluate communication - to whom messages go, their lengths, and the buffer space
2123:        required. Based on this, buffers are allocated and data is copied into them */
2124:     PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4); /* mesg size, initialize work vectors */

2126:     for (i = 0; i < ismax; i++) {
2127:       jmax   = nrow[i];
2128:       irow_i = irow[i];

2130:       PetscMalloc1(jmax, &row2proc_i);
2131:       row2proc[i] = row2proc_i;

2133:       if (issorted[i]) proc = 0;
2134:       for (j = 0; j < jmax; j++) {
2135:         if (!issorted[i]) proc = 0;
2136:         row = irow_i[j];
2137:         while (row >= C->rmap->range[proc + 1]) proc++;
2138:         w4[proc]++;
2139:         row2proc_i[j] = proc; /* map row index to proc */
2140:       }
2141:       for (j = 0; j < size; j++) {
2142:         if (w4[j]) {
2143:           w1[j] += w4[j];
2144:           w3[j]++;
2145:           w4[j] = 0;
2146:         }
2147:       }
2148:     }

2150:     nrqs     = 0; /* no of outgoing messages */
2151:     msz      = 0; /* total mesg length (for all procs) */
2152:     w1[rank] = 0; /* no mesg sent to self */
2153:     w3[rank] = 0;
2154:     for (i = 0; i < size; i++) {
2155:       if (w1[i]) {
2156:         w2[i] = 1;
2157:         nrqs++;
2158:       } /* there exists a message to proc i */
2159:     }
2160:     PetscMalloc1(nrqs, &pa); /* (proc array) */
2161:     for (i = 0, j = 0; i < size; i++) {
2162:       if (w1[i]) {
2163:         pa[j] = i;
2164:         j++;
2165:       }
2166:     }

2168:     /* Each message would have a header = 1 + 2*(no of IS) + data */
2169:     for (i = 0; i < nrqs; i++) {
2170:       j = pa[i];
2171:       w1[j] += w2[j] + 2 * w3[j];
2172:       msz += w1[j];
2173:     }
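    /* Worked example (illustrative): if this rank requests 3 rows of IS 0 and 5 rows of IS 2 from
       process j, then w3[j] = 2 and the message to j occupies w1[j] = 8 + 1 + 2*2 = 13 PetscInt slots:
       one count of participating ISs, two (IS index, row count) pairs, then the 8 row indices. */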
2174:     PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);

2176:     /* Determine the number of messages to expect, their lengths, and their from-ids */
2177:     PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
2178:     PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);

2180:     /* Now post the Irecvs corresponding to these messages */
2181:     PetscObjectGetNewTag((PetscObject)C, &tag0);
2182:     PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);

2184:     /* Allocate Memory for outgoing messages */
2185:     PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
2186:     PetscArrayzero(sbuf1, size);
2187:     PetscArrayzero(ptr, size);

2189:     {
2190:       PetscInt *iptr = tmp;
2191:       k              = 0;
2192:       for (i = 0; i < nrqs; i++) {
2193:         j = pa[i];
2194:         iptr += k;
2195:         sbuf1[j] = iptr;
2196:         k        = w1[j];
2197:       }
2198:     }

2200:     /* Form the outgoing messages. Initialize the header space */
2201:     for (i = 0; i < nrqs; i++) {
2202:       j           = pa[i];
2203:       sbuf1[j][0] = 0;
2204:       PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]);
2205:       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2206:     }

2208:     /* Parse the isrow and copy data into outbuf */
2209:     for (i = 0; i < ismax; i++) {
2210:       row2proc_i = row2proc[i];
2211:       PetscArrayzero(ctr, size);
2212:       irow_i = irow[i];
2213:       jmax   = nrow[i];
2214:       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2215:         proc = row2proc_i[j];
2216:         if (proc != rank) { /* copy to the outgoing buf*/
2217:           ctr[proc]++;
2218:           *ptr[proc] = irow_i[j];
2219:           ptr[proc]++;
2220:         }
2221:       }
2222:       /* Update the headers for the current IS */
2223:       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2224:         if ((ctr_j = ctr[j])) {
2225:           sbuf1_j            = sbuf1[j];
2226:           k                  = ++sbuf1_j[0];
2227:           sbuf1_j[2 * k]     = ctr_j;
2228:           sbuf1_j[2 * k - 1] = i;
2229:         }
2230:       }
2231:     }
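    /* Resulting layout of sbuf1[j] (descriptive comment):
         sbuf1[j][0]                   number of ISs that request rows from process j
         sbuf1[j][2k-1], sbuf1[j][2k]  (IS index, number of rows) for the k-th such IS
         sbuf1[j][2*w3[j]+1 ...]       the requested global row indices, grouped by IS in the same order */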

2233:     /* Now post the sends */
2234:     PetscMalloc1(nrqs, &s_waits1);
2235:     for (i = 0; i < nrqs; ++i) {
2236:       j = pa[i];
2237:       MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i);
2238:     }

2240:     /* Post Receives to capture the buffer size */
2241:     PetscMalloc1(nrqs, &r_waits2);
2242:     PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);
2243:     if (nrqs) rbuf2[0] = tmp + msz;
2244:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2245:     for (i = 0; i < nrqs; ++i) {
2246:       j = pa[i];
2247:       MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i);
2248:     }

2250:     /* Send to other procs the buf size they should allocate */
2251:     /* Receive messages*/
2252:     PetscMalloc1(nrqr, &s_waits2);
2253:     PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);
2254:     {
2255:       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2256:       PetscInt *sbuf2_i;

2258:       MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE);
2259:       for (i = 0; i < nrqr; ++i) {
2260:         req_size[i] = 0;
2261:         rbuf1_i     = rbuf1[i];
2262:         start       = 2 * rbuf1_i[0] + 1;
2263:         end         = olengths1[i];
2264:         PetscMalloc1(end, &sbuf2[i]);
2265:         sbuf2_i = sbuf2[i];
2266:         for (j = start; j < end; j++) {
2267:           id         = rbuf1_i[j] - rstart;
2268:           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2269:           sbuf2_i[j] = ncols;
2270:           req_size[i] += ncols;
2271:         }
2272:         req_source1[i] = onodes1[i];
2273:         /* form the header */
2274:         sbuf2_i[0] = req_size[i];
2275:         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

2277:         MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
2278:       }
2279:     }

2281:     PetscFree(onodes1);
2282:     PetscFree(olengths1);
2283:     PetscFree(r_waits1);
2284:     PetscFree4(w1, w2, w3, w4);

2286:     /* Receive messages*/
2287:     PetscMalloc1(nrqs, &r_waits3);
2288:     MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE);
2289:     for (i = 0; i < nrqs; ++i) {
2290:       PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
2291:       req_source2[i] = pa[i];
2292:       MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
2293:     }
2294:     PetscFree(r_waits2);

2296:     /* Wait on sends1 and sends2 */
2297:     MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);
2298:     MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
2299:     PetscFree(s_waits1);
2300:     PetscFree(s_waits2);

2302:     /* Now allocate sending buffers for a->j, and send them off */
2303:     PetscMalloc1(nrqr, &sbuf_aj);
2304:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2305:     if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
2306:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

2308:     PetscMalloc1(nrqr, &s_waits3);
2309:     {
2310:       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2311:       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2312:       PetscInt  cend = C->cmap->rend;
2313:       PetscInt *a_j = a->j, *b_j = b->j, ctmp;

2315:       for (i = 0; i < nrqr; i++) {
2316:         rbuf1_i   = rbuf1[i];
2317:         sbuf_aj_i = sbuf_aj[i];
2318:         ct1       = 2 * rbuf1_i[0] + 1;
2319:         ct2       = 0;
2320:         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2321:           kmax = rbuf1[i][2 * j];
2322:           for (k = 0; k < kmax; k++, ct1++) {
2323:             row    = rbuf1_i[ct1] - rstart;
2324:             nzA    = a_i[row + 1] - a_i[row];
2325:             nzB    = b_i[row + 1] - b_i[row];
2326:             ncols  = nzA + nzB;
2327:             cworkA = a_j + a_i[row];
2328:             cworkB = b_j + b_i[row];

2330:             /* load the column indices for this row into cols */
2331:             cols = sbuf_aj_i + ct2;

2333:             lwrite = 0;
2334:             for (l = 0; l < nzB; l++) {
2335:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2336:             }
2337:             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2338:             for (l = 0; l < nzB; l++) {
2339:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2340:             }

2342:             ct2 += ncols;
2343:           }
2344:         }
2345:         MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
2346:       }
2347:     }

2349:     /* create col map: global col of C -> local col of submatrices */
2350:     {
2351:       const PetscInt *icol_i;
2352: #if defined(PETSC_USE_CTABLE)
2353:       for (i = 0; i < ismax; i++) {
2354:         if (!allcolumns[i]) {
2355:           PetscTableCreate(ncol[i], C->cmap->N, &cmap[i]);

2357:           jmax   = ncol[i];
2358:           icol_i = icol[i];
2359:           cmap_i = cmap[i];
2360:           for (j = 0; j < jmax; j++) PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES);
2361:         } else cmap[i] = NULL;
2362:       }
2363: #else
2364:       for (i = 0; i < ismax; i++) {
2365:         if (!allcolumns[i]) {
2366:           PetscCalloc1(C->cmap->N, &cmap[i]);
2367:           jmax   = ncol[i];
2368:           icol_i = icol[i];
2369:           cmap_i = cmap[i];
2370:           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2371:         } else cmap[i] = NULL;
2372:       }
2373: #endif
2374:     }
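    /* Convention note: cmap[i] maps a global column of C to (local submatrix column + 1); a lookup of 0
       therefore means "column not selected by iscol[i]".  For allcolumns ISs cmap[i] stays NULL. */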

2376:     /* Create lens which is required for MatCreate... */
2377:     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2378:     PetscMalloc1(ismax, &lens);

2380:     if (ismax) PetscCalloc1(j, &lens[0]);
2381:     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];

2383:     /* Update lens from local data */
2384:     for (i = 0; i < ismax; i++) {
2385:       row2proc_i = row2proc[i];
2386:       jmax       = nrow[i];
2387:       if (!allcolumns[i]) cmap_i = cmap[i];
2388:       irow_i = irow[i];
2389:       lens_i = lens[i];
2390:       for (j = 0; j < jmax; j++) {
2391:         row  = irow_i[j];
2392:         proc = row2proc_i[j];
2393:         if (proc == rank) {
2394:           MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2395:           if (!allcolumns[i]) {
2396:             for (k = 0; k < ncols; k++) {
2397: #if defined(PETSC_USE_CTABLE)
2398:               PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2399: #else
2400:               tcol = cmap_i[cols[k]];
2401: #endif
2402:               if (tcol) lens_i[j]++;
2403:             }
2404:           } else { /* allcolumns */
2405:             lens_i[j] = ncols;
2406:           }
2407:           MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2408:         }
2409:       }
2410:     }

2412:     /* Create row map: global row of C -> local row of submatrices */
2413: #if defined(PETSC_USE_CTABLE)
2414:     for (i = 0; i < ismax; i++) {
2415:       PetscTableCreate(nrow[i], C->rmap->N, &rmap[i]);
2416:       irow_i = irow[i];
2417:       jmax   = nrow[i];
2418:       for (j = 0; j < jmax; j++) PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES);
2419:     }
2420: #else
2421:     for (i = 0; i < ismax; i++) {
2422:       PetscCalloc1(C->rmap->N, &rmap[i]);
2423:       rmap_i = rmap[i];
2424:       irow_i = irow[i];
2425:       jmax   = nrow[i];
2426:       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2427:     }
2428: #endif
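    /* Convention note: with PETSC_USE_CTABLE the row map stores (submatrix row + 1), so results of
       PetscTableFind() are decremented before use; the plain-array version stores the 0-based
       submatrix row directly. */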

2430:     /* Update lens from offproc data */
2431:     {
2432:       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;

2434:       MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE);
2435:       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2436:         sbuf1_i = sbuf1[pa[tmp2]];
2437:         jmax    = sbuf1_i[0];
2438:         ct1     = 2 * jmax + 1;
2439:         ct2     = 0;
2440:         rbuf2_i = rbuf2[tmp2];
2441:         rbuf3_i = rbuf3[tmp2];
2442:         for (j = 1; j <= jmax; j++) {
2443:           is_no  = sbuf1_i[2 * j - 1];
2444:           max1   = sbuf1_i[2 * j];
2445:           lens_i = lens[is_no];
2446:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2447:           rmap_i = rmap[is_no];
2448:           for (k = 0; k < max1; k++, ct1++) {
2449: #if defined(PETSC_USE_CTABLE)
2450:             PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row);
2451:             row--;
2453: #else
2454:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2455: #endif
2456:             max2 = rbuf2_i[ct1];
2457:             for (l = 0; l < max2; l++, ct2++) {
2458:               if (!allcolumns[is_no]) {
2459: #if defined(PETSC_USE_CTABLE)
2460:                 PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2461: #else
2462:                 tcol = cmap_i[rbuf3_i[ct2]];
2463: #endif
2464:                 if (tcol) lens_i[row]++;
2465:               } else {         /* allcolumns */
2466:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2467:               }
2468:             }
2469:           }
2470:         }
2471:       }
2472:     }
2473:     PetscFree(r_waits3);
2474:     MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE);
2475:     PetscFree(s_waits3);

2477:     /* Create the submatrices */
2478:     for (i = 0; i < ismax; i++) {
2479:       PetscInt rbs, cbs;

2481:       ISGetBlockSize(isrow[i], &rbs);
2482:       ISGetBlockSize(iscol[i], &cbs);

2484:       MatCreate(PETSC_COMM_SELF, submats + i);
2485:       MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE);

2487:       MatSetBlockSizes(submats[i], rbs, cbs);
2488:       MatSetType(submats[i], ((PetscObject)A)->type_name);
2489:       MatSeqAIJSetPreallocation(submats[i], 0, lens[i]);

2491:       /* create struct Mat_SubSppt and attach it to submat */
2492:       PetscNew(&smat_i);
2493:       subc            = (Mat_SeqAIJ *)submats[i]->data;
2494:       subc->submatis1 = smat_i;

2496:       smat_i->destroy          = submats[i]->ops->destroy;
2497:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2498:       submats[i]->factortype   = C->factortype;

2500:       smat_i->id          = i;
2501:       smat_i->nrqs        = nrqs;
2502:       smat_i->nrqr        = nrqr;
2503:       smat_i->rbuf1       = rbuf1;
2504:       smat_i->rbuf2       = rbuf2;
2505:       smat_i->rbuf3       = rbuf3;
2506:       smat_i->sbuf2       = sbuf2;
2507:       smat_i->req_source2 = req_source2;

2509:       smat_i->sbuf1 = sbuf1;
2510:       smat_i->ptr   = ptr;
2511:       smat_i->tmp   = tmp;
2512:       smat_i->ctr   = ctr;

2514:       smat_i->pa          = pa;
2515:       smat_i->req_size    = req_size;
2516:       smat_i->req_source1 = req_source1;

2518:       smat_i->allcolumns = allcolumns[i];
2519:       smat_i->singleis   = PETSC_FALSE;
2520:       smat_i->row2proc   = row2proc[i];
2521:       smat_i->rmap       = rmap[i];
2522:       smat_i->cmap       = cmap[i];
2523:     }

2525:     if (!ismax) { /* Create a dummy submats[0] to carry the reusable struct Mat_SubSppt */
2526:       MatCreate(PETSC_COMM_SELF, &submats[0]);
2527:       MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE);
2528:       MatSetType(submats[0], MATDUMMY);

2530:       /* create struct Mat_SubSppt and attach it to submat */
2531:       PetscNew(&smat_i);
2532:       submats[0]->data = (void *)smat_i;

2534:       smat_i->destroy          = submats[0]->ops->destroy;
2535:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2536:       submats[0]->factortype   = C->factortype;

2538:       smat_i->id          = 0;
2539:       smat_i->nrqs        = nrqs;
2540:       smat_i->nrqr        = nrqr;
2541:       smat_i->rbuf1       = rbuf1;
2542:       smat_i->rbuf2       = rbuf2;
2543:       smat_i->rbuf3       = rbuf3;
2544:       smat_i->sbuf2       = sbuf2;
2545:       smat_i->req_source2 = req_source2;

2547:       smat_i->sbuf1 = sbuf1;
2548:       smat_i->ptr   = ptr;
2549:       smat_i->tmp   = tmp;
2550:       smat_i->ctr   = ctr;

2552:       smat_i->pa          = pa;
2553:       smat_i->req_size    = req_size;
2554:       smat_i->req_source1 = req_source1;

2556:       smat_i->allcolumns = PETSC_FALSE;
2557:       smat_i->singleis   = PETSC_FALSE;
2558:       smat_i->row2proc   = NULL;
2559:       smat_i->rmap       = NULL;
2560:       smat_i->cmap       = NULL;
2561:     }

2563:     if (ismax) PetscFree(lens[0]);
2564:     PetscFree(lens);
2565:     if (sbuf_aj) {
2566:       PetscFree(sbuf_aj[0]);
2567:       PetscFree(sbuf_aj);
2568:     }

2570:   } /* end of scall == MAT_INITIAL_MATRIX */

2572:   /* Post recv matrix values */
2573:   PetscObjectGetNewTag((PetscObject)C, &tag4);
2574:   PetscMalloc1(nrqs, &rbuf4);
2575:   PetscMalloc1(nrqs, &r_waits4);
2576:   for (i = 0; i < nrqs; ++i) {
2577:     PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
2578:     MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
2579:   }

2581:   /* Allocate sending buffers for a->a, and send them off */
2582:   PetscMalloc1(nrqr, &sbuf_aa);
2583:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2584:   if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
2585:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

2587:   PetscMalloc1(nrqr, &s_waits4);
2588:   {
2589:     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2590:     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2591:     PetscInt     cend = C->cmap->rend;
2592:     PetscInt    *b_j  = b->j;
2593:     PetscScalar *vworkA, *vworkB, *a_a, *b_a;

2595:     MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
2596:     MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a);
2597:     for (i = 0; i < nrqr; i++) {
2598:       rbuf1_i   = rbuf1[i];
2599:       sbuf_aa_i = sbuf_aa[i];
2600:       ct1       = 2 * rbuf1_i[0] + 1;
2601:       ct2       = 0;
2602:       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2603:         kmax = rbuf1_i[2 * j];
2604:         for (k = 0; k < kmax; k++, ct1++) {
2605:           row    = rbuf1_i[ct1] - rstart;
2606:           nzA    = a_i[row + 1] - a_i[row];
2607:           nzB    = b_i[row + 1] - b_i[row];
2608:           ncols  = nzA + nzB;
2609:           cworkB = b_j + b_i[row];
2610:           vworkA = a_a + a_i[row];
2611:           vworkB = b_a + b_i[row];

2613:           /* load the column values for this row into vals */
2614:           vals = sbuf_aa_i + ct2;

2616:           lwrite = 0;
2617:           for (l = 0; l < nzB; l++) {
2618:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2619:           }
2620:           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2621:           for (l = 0; l < nzB; l++) {
2622:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2623:           }

2625:           ct2 += ncols;
2626:         }
2627:       }
2628:       MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
2629:     }
2630:     MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
2631:     MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a);
2632:   }

2634:   /* Assemble the matrices */
2635:   /* First assemble the local rows */
2636:   for (i = 0; i < ismax; i++) {
2637:     row2proc_i = row2proc[i];
2638:     subc       = (Mat_SeqAIJ *)submats[i]->data;
2639:     imat_ilen  = subc->ilen;
2640:     imat_j     = subc->j;
2641:     imat_i     = subc->i;
2642:     imat_a     = subc->a;

2644:     if (!allcolumns[i]) cmap_i = cmap[i];
2645:     rmap_i = rmap[i];
2646:     irow_i = irow[i];
2647:     jmax   = nrow[i];
2648:     for (j = 0; j < jmax; j++) {
2649:       row  = irow_i[j];
2650:       proc = row2proc_i[j];
2651:       if (proc == rank) {
2652:         old_row = row;
2653: #if defined(PETSC_USE_CTABLE)
2654:         PetscTableFind(rmap_i, row + 1, &row);
2655:         row--;
2656: #else
2657:         row = rmap_i[row];
2658: #endif
2659:         ilen_row = imat_ilen[row];
2660:         MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);
2661:         mat_i = imat_i[row];
2662:         mat_a = imat_a + mat_i;
2663:         mat_j = imat_j + mat_i;
2664:         if (!allcolumns[i]) {
2665:           for (k = 0; k < ncols; k++) {
2666: #if defined(PETSC_USE_CTABLE)
2667:             PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2668: #else
2669:             tcol = cmap_i[cols[k]];
2670: #endif
2671:             if (tcol) {
2672:               *mat_j++ = tcol - 1;
2673:               *mat_a++ = vals[k];
2674:               ilen_row++;
2675:             }
2676:           }
2677:         } else { /* allcolumns */
2678:           for (k = 0; k < ncols; k++) {
2679:             *mat_j++ = cols[k]; /* global col index! */
2680:             *mat_a++ = vals[k];
2681:             ilen_row++;
2682:           }
2683:         }
2684:         MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);

2686:         imat_ilen[row] = ilen_row;
2687:       }
2688:     }
2689:   }

2691:   /* Now assemble the off-proc rows */
2692:   MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE);
2693:   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2694:     sbuf1_i = sbuf1[pa[tmp2]];
2695:     jmax    = sbuf1_i[0];
2696:     ct1     = 2 * jmax + 1;
2697:     ct2     = 0;
2698:     rbuf2_i = rbuf2[tmp2];
2699:     rbuf3_i = rbuf3[tmp2];
2700:     rbuf4_i = rbuf4[tmp2];
2701:     for (j = 1; j <= jmax; j++) {
2702:       is_no  = sbuf1_i[2 * j - 1];
2703:       rmap_i = rmap[is_no];
2704:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2705:       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2706:       imat_ilen = subc->ilen;
2707:       imat_j    = subc->j;
2708:       imat_i    = subc->i;
2709:       imat_a    = subc->a;
2710:       max1      = sbuf1_i[2 * j];
2711:       for (k = 0; k < max1; k++, ct1++) {
2712:         row = sbuf1_i[ct1];
2713: #if defined(PETSC_USE_CTABLE)
2714:         PetscTableFind(rmap_i, row + 1, &row);
2715:         row--;
2716: #else
2717:         row = rmap_i[row];
2718: #endif
2719:         ilen  = imat_ilen[row];
2720:         mat_i = imat_i[row];
2721:         mat_a = imat_a + mat_i;
2722:         mat_j = imat_j + mat_i;
2723:         max2  = rbuf2_i[ct1];
2724:         if (!allcolumns[is_no]) {
2725:           for (l = 0; l < max2; l++, ct2++) {
2726: #if defined(PETSC_USE_CTABLE)
2727:             PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2728: #else
2729:             tcol = cmap_i[rbuf3_i[ct2]];
2730: #endif
2731:             if (tcol) {
2732:               *mat_j++ = tcol - 1;
2733:               *mat_a++ = rbuf4_i[ct2];
2734:               ilen++;
2735:             }
2736:           }
2737:         } else { /* allcolumns */
2738:           for (l = 0; l < max2; l++, ct2++) {
2739:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2740:             *mat_a++ = rbuf4_i[ct2];
2741:             ilen++;
2742:           }
2743:         }
2744:         imat_ilen[row] = ilen;
2745:       }
2746:     }
2747:   }

2749:   if (!iscsorted) { /* sort column indices of the rows */
2750:     for (i = 0; i < ismax; i++) {
2751:       subc      = (Mat_SeqAIJ *)submats[i]->data;
2752:       imat_j    = subc->j;
2753:       imat_i    = subc->i;
2754:       imat_a    = subc->a;
2755:       imat_ilen = subc->ilen;

2757:       if (allcolumns[i]) continue;
2758:       jmax = nrow[i];
2759:       for (j = 0; j < jmax; j++) {
2760:         mat_i = imat_i[j];
2761:         mat_a = imat_a + mat_i;
2762:         mat_j = imat_j + mat_i;
2763:         PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a);
2764:       }
2765:     }
2766:   }

2768:   PetscFree(r_waits4);
2769:   MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE);
2770:   PetscFree(s_waits4);

2772:   /* Restore the indices */
2773:   for (i = 0; i < ismax; i++) {
2774:     ISRestoreIndices(isrow[i], irow + i);
2775:     if (!allcolumns[i]) ISRestoreIndices(iscol[i], icol + i);
2776:   }

2778:   for (i = 0; i < ismax; i++) {
2779:     MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY);
2780:     MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY);
2781:   }

2783:   /* Destroy allocated memory */
2784:   if (sbuf_aa) {
2785:     PetscFree(sbuf_aa[0]);
2786:     PetscFree(sbuf_aa);
2787:   }
2788:   PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted);

2790:   for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
2791:   PetscFree(rbuf4);

2793:   PetscFree4(row2proc, cmap, rmap, allcolumns);
2794:   return 0;
2795: }

2797: /*
2798:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2799:  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2800:  of [0,m), the range of dcolemb is a subset of [0,n), and the range of ocolemb is a subset of [0,N-n).
2801:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2802:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2803:  state, and needs to be "assembled" later by compressing B's column space.

2805:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2806:  Following this call, C->A & C->B have been created, even if empty.
2807:  */
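/* Illustrative note on the "disassembled" state (hypothetical numbers): if C has N = 10 global columns
   and this rank owns columns 4..6, then the disassembled aij->B is created with N = 10 columns and its
   entries carry global column indices such as 1 or 8.  A later assembly of C compresses B's column
   space in MatSetUpMultiply_MPIAIJ(), after which B uses local column indices interpreted through
   garray (here garray = {1, 8}). */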
2808: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2809: {
2810:   /* If making this function public, change the error returned in this function away from _PLIB. */
2811:   Mat_MPIAIJ     *aij;
2812:   Mat_SeqAIJ     *Baij;
2813:   PetscBool       seqaij, Bdisassembled;
2814:   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2815:   PetscScalar     v;
2816:   const PetscInt *rowindices, *colindices;

2818:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2819:   if (A) {
2820:     PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);
2822:     if (rowemb) {
2823:       ISGetLocalSize(rowemb, &m);
2825:     } else {
2827:     }
2828:     if (dcolemb) {
2829:       ISGetLocalSize(dcolemb, &n);
2831:     } else {
2833:     }
2834:   }
2835:   if (B) {
2836:     PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);
2838:     if (rowemb) {
2839:       ISGetLocalSize(rowemb, &m);
2841:     } else {
2843:     }
2844:     if (ocolemb) {
2845:       ISGetLocalSize(ocolemb, &n);
2847:     } else {
2849:     }
2850:   }

2852:   aij = (Mat_MPIAIJ *)C->data;
2853:   if (!aij->A) {
2854:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2855:     MatCreate(PETSC_COMM_SELF, &aij->A);
2856:     MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n);
2857:     MatSetBlockSizesFromMats(aij->A, C, C);
2858:     MatSetType(aij->A, MATSEQAIJ);
2859:   }
2860:   if (A) {
2861:     MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A);
2862:   } else {
2863:     MatSetUp(aij->A);
2864:   }
2865:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2866:     /*
2867:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2868:       need to "disassemble" B -- convert it to using C's global indices.
2869:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2871:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2872:       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2874:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2875:       At least avoid calling MatSetValues() and the implied searches?
2876:     */

2878:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2879: #if defined(PETSC_USE_CTABLE)
2880:       PetscTableDestroy(&aij->colmap);
2881: #else
2882:       PetscFree(aij->colmap);
2883:       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2884: #endif
2885:       ngcol = 0;
2886:       if (aij->lvec) VecGetSize(aij->lvec, &ngcol);
2887:       if (aij->garray) { PetscFree(aij->garray); }
2888:       VecDestroy(&aij->lvec);
2889:       VecScatterDestroy(&aij->Mvctx);
2890:     }
2891:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) MatDestroy(&aij->B);
2892:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) MatZeroEntries(aij->B);
2893:   }
2894:   Bdisassembled = PETSC_FALSE;
2895:   if (!aij->B) {
2896:     MatCreate(PETSC_COMM_SELF, &aij->B);
2897:     MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N);
2898:     MatSetBlockSizesFromMats(aij->B, B, B);
2899:     MatSetType(aij->B, MATSEQAIJ);
2900:     Bdisassembled = PETSC_TRUE;
2901:   }
2902:   if (B) {
2903:     Baij = (Mat_SeqAIJ *)B->data;
2904:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2905:       PetscMalloc1(B->rmap->n, &nz);
2906:       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2907:       MatSeqAIJSetPreallocation(aij->B, 0, nz);
2908:       PetscFree(nz);
2909:     }

2911:     PetscLayoutGetRange(C->rmap, &rstart, &rend);
2912:     shift      = rend - rstart;
2913:     count      = 0;
2914:     rowindices = NULL;
2915:     colindices = NULL;
2916:     if (rowemb) ISGetIndices(rowemb, &rowindices);
2917:     if (ocolemb) ISGetIndices(ocolemb, &colindices);
2918:     for (i = 0; i < B->rmap->n; i++) {
2919:       PetscInt row;
2920:       row = i;
2921:       if (rowindices) row = rowindices[i];
2922:       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2923:         col = Baij->j[count];
2924:         if (colindices) col = colindices[col];
2925:         if (Bdisassembled && col >= rstart) col += shift;
2926:         v = Baij->a[count];
2927:         MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES);
2928:         ++count;
2929:       }
2930:     }
2931:     /* No assembly for aij->B is necessary. */
2932:     /* FIXME: set aij->B's nonzerostate correctly. */
2933:   } else {
2934:     MatSetUp(aij->B);
2935:   }
2936:   C->preallocated  = PETSC_TRUE;
2937:   C->was_assembled = PETSC_FALSE;
2938:   C->assembled     = PETSC_FALSE;
2939:   /*
2940:       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2941:       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2942:    */
2943:   return 0;
2944: }

2946: /*
2947:   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
2948:  */
2949: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2950: {
2951:   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;

2955:   /* FIXME: make sure C is assembled */
2956:   *A = aij->A;
2957:   *B = aij->B;
2958:   /* Note that we don't incref *A and *B, so be careful! */
2959:   return 0;
2960: }
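/* Illustrative interpretation (a minimal sketch; k is a hypothetical entry index): A's column indices
   are local to the diagonal block, so local column j of A corresponds to global column
   C->cmap->rstart + j of C, while B's column indices must be translated through garray:

     Mat A, B;
     MatGetSeqMats_MPIAIJ(C, &A, &B);
     Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)C->data;
     Mat_SeqAIJ *Bseq = (Mat_SeqAIJ *)B->data;
     PetscInt    gcol = aij->garray[Bseq->j[k]];    global column of C for the k-th stored entry of B
*/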

2962: /*
2963:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2964:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2965: */
2966: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
2967: {
2968:   PetscMPIInt size, flag;
2969:   PetscInt    i, ii, cismax, ispar;
2970:   Mat        *A, *B;
2971:   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

2973:   if (!ismax) return 0;

2975:   for (i = 0, cismax = 0; i < ismax; ++i) {
2976:     MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag);
2977:     PetscCheck(flag == MPI_IDENT || flag == MPI_CONGRUENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have congruent communicators");
2978:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
2979:     if (size > 1) ++cismax;
2980:   }

2982:   /*
2983:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2984:      ispar is the maximum over C's ranks of the local count of parallel ISs, so it is nonzero iff some rank has a parallel IS.
2985:   */
2986:   MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
2987:   if (!ispar) { /* Sequential ISs only across C's comm, so we can call the sequential matrix extraction subroutine. */
2988:     (*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat);
2989:     return 0;
2990:   }

2992:   /* From here on ispar != 0, i.e., at least one rank holds a parallel IS. */
2993:   /*
2994:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2995:     These are used to extract the off-diag portion of the resulting parallel matrix.
2996:     The row IS for the off-diag portion is the same as for the diag portion,
2997:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2998:   */
2999:   PetscMalloc2(cismax, &cisrow, cismax, &ciscol);
3000:   PetscMalloc1(cismax, &ciscol_p);
3001:   for (i = 0, ii = 0; i < ismax; ++i) {
3002:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3003:     if (size > 1) {
3004:       /*
3005:          TODO: This is the part that's ***NOT SCALABLE***.
3006:          To fix this we need to extract just the indices of C's nonzero columns
3007:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3008:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3009:          to be done without serializing on the IS list, so, most likely, it is best
3010:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3011:       */
3012:       ISGetNonlocalIS(iscol[i], &(ciscol[ii]));
3013:       /* Now we have to
3014:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3015:              were sorted on each rank, concatenated they might no longer be sorted;
3016:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3017:              indices in the nondecreasing order to the original index positions.
3018:          If ciscol[ii] is strictly increasing, the permutation IS is NULL; a small sketch of this sort-and-permute pattern follows this function.
3019:       */
3020:       ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii);
3021:       ISSort(ciscol[ii]);
3022:       ++ii;
3023:     }
3024:   }
3025:   PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p);
3026:   for (i = 0, ii = 0; i < ismax; ++i) {
3027:     PetscInt        j, issize;
3028:     const PetscInt *indices;

3030:     /*
3031:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3032:      */
3033:     ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i);
3034:     ISSort(isrow[i]);
3035:     ISGetLocalSize(isrow[i], &issize);
3036:     ISGetIndices(isrow[i], &indices);
3037:     for (j = 1; j < issize; ++j) {
3038:       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3039:     }
3040:     ISRestoreIndices(isrow[i], &indices);
3041:     ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i);
3042:     ISSort(iscol[i]);
3043:     ISGetLocalSize(iscol[i], &issize);
3044:     ISGetIndices(iscol[i], &indices);
3045:     for (j = 1; j < issize; ++j) {
3046:       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3047:     }
3048:     ISRestoreIndices(iscol[i], &indices);
3049:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3050:     if (size > 1) {
3051:       cisrow[ii] = isrow[i];
3052:       ++ii;
3053:     }
3054:   }
3055:   /*
3056:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3057:     array of sequential matrices underlying the resulting parallel matrices.
3058:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3059:     contain duplicates.

3061:     There are as many diag matrices as there are original index sets. There are only as many parallel
3062:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3064:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3065:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3066:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3067:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3068:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3069:     - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
3070:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3071:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3072:       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3073:       values into A[i] and B[ii] sitting inside the corresponding submat.
3074:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3075:       A[i], B[ii], AA[i] or BB[ii] matrices.
3076:   */
3077:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3078:   if (scall == MAT_INITIAL_MATRIX) PetscMalloc1(ismax, submat);

3080:   /* Now obtain the sequential A and B submatrices separately. */
3081:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3082:   (*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A);
3083:   (*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B);

3085:   /*
3086:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3087:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3088:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3089:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3090:     to have the same sparsity pattern.
3091:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3092:     must be constructed for C. This is done by setseqmat(s).
3093:   */
3094:   for (i = 0, ii = 0; i < ismax; ++i) {
3095:     /*
3096:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3097:        That way we can avoid sorting and computing permutations when reusing.
3098:        To this end:
3099:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3100:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3101:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3102:     */
3103:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3105:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3106:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3107:     if (size > 1) {
3108:       if (scall == MAT_INITIAL_MATRIX) {
3109:         MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i);
3110:         MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE);
3111:         MatSetType((*submat)[i], MATMPIAIJ);
3112:         PetscLayoutSetUp((*submat)[i]->rmap);
3113:         PetscLayoutSetUp((*submat)[i]->cmap);
3114:       }
3115:       /*
3116:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3117:       */
3118:       {
3119:         Mat AA = A[i], BB = B[ii];

3121:         if (AA || BB) {
3122:           setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB);
3123:           MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY);
3124:           MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY);
3125:         }
3126:         MatDestroy(&AA);
3127:       }
3128:       ISDestroy(ciscol + ii);
3129:       ISDestroy(ciscol_p + ii);
3130:       ++ii;
3131:     } else { /* if (size == 1) */
3132:       if (scall == MAT_REUSE_MATRIX) MatDestroy(&(*submat)[i]);
3133:       if (isrow_p[i] || iscol_p[i]) {
3134:         MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i);
3135:         setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]);
3136:         /* Otherwise A is extracted straight into (*submat)[i]. */
3137:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3138:         MatDestroy(A + i);
3139:       } else (*submat)[i] = A[i];
3140:     }
3141:     ISDestroy(&isrow_p[i]);
3142:     ISDestroy(&iscol_p[i]);
3143:   }
3144:   PetscFree2(cisrow, ciscol);
3145:   PetscFree2(isrow_p, iscol_p);
3146:   PetscFree(ciscol_p);
3147:   PetscFree(A);
3148:   MatDestroySubMatrices(cismax, &B);
3149:   return 0;
3150: }
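/*
  Editorial sketch, not part of mpiov.c, referenced from the sort-and-permute comments in
  MatCreateSubMatricesMPI_MPIXAIJ() above.  ISSortPermutation() with PETSC_FALSE returns a NULL
  permutation when the IS is already sorted; otherwise the permutation maps positions of the
  sorted IS back to the original positions, which is what the *_p index sets above are later
  used for.  The wrapper name is hypothetical.
*/
static PetscErrorCode SortISKeepingPermutation_Sketch(IS is, IS *perm)
{
  ISSortPermutation(is, PETSC_FALSE, perm); /* *perm is NULL if 'is' was already nondecreasing */
  ISSort(is);                               /* sort in place; *perm maps sorted positions to original ones */
  return 0;
}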

3152: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3153: {
3154:   MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ);
3155:   return 0;
3156: }
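/*
  Editorial usage sketch, not part of mpiov.c: the routine above is reached through the public
  MatCreateSubMatricesMPI() interface when C is of type MATMPIAIJ and some index sets live on
  communicators with more than one rank.  Names below are hypothetical; error checking is omitted
  to match the style of this listing.
*/
static PetscErrorCode ExtractParallelSubmats_Sketch(Mat C, PetscInt n, IS isrow[], IS iscol[])
{
  Mat *submat;

  MatCreateSubMatricesMPI(C, n, isrow, iscol, MAT_INITIAL_MATRIX, &submat); /* allocates submat[0..n-1] */
  /* ... use the (possibly parallel) submatrices ... */
  MatDestroySubMatrices(n, &submat);
  return 0;
}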