Actual source code: sfpack.c

  1: #include "petsc/private/sfimpl.h"
  2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>

  5: /* This C file contains packing facilities, with dispatch to device routines when enabled. */

  7: #if defined(PETSC_HAVE_CUDA)
  8: #include <cuda_runtime.h>
  9: #include <petsccublas.h>
 10: #endif
 11: #if defined(PETSC_HAVE_HIP)
 12: #include <hip/hip_runtime.h>
 13: #include <petschipblas.h>
 14: #endif
 15: /*
 16:  * MPI_Reduce_local is not useful here because it cannot handle sparse data and it vectorizes "in the wrong direction";
 17:  * therefore we pack data types manually. This file defines packing routines for the standard data types.
 18:  */

 20: #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
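/* e.g., CPPJoin4(Pack,PetscInt,2,1) expands to the identifier Pack_PetscInt_2_1 */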

 22: /* Operations working like s += t */
 23: #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
 24: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
 25: #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
 26: #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
 27: /* Ref MPI MAXLOC */
 28: #define OP_XLOC(op,s,t) \
 29:   do {                                       \
 30:     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
 31:     else if (!((s).u op (t).u)) s = t;           \
 32:   } while (0)
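/* For example, OP_XLOC(>,s,t) on pairs s = {u,i} keeps the larger u and, on ties, the smaller index i,
   i.e., MPI_MAXLOC semantics; OP_XLOC(<,s,t) gives MPI_MINLOC semantics. */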

 34: /* DEF_PackFunc - macro defining a Pack routine

 36:    Arguments of the macro:
 37:    +Type      Type of the basic data in an entry, e.g., int, PetscInt, PetscReal. It is not the type of an entry.
 38:    .BS        Block size for vectorization. It is a factor of bs.
 39:    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

 41:    Arguments of the Pack routine:
 42:    +count     Number of indices in idx[].
 43:    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
 44:    .opt       Per-pack optimization plan. NULL means no such plan.
 45:    .idx       Indices of the entries to be packed.
 46:    .link      Provides a context for the current call; e.g., link->bs is the number of basic types in an entry. For example, if unit is MPI_2INT, then bs=2 and the basic type is int.
 47:    .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
 48:    -packed    Address of the packed data.
 49:  */
 50: #define DEF_PackFunc(Type,BS,EQ) \
 51:   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
 52:   {                                                                                                          \
 54:     const Type     *u = (const Type*)unpacked,*u2;                                                           \
 55:     Type           *p = (Type*)packed,*p2;                                                                   \
 56:     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
 57:     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
 58:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
 60:     if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);}/* idx[] are contiguous */       \
 61:     else if (opt) { /* has optimizations available */                                                        \
 62:       p2 = p;                                                                                                \
 63:       for (r=0; r<opt->n; r++) {                                                                             \
 64:         u2 = u + opt->start[r]*MBS;                                                                          \
 65:         X  = opt->X[r];                                                                                      \
 66:         Y  = opt->Y[r];                                                                                      \
 67:         for (k=0; k<opt->dz[r]; k++)                                                                         \
 68:           for (j=0; j<opt->dy[r]; j++) {                                                                     \
 69:             PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);                        \
 70:             p2  += opt->dx[r]*MBS;                                                                           \
 71:           }                                                                                                  \
 72:       }                                                                                                      \
 73:     } else {                                                                                                 \
 74:       for (i=0; i<count; i++)                                                                                \
 75:         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
 76:           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
 77:             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
 78:     }                                                                                                        \
 79:     return(0);                                                                                  \
 80:   }
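/* For instance, DEF_PackFunc(PetscInt,2,1) defines Pack_PetscInt_2_1(), used when an entry is exactly
   two PetscInts (bs == BS == 2, so M == 1), so each entry is packed with a fixed-length, vectorizable copy. */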

 82: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
 83:                 and inserts it into a sparse array.

 85:    Arguments:
 86:   .Type       Type of the data
 87:   .BS         Block size for vectorization
 88:   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

 90:   Notes:
 91:    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 92:  */
 93: #define DEF_UnpackFunc(Type,BS,EQ)               \
 94:   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
 95:   {                                                                                                          \
 97:     Type           *u = (Type*)unpacked,*u2;                                                                 \
 98:     const Type     *p = (const Type*)packed;                                                                 \
 99:     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
100:     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
101:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
103:     if (!idx) {                                                                                              \
104:       u += start*MBS;                                                                                        \
105:       if (u != p) {PetscArraycpy(u,p,count*MBS);}                                       \
106:     } else if (opt) { /* has optimizations available */                                                      \
107:       for (r=0; r<opt->n; r++) {                                                                             \
108:         u2 = u + opt->start[r]*MBS;                                                                          \
109:         X  = opt->X[r];                                                                                      \
110:         Y  = opt->Y[r];                                                                                      \
111:         for (k=0; k<opt->dz[r]; k++)                                                                         \
112:           for (j=0; j<opt->dy[r]; j++) {                                                                     \
113:             PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);                         \
114:             p   += opt->dx[r]*MBS;                                                                           \
115:           }                                                                                                  \
116:       }                                                                                                      \
117:     } else {                                                                                                 \
118:       for (i=0; i<count; i++)                                                                                \
119:         for (j=0; j<M; j++)                                                                                  \
120:           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
121:     }                                                                                                        \
122:     return(0);                                                                                  \
123:   }

125: /* DEF_UnpackAndOp - macro defining an UnpackAndOp routine, where Op must not be Insert

127:    Arguments:
128:  +Type       Type of the data
129:  .BS         Block size for vectorization
130:  .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
131:  .Opname     Name of the Op, such as Add, Mult, LAND, etc.
132:  .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133:  -OpApply    Macro applying the op; one of OP_BINARY, OP_FUNCTION, OP_LXOR, OP_XLOC
134:  */
135: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136:   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137:   {                                                                                                          \
138:     Type           *u = (Type*)unpacked,*u2;                                                                 \
139:     const Type     *p = (const Type*)packed;                                                                 \
140:     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
141:     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
142:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
144:     if (!idx) {                                                                                              \
145:       u += start*MBS;                                                                                        \
146:       for (i=0; i<count; i++)                                                                                \
147:         for (j=0; j<M; j++)                                                                                  \
148:           for (k=0; k<BS; k++)                                                                               \
149:             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
150:     } else if (opt) { /* idx[] has patterns */                                                               \
151:       for (r=0; r<opt->n; r++) {                                                                             \
152:         u2 = u + opt->start[r]*MBS;                                                                          \
153:         X  = opt->X[r];                                                                                      \
154:         Y  = opt->Y[r];                                                                                      \
155:         for (k=0; k<opt->dz[r]; k++)                                                                         \
156:           for (j=0; j<opt->dy[r]; j++) {                                                                     \
157:             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
158:             p += opt->dx[r]*MBS;                                                                             \
159:           }                                                                                                  \
160:       }                                                                                                      \
161:     } else {                                                                                                 \
162:       for (i=0; i<count; i++)                                                                                \
163:         for (j=0; j<M; j++)                                                                                  \
164:           for (k=0; k<BS; k++)                                                                               \
165:             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
166:     }                                                                                                        \
167:     return(0);                                                                                  \
168:   }
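/* For example, DEF_UnpackAndOp(PetscReal,1,1,Add,+,OP_BINARY) defines UnpackAndAdd_PetscReal_1_1(),
   which performs u[idx[i]] += p[i] for each packed entry. */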

170: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
171:   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
172:   {                                                                                                          \
173:     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
174:     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
175:     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
176:     const PetscInt MBS = M*BS;                                                                               \
178:     for (i=0; i<count; i++) {                                                                                \
179:       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
180:       l = i*MBS;                                                                                             \
181:       for (j=0; j<M; j++)                                                                                    \
182:         for (k=0; k<BS; k++) {                                                                               \
183:           tmp = u[r+j*BS+k];                                                                                 \
184:           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
185:           p[l+j*BS+k] = tmp;                                                                                 \
186:         }                                                                                                    \
187:     }                                                                                                        \
188:     return(0);                                                                                  \
189:   }
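/* FetchAndOp combines a fetch with the update: the old value of u is returned in p while p's original
   value is op'ed into u; e.g., FetchAndAdd_PetscInt_1_1() implements a fetch-and-add on PetscInt entries. */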

191: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192:   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193:   {                                                                                                          \
195:     const Type     *u = (const Type*)src;                                                                    \
196:     Type           *v = (Type*)dst;                                                                          \
197:     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
198:     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
199:     const PetscInt MBS = M*BS;                                                                               \
201:     if (!srcIdx) { /* src is contiguous */                                                                   \
202:       u += srcStart*MBS;                                                                                     \
203:       CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);  \
204:     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
205:       u += srcOpt->start[0]*MBS;                                                                             \
206:       v += dstStart*MBS;                                                                                     \
207:       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
208:       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
209:         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
210:           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
211:           v += srcOpt->dx[0]*MBS;                                                                            \
212:         }                                                                                                    \
213:     } else { /* all other cases */                                                                           \
214:       for (i=0; i<count; i++) {                                                                              \
215:         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
216:         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
217:         for (j=0; j<M; j++)                                                                                  \
218:           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
219:       }                                                                                                      \
220:     }                                                                                                        \
221:     return(0);                                                                                  \
222:   }
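/* ScatterAndOp moves data directly from src to dst without an intermediate pack/unpack buffer, e.g.,
   ScatterAndAdd_PetscReal_1_1() does dst[dstIdx[i]] += src[srcIdx[i]], with fast paths when one side is
   contiguous or follows a 3D pattern. */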

224: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225:   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226:   {                                                                                                          \
227:     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
228:     const Type     *ldata = (const Type*)leafdata;                                                           \
229:     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
230:     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
231:     const PetscInt MBS = M*BS;                                                                               \
233:     for (i=0; i<count; i++) {                                                                                \
234:       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
235:       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
236:       for (j=0; j<M; j++)                                                                                    \
237:         for (k=0; k<BS; k++) {                                                                               \
238:           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
239:           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
240:         }                                                                                                    \
241:     }                                                                                                        \
242:     return(0);                                                                                  \
243:   }
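/* The Local variant is the on-process counterpart of FetchAndOp: leafupdate[] receives the old root
   values while leafdata[] is op'ed into rootdata[] in the same pass. */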

245: /* Pack, Unpack/Fetch ops */
246: #define DEF_Pack(Type,BS,EQ)                                                      \
247:   DEF_PackFunc(Type,BS,EQ)                                                        \
248:   DEF_UnpackFunc(Type,BS,EQ)                                                      \
249:   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
250:   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
251:     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
252:     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
253:     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
254:   }

256: /* Add, Mult ops */
257: #define DEF_Add(Type,BS,EQ)                                                       \
258:   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
259:   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
260:   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
261:   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
262:   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
263:   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
264:   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
265:     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
266:     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
267:     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
268:     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
269:     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
270:     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
271:   }

273: /* Max, Min ops */
274: #define DEF_Cmp(Type,BS,EQ)                                                       \
275:   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
276:   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
277:   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
278:   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
279:   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
280:     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
281:     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
282:     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
283:     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
284:   }

286: /* Logical ops.
287:   The operator passed to OP_LXOR is unused and would ideally be empty; we pass || merely to avoid
288:   the compilation warning "empty macro arguments are undefined in ISO C90".
289:  */
290: #define DEF_Log(Type,BS,EQ)                                                       \
291:   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
292:   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
293:   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
294:   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
295:   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
296:   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
297:   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
298:     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
299:     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
300:     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
301:     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
302:     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
303:     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
304:   }

306: /* Bitwise ops */
307: #define DEF_Bit(Type,BS,EQ)                                                       \
308:   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
309:   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
310:   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
311:   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
312:   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
313:   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
314:   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
315:     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
316:     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
317:     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
318:     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
319:     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
320:     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
321:   }

323: /* Maxloc, Minloc ops */
324: #define DEF_Xloc(Type,BS,EQ)                                                      \
325:   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
326:   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
327:   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
328:   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
329:   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
330:     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
331:     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
332:     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
333:     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
334:   }

336: #define DEF_IntegerType(Type,BS,EQ)                                               \
337:   DEF_Pack(Type,BS,EQ)                                                            \
338:   DEF_Add(Type,BS,EQ)                                                             \
339:   DEF_Cmp(Type,BS,EQ)                                                             \
340:   DEF_Log(Type,BS,EQ)                                                             \
341:   DEF_Bit(Type,BS,EQ)                                                             \
342:   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
343:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
344:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
345:     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
346:     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
347:     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
348:   }

350: #define DEF_RealType(Type,BS,EQ)                                                  \
351:   DEF_Pack(Type,BS,EQ)                                                            \
352:   DEF_Add(Type,BS,EQ)                                                             \
353:   DEF_Cmp(Type,BS,EQ)                                                             \
354:   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
355:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
356:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
357:     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
358:   }

360: #if defined(PETSC_HAVE_COMPLEX)
361: #define DEF_ComplexType(Type,BS,EQ)                                               \
362:   DEF_Pack(Type,BS,EQ)                                                            \
363:   DEF_Add(Type,BS,EQ)                                                             \
364:   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
365:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
367:   }
368: #endif

370: #define DEF_DumbType(Type,BS,EQ)                                                  \
371:   DEF_Pack(Type,BS,EQ)                                                            \
372:   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
373:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374:   }

376: /* Maxloc, Minloc */
377: #define DEF_PairType(Type,BS,EQ)                                                  \
378:   DEF_Pack(Type,BS,EQ)                                                            \
379:   DEF_Xloc(Type,BS,EQ)                                                            \
380:   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
381:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
382:     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
383:   }

385: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
386: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
387: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
388: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
389: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
390: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
391: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
392: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

394: #if defined(PETSC_USE_64BIT_INDICES) /* If PetscInt is int (i.e., no 64-bit indices), these would duplicate the PetscInt routines above; generating them would be harmless but redundant */
395: DEF_IntegerType(int,1,1)
396: DEF_IntegerType(int,2,1)
397: DEF_IntegerType(int,4,1)
398: DEF_IntegerType(int,8,1)
399: DEF_IntegerType(int,1,0)
400: DEF_IntegerType(int,2,0)
401: DEF_IntegerType(int,4,0)
402: DEF_IntegerType(int,8,0)
403: #endif

405: /* The typedefs provide type names without spaces so that CPPJoin4 can paste them into routine names */
406: typedef signed char SignedChar;
407: DEF_IntegerType(SignedChar,1,1)
408: DEF_IntegerType(SignedChar,2,1)
409: DEF_IntegerType(SignedChar,4,1)
410: DEF_IntegerType(SignedChar,8,1)
411: DEF_IntegerType(SignedChar,1,0)
412: DEF_IntegerType(SignedChar,2,0)
413: DEF_IntegerType(SignedChar,4,0)
414: DEF_IntegerType(SignedChar,8,0)

416: typedef unsigned char UnsignedChar;
417: DEF_IntegerType(UnsignedChar,1,1)
418: DEF_IntegerType(UnsignedChar,2,1)
419: DEF_IntegerType(UnsignedChar,4,1)
420: DEF_IntegerType(UnsignedChar,8,1)
421: DEF_IntegerType(UnsignedChar,1,0)
422: DEF_IntegerType(UnsignedChar,2,0)
423: DEF_IntegerType(UnsignedChar,4,0)
424: DEF_IntegerType(UnsignedChar,8,0)

426: DEF_RealType(PetscReal,1,1)
427: DEF_RealType(PetscReal,2,1)
428: DEF_RealType(PetscReal,4,1)
429: DEF_RealType(PetscReal,8,1)
430: DEF_RealType(PetscReal,1,0)
431: DEF_RealType(PetscReal,2,0)
432: DEF_RealType(PetscReal,4,0)
433: DEF_RealType(PetscReal,8,0)

435: #if defined(PETSC_HAVE_COMPLEX)
436: DEF_ComplexType(PetscComplex,1,1)
437: DEF_ComplexType(PetscComplex,2,1)
438: DEF_ComplexType(PetscComplex,4,1)
439: DEF_ComplexType(PetscComplex,8,1)
440: DEF_ComplexType(PetscComplex,1,0)
441: DEF_ComplexType(PetscComplex,2,0)
442: DEF_ComplexType(PetscComplex,4,0)
443: DEF_ComplexType(PetscComplex,8,0)
444: #endif

446: #define PairType(Type1,Type2) Type1##_##Type2
447: typedef struct {int u; int i;}           PairType(int,int);
448: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
449: DEF_PairType(PairType(int,int),1,1)
450: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
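/* The {u,i} pairs mirror the value/index layout MPI requires for MPI_MAXLOC/MPI_MINLOC on MPI_2INT and MPIU_2INT units. */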

452: /* If we don't know the basic type, we treat it as a stream of chars or ints */
453: DEF_DumbType(char,1,1)
454: DEF_DumbType(char,2,1)
455: DEF_DumbType(char,4,1)
456: DEF_DumbType(char,1,0)
457: DEF_DumbType(char,2,0)
458: DEF_DumbType(char,4,0)

460: typedef int DumbInt; /* A name distinct from 'int' (used above), since the type name becomes part of the generated routine names. */
461: DEF_DumbType(DumbInt,1,1)
462: DEF_DumbType(DumbInt,2,1)
463: DEF_DumbType(DumbInt,4,1)
464: DEF_DumbType(DumbInt,8,1)
465: DEF_DumbType(DumbInt,1,0)
466: DEF_DumbType(DumbInt,2,0)
467: DEF_DumbType(DumbInt,4,0)
468: DEF_DumbType(DumbInt,8,0)

470: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
471: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
472: {
473:   int ierr;
474:   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
475:   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
476:   return MPI_SUCCESS;
477: }
478: #endif

480: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
481: {
482:   PetscErrorCode    ierr;
483:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
484:   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

487:   /* Destroy device-specific fields */
488:   if (link->deviceinited) {(*link->Destroy)(sf,link);}

490:   /* Destroy host related fields */
491:   if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
492:   if (!link->use_nvshmem) {
493:     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
494:       if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
495:     }
496:     PetscFree(link->reqs);
497:     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
498:       PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
499:       PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
500:     }
501:   }
502:   PetscFree(link);
503:   return(0);
504: }

506: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
507: {
508:   PetscErrorCode    ierr;

511:   PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
512:  #if defined(PETSC_HAVE_NVSHMEM)
513:   {
514:     PetscBool use_nvshmem;
515:     PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);
516:     if (use_nvshmem) {
517:       PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
518:       return(0);
519:     }
520:   }
521:  #endif
522:   PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
523:   return(0);
524: }

526: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
527:    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
528: */
529: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
530: {
531:   PetscErrorCode       ierr;
532:   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
533:   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
534:   const PetscInt       *rootoffset,*leafoffset;
535:   PetscMPIInt          n;
536:   MPI_Aint             disp;
537:   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
538:   MPI_Datatype         unit = link->unit;
539:   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
540:   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

543:   /* Initialize persistent MPI requests if not done yet. Currently only SFBasic uses persistent requests */
544:   if (sf->persistent) {
545:     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
546:       PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
547:       if (direction == PETSCSF_LEAF2ROOT) {
548:         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
549:           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
550:           PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
551:           MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
552:         }
553:       } else { /* PETSCSF_ROOT2LEAF */
554:         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
555:           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
556:           PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
557:           MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
558:         }
559:       }
560:       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
561:     }

563:     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
564:       PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
565:       if (direction == PETSCSF_LEAF2ROOT) {
566:         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
567:           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
568:           PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
569:           MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
570:         }
571:       } else { /* PETSCSF_ROOT2LEAF */
572:         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
573:           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
574:           PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
575:           MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
576:         }
577:       }
578:       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
579:     }
580:   }
581:   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
582:   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
583:   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
584:   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
585:   return(0);
586: }
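/* Typical use (a sketch): a communication-start routine obtains the buffers and requests for one direction, e.g.,
     PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
   packs rootdata into rootbuf, and then starts the persistent requests (e.g., with MPI_Startall()). */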


589: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
590: {
591:   PetscErrorCode    ierr;
592:   PetscSFLink       link,*p;
593:   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;

596:   /* Look for a link in the in-use cache matching the data type and the root/leaf data pointers */
597:   for (p=&bas->inuse; (link=*p); p=&link->next) {
598:     PetscBool match;
599:     MPIPetsc_Type_compare(unit,link->unit,&match);
600:     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
601:       switch (cmode) {
602:       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
603:       case PETSC_USE_POINTER: break;
604:       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
605:       }
606:       *mylink = link;
607:       return(0);
608:     }
609:   }
610:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
611: }

613: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
614: {
615:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
616:   PetscSFLink       link = *mylink;

619:   link->rootdata = NULL;
620:   link->leafdata = NULL;
621:   link->next     = bas->avail;
622:   bas->avail     = link;
623:   *mylink        = NULL;
624:   return(0);
625: }

627: /* Error out on unsupported overlapped communications */
628: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
629: {
630:   PetscErrorCode    ierr;
631:   PetscSFLink       link,*p;
632:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
633:   PetscBool         match;

636:   if (PetscDefined(USE_DEBUG)) {
637:     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
638:        the potential overlap, since this process does not participate in the communication and overlapping is then harmless.
639:     */
640:     if (rootdata || leafdata) {
641:       for (p=&bas->inuse; (link=*p); p=&link->next) {
642:         MPIPetsc_Type_compare(unit,link->unit,&match);
643:         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
644:       }
645:     }
646:   }
647:   return(0);
648: }

650: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
651: {
653:   if (n) {PetscMemcpy(dst,src,n);}
654:   return(0);
655: }


658: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
659: {
661:   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
662:   PetscBool      is2Int,is2PetscInt;
663:   PetscMPIInt    ni,na,nd,combiner;
664: #if defined(PETSC_HAVE_COMPLEX)
665:   PetscInt       nPetscComplex=0;
666: #endif

669:   MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);
670:   MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
671:   /* MPI_CHAR is treated below as a dumb type that does not support reduction, per the MPI standard */
672:   MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);
673:   MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
674:   MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
675: #if defined(PETSC_HAVE_COMPLEX)
676:   MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
677: #endif
678:   MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
679:   MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
680:   /* TODO: shall we also handle Fortran MPI_2REAL? */
681:   MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
682:   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
683:   link->bs = 1; /* default */
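  /* Below, pick the specialization with the largest block size BS in {8,4,2,1} that divides the number of
     basic units n in the unit; when n == BS, the EQ=1 variant is used so that bs becomes a compile-time constant. */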

685:   if (is2Int) {
686:     PackInit_PairType_int_int_1_1(link);
687:     link->bs        = 1;
688:     link->unitbytes = 2*sizeof(int);
689:     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
690:     link->basicunit = MPI_2INT;
691:     link->unit      = MPI_2INT;
692:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
693:     PackInit_PairType_PetscInt_PetscInt_1_1(link);
694:     link->bs        = 1;
695:     link->unitbytes = 2*sizeof(PetscInt);
696:     link->basicunit = MPIU_2INT;
697:     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
698:     link->unit      = MPIU_2INT;
699:   } else if (nPetscReal) {
700:     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
701:     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
702:     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
703:     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
704:     link->bs        = nPetscReal;
705:     link->unitbytes = nPetscReal*sizeof(PetscReal);
706:     link->basicunit = MPIU_REAL;
707:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
708:   } else if (nPetscInt) {
709:     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
710:     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
711:     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
712:     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
713:     link->bs        = nPetscInt;
714:     link->unitbytes = nPetscInt*sizeof(PetscInt);
715:     link->basicunit = MPIU_INT;
716:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
717: #if defined(PETSC_USE_64BIT_INDICES)
718:   } else if (nInt) {
719:     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
720:     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
721:     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
722:     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
723:     link->bs        = nInt;
724:     link->unitbytes = nInt*sizeof(int);
725:     link->basicunit = MPI_INT;
726:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
727: #endif
728:   } else if (nSignedChar) {
729:     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
730:     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
731:     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
732:     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
733:     link->bs        = nSignedChar;
734:     link->unitbytes = nSignedChar*sizeof(SignedChar);
735:     link->basicunit = MPI_SIGNED_CHAR;
736:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
737:   }  else if (nUnsignedChar) {
738:     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
739:     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
740:     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
741:     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
742:     link->bs        = nUnsignedChar;
743:     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
744:     link->basicunit = MPI_UNSIGNED_CHAR;
745:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
746: #if defined(PETSC_HAVE_COMPLEX)
747:   } else if (nPetscComplex) {
748:     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
749:     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
750:     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
751:     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
752:     link->bs        = nPetscComplex;
753:     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
754:     link->basicunit = MPIU_COMPLEX;
755:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
756: #endif
757:   } else {
758:     MPI_Aint lb,nbyte;
759:     MPI_Type_get_extent(unit,&lb,&nbyte);
760:     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
761:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
762:       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
763:       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
764:       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
765:       link->bs        = nbyte;
766:       link->unitbytes = nbyte;
767:       link->basicunit = MPI_BYTE;
768:     } else {
769:       nInt = nbyte / sizeof(int);
770:       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
771:       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
772:       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
773:       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
774:       link->bs        = nInt;
775:       link->unitbytes = nbyte;
776:       link->basicunit = MPI_INT;
777:     }
778:     if (link->isbuiltin) link->unit = unit;
779:   }

781:   if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}

783:   link->Memcpy = PetscSFLinkMemcpy_Host;
784:   return(0);
785: }

787: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
788: {
790:   *UnpackAndOp = NULL;
791:   if (PetscMemTypeHost(mtype)) {
792:     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
793:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
794:     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
795:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
796:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
797:     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
798:     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
799:     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
800:     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
801:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
802:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
803:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
804:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
805:   }
806: #if defined(PETSC_HAVE_DEVICE)
807:   else if (PetscMemTypeDevice(mtype) && !atomic) {
808:     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
809:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
810:     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
811:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
812:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
813:     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
814:     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
815:     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
816:     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
817:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
818:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
819:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
820:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
821:   } else if (PetscMemTypeDevice(mtype) && atomic) {
822:     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
823:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
824:     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
825:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
826:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
827:     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
828:     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
829:     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
830:     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
831:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
832:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
833:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
834:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
835:   }
836: #endif
837:   return(0);
838: }

840: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
841: {
843:   *ScatterAndOp = NULL;
844:   if (PetscMemTypeHost(mtype)) {
845:     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
846:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
847:     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
848:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
849:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
850:     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
851:     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
852:     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
853:     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
854:     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
855:     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
856:     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
857:     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
858:   }
859: #if defined(PETSC_HAVE_DEVICE)
860:   else if (PetscMemTypeDevice(mtype) && !atomic) {
861:     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
862:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
863:     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
864:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
865:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
866:     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
867:     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
868:     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
869:     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
870:     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
871:     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
872:     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
873:     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
874:   } else if (PetscMemTypeDevice(mtype) && atomic) {
875:     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
876:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
877:     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
878:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
879:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
880:     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
881:     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
882:     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
883:     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
884:     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
885:     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
886:     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
887:     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
888:   }
889: #endif
890:   return(0);
891: }
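/* Editor's note: a minimal sketch (hypothetical names, not part of PETSc) of what a ScatterAndOp
   routine selected above does for MPI_SUM on doubles: entries are read from src through srcIdx[]
   (or contiguously from srcStart when srcIdx is NULL) and reduced directly into dst through
   dstIdx[], with no packed buffer in between. */
PETSC_STATIC_INLINE void Sketch_ScatterAndAdd(PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const double *src,PetscInt dstStart,const PetscInt *dstIdx,double *dst)
{
  PetscInt i,s,d;
  for (i=0; i<count; i++) {
    s       = srcIdx ? srcIdx[i] : srcStart + i;
    d       = dstIdx ? dstIdx[i] : dstStart + i;
    dst[d] += src[s];
  }
}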

893: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
894: {
896:   *FetchAndOp = NULL;
897:   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"FetchAndOp only supports MPI_SUM/MPIU_SUM");
898:   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
899: #if defined(PETSC_HAVE_DEVICE)
900:   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
901:   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
902: #endif
903:   return(0);
904: }
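/* Editor's note: an illustrative sketch (hypothetical names, double-valued units) of the
   FetchAndAdd semantics selected above: the addressed root entries are incremented by the buffer
   entries, while the buffer receives the old root values, which is what ultimately gets returned
   to the leaves by PetscSFFetchAndOp. */
PETSC_STATIC_INLINE void Sketch_FetchAndAdd(PetscInt count,const PetscInt *idx,double *rootdata,double *buf)
{
  PetscInt i;
  double   old;
  for (i=0; i<count; i++) {
    old               = rootdata[idx[i]];
    rootdata[idx[i]] += buf[i];
    buf[i]            = old;
  }
}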

906: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
907: {
909:   *FetchAndOpLocal = NULL;
910:   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"FetchAndOp only supports MPI_SUM/MPIU_SUM");
911:   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
912: #if defined(PETSC_HAVE_DEVICE)
913:   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
914:   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
915: #endif
916:   return(0);
917: }
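/* Editor's note: a companion sketch (hypothetical names) for the *Local variant selected above.
   Compared to the FetchAndAdd sketch earlier, it skips the packed buffer entirely: the addends are
   read straight from leafdata and the old root values are returned in leafupdate, both addressed
   through leaf indices. */
PETSC_STATIC_INLINE void Sketch_FetchAndAddLocal(PetscInt count,PetscInt rootStart,const PetscInt *rootIdx,double *rootdata,PetscInt leafStart,const PetscInt *leafIdx,const double *leafdata,double *leafupdate)
{
  PetscInt i,r,l;
  for (i=0; i<count; i++) {
    r             = rootIdx ? rootIdx[i] : rootStart + i;
    l             = leafIdx ? leafIdx[i] : leafStart + i;
    leafupdate[l] = rootdata[r];
    rootdata[r]  += leafdata[l];
  }
}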

919: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
920: {
922:   PetscLogDouble flops;
923:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

926:   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
927:     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
928: #if defined(PETSC_HAVE_DEVICE)
929:     if (PetscMemTypeDevice(link->rootmtype)) {PetscLogGpuFlops(flops);} else
930: #endif
931:     {PetscLogFlops(flops);}
932:   }
933:   return(0);
934: }

936: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
937: {
938:   PetscLogDouble flops;

942:   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
943:     flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
944: #if defined(PETSC_HAVE_DEVICE)
945:     if (PetscMemTypeDevice(link->leafmtype)) {PetscLogGpuFlops(flops);} else
946: #endif
947:     {PetscLogFlops(flops);}
948:   }
949:   return(0);
950: }

952: /* When SF cannot find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local.
953:   Input Arguments:
954:   +sf      - The StarForest
955:   .link    - The link
956:   .count   - Number of entries to unpack
957:   .start   - The first index; significant only when indices=NULL
958:   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
959:   .buf     - A contiguous buffer to unpack from
960:   -op      - Operation after unpack

962:   Output Arguments:
963:   .data    - The data to unpack to
964: */
965: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
966: {
968: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
969:   {
971:     PetscInt       i;
972:     PetscMPIInt    n;
973:     if (indices) {
974:       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
975:          element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
976:       */
977:       for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
978:     } else {
979:       PetscMPIIntCast(count,&n);
980:       MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
981:     }
982:   }
983:   return(0);
984: #else
985:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
986: #endif
987: }
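/* Editor's note: a hedged sketch of a situation that reaches the MPI_Reduce_local fallback above.
   With a user-defined MPI_Op there is no native UnpackAndOp kernel in the dispatch tables, so each
   received unit is combined into its destination with MPI_Reduce_local. Nothing below is part of
   sfpack.c; the names and the call sequence are hypothetical. */
static void Sketch_PointwiseMax3(void *in,void *inout,int *len,MPI_Datatype *dtype)
{
  const double *a = (const double*)in;
  double       *b = (double*)inout;
  int          i;
  for (i=0; i<3*(*len); i++) b[i] = PetscMax(a[i],b[i]); /* unit assumed to be a contiguous block of 3 doubles */
}
/* Usage sketch (assumed call sequence):
     MPI_Op       maxop;
     MPI_Datatype unit;
     MPI_Type_contiguous(3,MPI_DOUBLE,&unit); MPI_Type_commit(&unit);
     MPI_Op_create(Sketch_PointwiseMax3,1,&maxop);
     PetscSFReduceBegin(sf,unit,leafdata,rootdata,maxop);   <- would end up unpacking via MPI_Reduce_local
     PetscSFReduceEnd(sf,unit,leafdata,rootdata,maxop);
     MPI_Op_free(&maxop); MPI_Type_free(&unit);
*/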

989: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
990: {
992: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
993:   {
995:     PetscInt       i,disp;
996:     if (!srcIdx) {
997:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
998:     } else {
999:       for (i=0; i<count; i++) {
1000:         disp = dstIdx? dstIdx[i] : dstStart + i;
1001:         MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
1002:       }
1003:     }
1004:   }
1005:   return(0);
1006: #else
1007:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1008: #endif
1009: }

1011: /*=============================================================================
1012:               Pack/Unpack/Fetch/Scatter routines
1013:  ============================================================================*/

1015: /* Pack rootdata to rootbuf
1016:   Input Arguments:
1017:   + sf       - The SF this packing works on.
1018:   . link     - It gives the memtype of the roots and also provides the root buffer.
1019:   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1020:   - rootdata - Where to read the roots.

1022:   Notes:
1023:   When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
1024:   in a buffer that the underlying MPI can access (whether use_gpu_aware_mpi is set or not).
1025:  */
1026: PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1027: {
1028:   PetscErrorCode   ierr;
1029:   const PetscInt   *rootindices = NULL;
1030:   PetscInt         count,start;
1031:   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1032:   PetscMemType     rootmtype = link->rootmtype;
1033:   PetscSFPackOpt   opt = NULL;

1036:   PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1037:   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1038:     PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1039:     PetscSFLinkGetPack(link,rootmtype,&Pack);
1040:     (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1041:   }
1042:   PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1043:   return(0);
1044: }

1046: /* Pack leafdata to leafbuf */
1047: PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1048: {
1049:   PetscErrorCode   ierr;
1050:   const PetscInt   *leafindices = NULL;
1051:   PetscInt         count,start;
1052:   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1053:   PetscMemType     leafmtype = link->leafmtype;
1054:   PetscSFPackOpt   opt = NULL;

1057:   PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1058:   if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1059:     PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1060:     PetscSFLinkGetPack(link,leafmtype,&Pack);
1061:     (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1062:   }
1063:   PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1064:   return(0);
1065: }

1067: /* Pack rootdata to rootbuf, which are in the same memory space */
1068: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1069: {
1070:   PetscErrorCode   ierr;
1071:   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;

1074:   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on PETSc's default stream */
1075:     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1076:     if (link->PrePack) {(*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);} /* Used by SF nvshmem */
1077:   }
1078:   PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1079:   if (bas->rootbuflen[scope]) {PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);}
1080:   PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1081:   return(0);
1082: }
1083: /* Pack leafdata to leafbuf, which are in the same memory space */
1084: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1085: {
1086:   PetscErrorCode   ierr;

1089:   if (scope == PETSCSF_REMOTE) {
1090:     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1091:     if (link->PrePack) {(*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);}  /* Used by SF nvshmem */
1092:   }
1093:   PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1094:   if (sf->leafbuflen[scope]) {PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);}
1095:   PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1096:   return(0);
1097: }

1099: PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1100: {
1101:   PetscErrorCode   ierr;
1102:   const PetscInt   *rootindices = NULL;
1103:   PetscInt         count,start;
1104:   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1105:   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1106:   PetscMemType     rootmtype = link->rootmtype;
1107:   PetscSFPackOpt   opt = NULL;

1110:   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1111:     PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1112:     if (UnpackAndOp) {
1113:       PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1114:       (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1115:     } else {
1116:       PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1117:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1118:     }
1119:   }
1120:   PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1121:   return(0);
1122: }

1124: PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1125: {
1126:   PetscErrorCode   ierr;
1127:   const PetscInt   *leafindices = NULL;
1128:   PetscInt         count,start;
1129:   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1130:   PetscMemType     leafmtype = link->leafmtype;
1131:   PetscSFPackOpt   opt = NULL;

1134:   if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1135:     PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1136:     if (UnpackAndOp) {
1137:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1138:       (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1139:     } else {
1140:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1141:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1142:     }
1143:   }
1144:   PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1145:   return(0);
1146: }
1147: /* Unpack rootbuf to rootdata, which are in the same memory space */
1148: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1149: {
1150:   PetscErrorCode   ierr;
1151:   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;

1154:   PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1155:   if (bas->rootbuflen[scope]) {PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);}
1156:   PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1157:   if (scope == PETSCSF_REMOTE) {
1158:     if (link->PostUnpack) {(*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);}  /* Used by SF nvshmem */
1159:     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1160:   }
1161:   return(0);
1162: }

1164: /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1165: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1166: {
1167:   PetscErrorCode   ierr;

1170:   PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1171:   if (sf->leafbuflen[scope]) {PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);}
1172:   PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1173:   if (scope == PETSCSF_REMOTE) {
1174:     if (link->PostUnpack) {(*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);}  /* Used by SF nvshmem */
1175:     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1176:   }
1177:   return(0);
1178: }

1180: /* FetchAndOp rootdata with rootbuf; it is a kind of unpack on rootdata, except that it also updates rootbuf */
1181: PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1182: {
1183:   PetscErrorCode     ierr;
1184:   const PetscInt     *rootindices = NULL;
1185:   PetscInt           count,start;
1186:   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1187:   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1188:   PetscMemType       rootmtype = link->rootmtype;
1189:   PetscSFPackOpt     opt = NULL;

1192:   PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1193:   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1194:     /* Do FetchAndOp on rootdata with rootbuf */
1195:     PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);
1196:     PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);
1197:     (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);
1198:   }
1199:   PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);
1200:   PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1201:   return(0);
1202: }

1204: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1205: {
1206:   PetscErrorCode       ierr;
1207:   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1208:   PetscInt             count,rootstart,leafstart;
1209:   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1210:   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1211:   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1212:   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1213:   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
1214:   char                 *srcbuf = NULL,*dstbuf = NULL;
1215:   PetscBool            dstdups;

1217:   if (!buflen) return(0);
1218:   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1219:     if (direction == PETSCSF_ROOT2LEAF) {
1220:       PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1221:       srcmtype = rootmtype;
1222:       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1223:       dstmtype = leafmtype;
1224:       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1225:     } else {
1226:       PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1227:       srcmtype = leafmtype;
1228:       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1229:       dstmtype = rootmtype;
1230:       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1231:     }
1232:     (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);
1233:     /* If the above is a device-to-host copy, we must sync the stream before accessing the buffer on the host */
1234:     if (PetscMemTypeHost(dstmtype)) {(*link->SyncStream)(link);}
1235:     if (direction == PETSCSF_ROOT2LEAF) {
1236:       PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1237:     } else {
1238:       PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1239:     }
1240:   } else {
1241:     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1242:     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1243:     PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);
1244:     if (ScatterAndOp) {
1245:       PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1246:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1247:       if (direction == PETSCSF_ROOT2LEAF) {
1248:         (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1249:       } else {
1250:         (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1251:       }
1252:     } else {
1253:       PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1254:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1255:       if (direction == PETSCSF_ROOT2LEAF) {
1256:         PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1257:       } else {
1258:         PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1259:       }
1260:     }
1261:   }
1262:   return(0);
1263: }
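/* Editor's note: a hedged CUDA-only illustration (hypothetical helper, not part of PETSc) of why
   the stream is synchronized in the cross-memory-space branch above before a host-side unpack: an
   asynchronous device-to-host copy has merely been enqueued when cudaMemcpyAsync() returns, so the
   host buffer is not safe to read until the stream has been synchronized. */
#if defined(PETSC_HAVE_CUDA)
PETSC_STATIC_INLINE void Sketch_D2HCopyThenHostRead(double *hostbuf,const double *devbuf,size_t n,cudaStream_t stream)
{
  cudaMemcpyAsync(hostbuf,devbuf,n*sizeof(double),cudaMemcpyDeviceToHost,stream); /* enqueued on the stream, not yet complete */
  cudaStreamSynchronize(stream);                                                  /* after this, hostbuf is valid on the CPU */
}
#endif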

1265: /* Fetch rootdata to leafdata and leafupdate locally */
1266: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1267: {
1268:   PetscErrorCode       ierr;
1269:   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1270:   PetscInt             count,rootstart,leafstart;
1271:   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1272:   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1273:   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1274:   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

1277:   if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1278:   if (rootmtype != leafmtype) {
1279:     /* The local communication has to go through pack and unpack */
1280:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on different sides (one on CPU, the other on GPU)");
1281:   } else {
1282:     PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1283:     PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1284:     PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1285:     (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1286:   }
1287:   return(0);
1288: }

1290: /*
1291:   Create per-rank pack/unpack optimizations based on index patterns

1293:    Input Parameters:
1294:   +  n       - Number of destination ranks
1295:   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1296:   -  idx     - [*]   Array storing indices

1298:    Output Parameters:
1299:   .  out     - Pack optimizations; set to NULL if the indices do not match a recognizable pattern.
1300: */
1301: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1302: {
1304:   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1305:   PetscBool      optimizable = PETSC_TRUE;
1306:   PetscSFPackOpt opt;

1309:   PetscMalloc1(1,&opt);
1310:   PetscMalloc1(7*n+2,&opt->array);
1311:   opt->n      = opt->array[0] = n;
1312:   opt->offset = opt->array + 1;
1313:   opt->start  = opt->array + n   + 2;
1314:   opt->dx     = opt->array + 2*n + 2;
1315:   opt->dy     = opt->array + 3*n + 2;
1316:   opt->dz     = opt->array + 4*n + 2;
1317:   opt->X      = opt->array + 5*n + 2;
1318:   opt->Y      = opt->array + 6*n + 2;

1320:   for (r=0; r<n; r++) { /* For each destination rank */
1321:     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1322:     p     = offset[r];
1323:     start = idx[p]; /* First index for this rank */
1324:     p++;

1326:     /* Search in X dimension */
1327:     for (dx=1; dx<m; dx++,p++) {
1328:       if (start+dx != idx[p]) break;
1329:     }

1331:     dydz = m/dx;
1332:     X    = dydz > 1 ? (idx[p]-start) : dx;
1333:     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1334:     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1335:     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1336:       for (i=0; i<dx; i++,p++) {
1337:         if (start+X*dy+i != idx[p]) {
1338:           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1339:           else goto Z_dimension;
1340:         }
1341:       }
1342:     }

1344: Z_dimension:
1345:     dz = m/(dx*dy);
1346:     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1347:     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1348:     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1349:     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1350:       for (j=0; j<dy; j++) {
1351:         for (i=0; i<dx; i++,p++) {
1352:           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1353:         }
1354:       }
1355:     }
1356:     opt->start[r] = start;
1357:     opt->dx[r]    = dx;
1358:     opt->dy[r]    = dy;
1359:     opt->dz[r]    = dz;
1360:     opt->X[r]     = X;
1361:     opt->Y[r]     = Y;
1362:   }

1364: finish:
1365:   /* If not optimizable, free arrays to save memory */
1366:   if (!n || !optimizable) {
1367:     PetscFree(opt->array);
1368:     PetscFree(opt);
1369:     *out = NULL;
1370:   } else {
1371:     opt->offset[0] = 0;
1372:     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1373:     *out = opt;
1374:   }
1375:   return(0);
1376: }
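/* Editor's note: a worked example (not from the source) of an index pattern the analysis above
   recognizes. Packing the dx*dy*dz = 3*3*2 sub-block that starts at entry 100 of an array with
   row length X = 10 and slab size X*Y = 10*20 visits the indices
     100,101,102, 110,111,112, 120,121,122,  300,301,302, 310,311,312, 320,321,322
   so the rank is recorded with start=100, dx=3, dy=3, dz=2, X=10, Y=20, and a packing kernel can
   regenerate the indices from three nested loops instead of reading idx[], as sketched below with
   a hypothetical helper. */
PETSC_STATIC_INLINE void Sketch_SubblockIndices(PetscInt start,PetscInt dx,PetscInt dy,PetscInt dz,PetscInt X,PetscInt Y,PetscInt *idx)
{
  PetscInt i,j,k,p = 0;
  for (k=0; k<dz; k++)
    for (j=0; j<dy; j++)
      for (i=0; i<dx; i++) idx[p++] = start + X*Y*k + X*j + i;
}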

1378: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1379: {
1381:   PetscSFPackOpt opt = *out;

1384:   if (opt) {
1385:     PetscSFFree(sf,mtype,opt->array);
1386:     PetscFree(opt);
1387:     *out = NULL;
1388:   }
1389:   return(0);
1390: }

1392: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1393: {
1395:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1396:   PetscInt       i,j;

1399:   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1400:   for (i=0; i<2; i++) { /* Set defaults */
1401:     sf->leafstart[i]   = 0;
1402:     sf->leafcontig[i]  = PETSC_TRUE;
1403:     sf->leafdups[i]    = PETSC_FALSE;
1404:     bas->rootstart[i]  = 0;
1405:     bas->rootcontig[i] = PETSC_TRUE;
1406:     bas->rootdups[i]   = PETSC_FALSE;
1407:   }

1409:   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1410:   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

1412:   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1413:   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

1415:   /* Are the leaf indices for self and remote contiguous? If so, pack/unpack reduces to plain memory copies */
1416:   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1417:     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1418:   }
1419:   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1420:     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1421:   }

1423:   /* If not, see if we can have per-rank optimizations by doing index analysis */
1424:   if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);}
1425:   if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}

1427:   /* Are root indices for self and remote contiguous? */
1428:   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1429:   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

1431:   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1432:   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

1434:   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1435:     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1436:   }
1437:   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1438:     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1439:   }

1441:   if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);}
1442:   if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}

1444:  #if defined(PETSC_HAVE_DEVICE)
1445:     /* Check for duplicate indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when there is no risk of data races */
1446:   if (PetscDefined(HAVE_DEVICE)) {
1447:     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1452:   }
1453: #endif
1454:   return(0);
1455: }

1457: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1458: {
1460:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1461:   PetscInt       i;

1464:   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1465:     PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1466:     PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1467:    #if defined(PETSC_HAVE_DEVICE)
1468:     PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1469:     PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1470:    #endif
1471:   }
1472:   return(0);
1473: }