Actual source code: xmllogevent.c

  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8: #include <petsclog.h>
  9: #include <petsc/private/logimpl.h>
 10: #include <petsctime.h>
 11: #include <petscviewer.h>
 12: #include "../src/sys/logging/xmlviewer.h"

 14: #if defined(PETSC_USE_LOG)

 16: /*
 17:  * Support for nested PetscTimers
 18:  *
 19:  * PetscTimers keep track of a lot of useful information: Wall clock times,
 20:  * message passing statistics, flop counts.  Information about the nested structure
 21:  * of the timers is lost. Example:
 22:  *
 23:  * 7:30   Start: awake
 24:  * 7:30      Start: morning routine
 25:  * 7:40         Start: eat
 26:  * 7:49         Done:  eat
 27:  * 7:43      Done:  morning routine
 28:  * 8:15      Start: work
 29:  * 12:15        Start: eat
 30:  * 12:45        Done:  eat
 31:  * 16:00     Done:  work
 32:  * 16:30     Start: evening routine
 33:  * 18:30        Start: eat
 34:  * 19:15        Done:  eat
 35:  * 22:00     Done:  evening routine
 36:  * 22:00  Done:  awake
 37:  *
 38:  * Petsc timers provide the following timer results:
 39:  *
 40:  *    awake:              1 call    14:30 hours
 41:  *    morning routine:    1 call     0:13 hours
 42:  *    eat:                3 calls    1:24 hours
 43:  *    work:               1 call     7:45 hours
 44:  *    evening routine     1 call     5:30 hours
 45:  *
 46:  * Nested timers can be used to get the following table:
 47:  *
 48:  *   [1 call]: awake                14:30 hours
 49:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 50:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 51:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 52:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 53:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 54:  *                   rest (work)               7:15 hours         (94 % of work)
 55:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 56:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 57:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 58:  *
 59:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
 60:  * the nested timers make the stages unnecessary.
 61:  *
 62:  */

 64: /*
 65:  * Data structures for keeping track of nested timers:
 66:  *
 67:  *   nestedEvents: information about the timers that have actually been activated
 68:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 69:  *
 70:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to
 71:  * (one or more) default timers, where one of the default timers has the same event-id as the
 72:  * nested one.
 73:  *
 74:  * Because of the risk of confusion between nested timer ids and default timer ids, we
 75:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
 76:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 77:  * default timers with 'dft'.
 78:  */

 80: #define DFT_ID_AWAKE -1

 82: typedef PetscLogEvent NestedEventId;
 83: typedef struct {
 84:   NestedEventId   nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 85:   int             nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 86:   PetscLogEvent  *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 87:   PetscLogEvent  *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 89:   PetscLogEvent  *dftParents;       /* The default timers which were the dftParentActive when this nested event was started */
 90:   PetscLogEvent  *dftEventsSorted;  /* The default timers which represent the different 'instances' of this nested event */
 91: } PetscNestedEvent;

 93: static PetscLogEvent    dftParentActive        = DFT_ID_AWAKE;
 94: static int              nNestedEvents          = 0;
 95: static int              nNestedEventsAllocated = 0;
 96: static PetscNestedEvent *nestedEvents          = NULL;
 97: static PetscLogDouble   thresholdTime          = 0.01; /* initial value was 0.1 */

 99: #define THRESHOLD (thresholdTime/100.0+1e-12)

101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);


107: /*@C
108:   PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
109:   rates and object creation and should not slow programs down too much.

111:   Logically Collective over PETSC_COMM_WORLD

113:   Options Database Keys:
114: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

116:   Usage:
117: .vb
118:       PetscInitialize(...);
119:       PetscLogNestedBegin();
120:        ... code ...
121:       PetscLogView(viewer);
122:       PetscFinalize();
123: .ve

125:   Level: advanced

127: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
128: @*/
129: PetscErrorCode PetscLogNestedBegin(void)
130: {

134:   if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");

136:   nNestedEventsAllocated = 10;
137:   PetscMalloc1(nNestedEventsAllocated,&nestedEvents);
138:   dftParentActive = DFT_ID_AWAKE;
139:   nNestedEvents =1;

141:   /* 'Awake' is nested event 0. It has no parents */
142:   nestedEvents[0].nstEvent          = 0;
143:   nestedEvents[0].nParents          = 0;
144:   nestedEvents[0].dftParentsSorted  = NULL;
145:   nestedEvents[0].dftEvents         = NULL;
146:   nestedEvents[0].dftParents        = NULL;
147:   nestedEvents[0].dftEventsSorted   = NULL;

149:   PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
150:   return(0);
151: }

153: /* Delete the data structures for the nested timers */
154: PetscErrorCode PetscLogNestedEnd(void)
155: {
157:   int            i;

160:   if (!nestedEvents) return(0);
161:   for (i=0; i<nNestedEvents; i++) {
162:     PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
163:   }
164:   PetscFree(nestedEvents);
165:   nestedEvents           = NULL;
166:   nNestedEvents          = 0;
167:   nNestedEventsAllocated = 0;
168:   return(0);
169: }


172: /*
173:  UTILITIES: FIND STUFF IN SORTED ARRAYS

175:     dftIndex - index to be found
176:     dftArray - sorted array of PetscLogEvent-ids
177:     narray - dimension of dftArray
178:     entry - entry in the array where dftIndex may be found;

180:      if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
181:      In that case, the dftIndex can be inserted at this entry.
182: */
183: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,const PetscLogEvent *dftArray,int narray,int *entry)
184: {
186:   if (narray==0 || dftIndex <= dftArray[0]) {
187:     *entry = 0;
188:   } else if (dftIndex > dftArray[narray-1]) {
189:     *entry = narray;
190:   } else {
191:     int ihigh = narray-1, ilow=0;
192:     while (ihigh>ilow) {
193:       const int imiddle = (ihigh+ilow)/2;
194:       if (dftArray[imiddle] > dftIndex) {
195:         ihigh = imiddle;
196:       } else if (dftArray[imiddle]<dftIndex) {
197:         ilow = imiddle+1;
198:       } else {
199:         ihigh = imiddle;
200:         ilow  = imiddle;
201:       }
202:     }
203:     *entry = ihigh;
204:   }
205:   return(0);
206: }

208: /*
209:     Utility: find the nested event with given identification

211:     nstEvent - Nested event to be found
212:     entry - entry in the nestedEvents where nstEvent may be found;

214:     if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
215: */
216: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent,int *entry)
217: {
219:   if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
220:     *entry = 0;
221:   } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
222:     *entry = nNestedEvents;
223:   } else {
224:     int ihigh = nNestedEvents-1,  ilow = 0;
225:     while (ihigh>ilow) {
226:       const int imiddle = (ihigh+ilow)/2;
227:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
228:         ihigh = imiddle;
229:       } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
230:         ilow = imiddle+1;
231:       } else {
232:         ihigh = imiddle;
233:         ilow  = imiddle;
234:       }
235:     }
236:     *entry = ihigh;
237:   }
238:   return(0);
239: }

241: /*
242:  Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
243:  Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
244: */

246: #define MAINSTAGE 0

248: static PetscLogStage savedStage = 0;

250: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageOverride(void)
251: {
252:   PetscStageLog  stageLog = petsc_stageLog;

256:   if (stageLog->curStage == MAINSTAGE) return(0);
257:   savedStage = stageLog->curStage;
258:   stageLog->curStage = MAINSTAGE;
259:   PetscIntStackPush(stageLog->stack, MAINSTAGE);
260:   return(0);
261: }

263: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageRestore(void)
264: {
265:   PetscStageLog  stageLog = petsc_stageLog;

269:   if (savedStage == MAINSTAGE) return(0);
270:   stageLog->curStage = savedStage;
271:   PetscIntStackPop(stageLog->stack, &savedStage);
272:   return(0);
273: }

275: /******************************************************************************************/
276: /* Start a nested event */
277: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
278: {
279:   PetscErrorCode  ierr;
280:   int             entry, pentry, tentry,i;
281:   PetscLogEvent   dftEvent;

284:   PetscLogEventFindNestedTimer(nstEvent, &entry);
285:   if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
286:     /* Nested event doesn't exist yet: create it */

288:     if (nNestedEvents==nNestedEventsAllocated) {
289:       /* Enlarge and re-allocate nestedEvents if needed */
290:       PetscNestedEvent *tmp = nestedEvents;
291:       PetscMalloc1(2*nNestedEvents,&nestedEvents);
292:       nNestedEventsAllocated*=2;
293:       PetscArraycpy(nestedEvents, tmp, nNestedEvents);
294:       PetscFree(tmp);
295:     }

297:     /* Clear space in nestedEvents for new nested event */
298:     nNestedEvents++;
299:     for (i = nNestedEvents-1; i>entry; i--) {
300:       nestedEvents[i] = nestedEvents[i-1];
301:     }

303:     /* Create event in nestedEvents */
304:     nestedEvents[entry].nstEvent = nstEvent;
305:     nestedEvents[entry].nParents=1;
306:     PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);

308:     /* Fill in new event */
309:     pentry = 0;
310:     dftEvent = (PetscLogEvent) nstEvent;

312:     nestedEvents[entry].nstEvent                 = nstEvent;
313:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
314:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
315:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
316:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

318:   } else {
319:     /* Nested event exists: find current dftParentActive among parents */
320:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
321:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
322:     int           nParents          = nestedEvents[entry].nParents;

324:     PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);

326:     if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
327:       /* dftParentActive not in the list: add it to the list */
328:       int           i;
329:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
330:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
331:       char          name[100];

333:       /* Register a new default timer */
334:       sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
335:       PetscLogEventRegister(name, 0, &dftEvent);
336:       PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);

338:       /* Reallocate parents and dftEvents to make space for new parent */
339:       PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
340:       PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
341:       PetscArraycpy(nestedEvents[entry].dftEventsSorted,  dftEventsSorted,  nParents);
342:       PetscArraycpy(nestedEvents[entry].dftParents,       dftParents,       nParents);
343:       PetscArraycpy(nestedEvents[entry].dftEvents,        dftEvents,        nParents);
344:       PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);

346:       dftParents       = nestedEvents[entry].dftParents;
347:       dftEvents        = nestedEvents[entry].dftEvents;
348:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
349:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

351:       nestedEvents[entry].nParents++;
352:       nParents++;

354:       for (i = nParents-1; i>pentry; i--) {
355:         dftParentsSorted[i] = dftParentsSorted[i-1];
356:         dftEvents[i]        = dftEvents[i-1];
357:       }
358:       for (i = nParents-1; i>tentry; i--) {
359:         dftParents[i]      = dftParents[i-1];
360:         dftEventsSorted[i] = dftEventsSorted[i-1];
361:       }

363:       /* Fill in the new default timer */
364:       dftParentsSorted[pentry] = dftParentActive;
365:       dftEvents[pentry]        = dftEvent;
366:       dftParents[tentry]       = dftParentActive;
367:       dftEventsSorted[tentry]  = dftEvent;

369:     } else {
370:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
371:       dftEvent = nestedEvents[entry].dftEvents[pentry];
372:     }
373:   }

375:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
376:   PetscLogStageOverride();
377:   PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
378:   PetscLogStageRestore();
379:   dftParentActive = dftEvent;
380:   return(0);
381: }

383: /* End a nested event */
384: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
385: {
386:   PetscErrorCode  ierr;
387:   int             entry, pentry, nParents;
388:   PetscLogEvent  *dftEventsSorted;

391:   /* Find the nested event */
392:   PetscLogEventFindNestedTimer(nstEvent, &entry);
393:   if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
394:   if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
395:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
396:   nParents        = nestedEvents[entry].nParents;

398:   /* Find the current default timer among the 'dftEvents' of this event */
399:   PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);

401:   if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
402:   if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);

404:   /* Stop the default timer and update the dftParentActive */
405:   PetscLogStageOverride();
406:   PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
407:   PetscLogStageRestore();
408:   dftParentActive = nestedEvents[entry].dftParents[pentry];
409:   return(0);
410: }

412: /*@
413:    PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
414:           that takes 1 or more percent of the time.

416:   Logically Collective over PETSC_COMM_WORLD

418:   Input Parameter:
419: .   newThresh - the threshold to use

421:   Output Parameter:
422: .   oldThresh - the previously set threshold value

424:   Options Database Keys:
425: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

427:   Usage:
428: .vb
429:       PetscInitialize(...);
430:       PetscLogNestedBegin();
431:       PetscLogSetThreshold(0.1,&oldthresh);
432:        ... code ...
433:       PetscLogView(viewer);
434:       PetscFinalize();
435: .ve

437:   Level: advanced

439: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
440:           PetscLogNestedBegin()
441: @*/
442: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
443: {
445:   if (oldThresh) *oldThresh = thresholdTime;
446:   if (newThresh == PETSC_DECIDE)  newThresh = 0.01;
447:   if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
448:   thresholdTime = PetscMax(newThresh, 0.0);
449:   return(0);
450: }

452: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
453: {
454:   PetscErrorCode     ierr;
455:   char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
456:   char               version[256], buildoptions[128] = "";
457:   PetscMPIInt        size;
458:   size_t             len;

461:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
462:   PetscGetArchType(arch,sizeof(arch));
463:   PetscGetHostName(hostname,sizeof(hostname));
464:   PetscGetUserName(username,sizeof(username));
465:   PetscGetProgramName(pname,sizeof(pname));
466:   PetscGetDate(date,sizeof(date));
467:   PetscGetVersion(version,sizeof(version));

469:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
470:   PetscViewerXMLPutString(   viewer, "executable"  , "Executable"   , pname);
471:   PetscViewerXMLPutString(   viewer, "architecture", "Architecture" , arch);
472:   PetscViewerXMLPutString(   viewer, "hostname"    , "Host"         , hostname);
473:   PetscViewerXMLPutInt(      viewer, "nprocesses"  , "Number of processes", size);
474:   PetscViewerXMLPutString(   viewer, "user"        , "Run by user"  , username);
475:   PetscViewerXMLPutString(   viewer, "date"        , "Started at"   , date);
476:   PetscViewerXMLPutString(   viewer, "petscrelease", "Petsc Release", version);

478:   if (PetscDefined(USE_DEBUG)) {
479:     PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
480:   }
481:   if (PetscDefined(USE_COMPLEX)) {
482:     PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
483:   }
484:   if (PetscDefined(USE_REAL_SINGLE)) {
485:     PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
486:   } else if (PetscDefined(USE_REAL___FLOAT128)) {
487:     PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
488:   } else if (PetscDefined(USE_REAL___FP16)) {
489:     PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
490:   }
491:   if (PetscDefined(USE_64BIT_INDICES)) {
492:     PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
493:   }
494: #if defined(__cplusplus)
495:   PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
496: #endif
497:   PetscStrlen(buildoptions,&len);
498:   if (len) {
499:     PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
500:   }
501:   PetscViewerXMLEndSection(viewer, "runspecification");
502:   return(0);
503: }

505: /* Print the global performance: max, max/min, average and total of
506:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
507:  */
508: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
509: {
510:   PetscErrorCode  ierr;
511:   PetscLogDouble  min, tot, ratio, avg;
512:   MPI_Comm        comm;
513:   PetscMPIInt     rank, size;
514:   PetscLogDouble  valrank[2], max[2];

517:   PetscObjectGetComm((PetscObject)viewer,&comm);
518:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
519:   MPI_Comm_rank(comm, &rank);

521:   valrank[0] = local_val;
522:   valrank[1] = (PetscLogDouble) rank;
523:   MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE,  MPI_MIN,    comm);
524:   MPIU_Allreduce(valrank,    &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
525:   MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);
526:   avg  = tot/((PetscLogDouble) size);
527:   if (min != 0.0) ratio = max[0]/min;
528:   else ratio = 0.0;

530:   PetscViewerXMLStartSection(viewer, name, desc);
531:   PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
532:   PetscViewerXMLPutInt(   viewer, "maxrank"  , "rank at which max was found" , (PetscMPIInt) max[1]);
533:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
534:   if (print_average) {
535:     PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
536:   }
537:   if (print_total) {
538:     PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
539:   }
540:   PetscViewerXMLEndSection(viewer, name);
541:   return(0);
542: }

544: /* Print the global performance: max, max/min, average and total of
545:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
546:  */
547: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
548: {
549:   PetscErrorCode  ierr;
550:   PetscLogDouble  flops, mem, red, mess;
551:   const PetscBool print_total_yes   = PETSC_TRUE,
552:                   print_total_no    = PETSC_FALSE,
553:                   print_average_no  = PETSC_FALSE,
554:                   print_average_yes = PETSC_TRUE;

557:   /* Must preserve reduction count before we go on */
558:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

560:   /* Calculate summary information */
561:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

563:   /*   Time */
564:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);

566:   /*   Objects */
567:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble) petsc_numObjects, print_average_yes, print_total_no);

569:   /*   Flop */
570:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops/1.0E6, print_average_yes, print_total_yes);

572:   /*   Flop/sec -- Must talk to Barry here */
573:   if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
574:   else flops = 0.0;
575:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops/1.0E6, print_average_yes, print_total_yes);

577:   /*   Memory */
578:   PetscMallocGetMaximumUsage(&mem);
579:   if (mem > 0.0) {
580:     PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem/1024.0/1024.0, print_average_yes, print_total_yes);
581:   }
582:   /*   Messages */
583:   mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
584:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);

586:   /*   Message Volume */
587:   mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
588:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess/1024.0/1024.0, print_average_yes, print_total_yes);

590:   /*   Reductions */
591:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red , print_average_no, print_total_no);
592:   PetscViewerXMLEndSection(viewer, "globalperformance");
593:   return(0);
594: }

596: typedef struct {
597:   PetscLogEvent  dftEvent;
598:   NestedEventId  nstEvent;
599:   PetscLogEvent  dftParent;
600:   NestedEventId  nstParent;
601:   PetscBool      own;
602:   int            depth;
603:   NestedEventId* nstPath;
604: } PetscNestedEventTree;

606: /* Compare timers to sort them in the tree */
607: static int compareTreeItems(const void *item1_, const void *item2_)
608: {
609:   int                  i;
610:   PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
611:   PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;

613:   for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
614:     if (item1->nstPath[i]<item2->nstPath[i]) return -1;
615:     if (item1->nstPath[i]>item2->nstPath[i]) return +1;
616:   }
617:   if (item1->depth < item2->depth) return -1;
618:   if (item1->depth > item2->depth) return 1;
619:   return 0;
620: }
621: /*
622:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
623:  * calls that happen in some processes, but not in others.
624:  *
625:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
626:  * The tree is sorted so that the timers can be printed in the order of appearance.
627:  *
628:  * For tree-items which appear in the trees of multiple processes (which will be most items), the
629:  * following rule is followed:
630:  * + if information from my own process is available, then that is the information stored in tree.
631:  *   otherwise it is some other process's information.
632:  */
633: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
634: {
635:   PetscNestedEventTree *tree = NULL, *newTree;
636:   int                  *treeIndices;
637:   int                  nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
638:   int                  yesno;
639:   PetscBool            done;
640:   PetscErrorCode       ierr;
641:   int                  maxdepth;
642:   int                  depth;
643:   int                  illegalEvent;
644:   int                  iextra;
645:   NestedEventId        *nstPath, *nstMyPath;
646:   MPI_Comm             comm;

649:   PetscObjectGetComm((PetscObject)viewer,&comm);

651:   /* Calculate memory needed to store everybody's information and allocate tree */
652:   nTimers = 0;
653:   for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;

655:   PetscMalloc1(nTimers,&tree);

657:   /* Fill tree with readily available information */
658:   iTimer0 = 0;
659:   maxDefaultTimer =0;
660:   for (i=0; i<nNestedEvents; i++) {
661:     int           nParents          = nestedEvents[i].nParents;
662:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
663:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
664:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
665:     for (j=0; j<nParents; j++) {
666:       maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);

668:       tree[iTimer0+j].dftEvent   = dftEvents[j];
669:       tree[iTimer0+j].nstEvent   = nstEvent;
670:       tree[iTimer0+j].dftParent  = dftParentsSorted[j];
671:       tree[iTimer0+j].own        = PETSC_TRUE;

673:       tree[iTimer0+j].nstParent  = 0;
674:       tree[iTimer0+j].depth      = 0;
675:       tree[iTimer0+j].nstPath    = NULL;
676:     }
677:     iTimer0 += nParents;
678:   }

680:   /* Calculate the global maximum for the default timer index, so array treeIndices can
681:    * be allocated only once */
682:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
683:   maxDefaultTimer = j;

685:   /* Find default timer's place in the tree */
686:   PetscCalloc1(maxDefaultTimer+1,&treeIndices);
687:   treeIndices[0] = 0;
688:   for (i=0; i<nTimers; i++) {
689:     PetscLogEvent dftEvent = tree[i].dftEvent;
690:     treeIndices[dftEvent] = i;
691:   }

693:   /* Find each dftParent's nested identification */
694:   for (i=0; i<nTimers; i++) {
695:     PetscLogEvent dftParent = tree[i].dftParent;
696:     if (dftParent!= DFT_ID_AWAKE) {
697:       int j = treeIndices[dftParent];
698:       tree[i].nstParent = tree[j].nstEvent;
699:     }
700:   }

702:   /* Find depths for each timer path */
703:   done = PETSC_FALSE;
704:   maxdepth = 0;
705:   while (!done) {
706:     done = PETSC_TRUE;
707:     for (i=0; i<nTimers; i++) {
708:       if (tree[i].dftParent == DFT_ID_AWAKE) {
709:         tree[i].depth = 1;
710:         maxdepth = PetscMax(1,maxdepth);
711:       } else {
712:         int j = treeIndices[tree[i].dftParent];
713:         depth = 1+tree[j].depth;
714:         if (depth>tree[i].depth) {
715:           done          = PETSC_FALSE;
716:           tree[i].depth = depth;
717:           maxdepth      = PetscMax(depth,maxdepth);
718:         }
719:       }
720:     }
721:   }

723:   /* Allocate the paths in the entire tree */
724:   for (i=0; i<nTimers; i++) {
725:     depth = tree[i].depth;
726:     PetscCalloc1(depth,&tree[i].nstPath);
727:   }

729:   /* Calculate the paths for all timers */
730:   for (depth=1; depth<=maxdepth; depth++) {
731:     for (i=0; i<nTimers; i++) {
732:       if (tree[i].depth==depth) {
733:         if (depth>1) {
734:           int    j = treeIndices[tree[i].dftParent];
735:           PetscArraycpy(tree[i].nstPath,tree[j].nstPath,depth-1);
736:         }
737:         tree[i].nstPath[depth-1] = tree[i].nstEvent;
738:       }
739:     }
740:   }
741:   PetscFree(treeIndices);

743:   /* Sort the tree on basis of the paths */
744:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

746:   /* Allocate an array to store paths */
747:   depth = maxdepth;
748:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
749:   PetscMalloc1(maxdepth+1, &nstPath);
750:   PetscMalloc1(maxdepth+1, &nstMyPath);

752:   /* Find an illegal nested event index (1+largest nested event index) */
753:   illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
754:   i = illegalEvent;
755:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

757:   /* First, detect timers which are not available in this process, but are available in others
758:    *        Allocate a new tree, that can contain all timers
759:    * Then,  fill the new tree with all (own and not-own) timers */
760:   newTree= NULL;
761:   for (yesno=0; yesno<=1; yesno++) {
762:     depth  = 1;
763:     i      = 0;
764:     iextra = 0;
765:     while (depth>0) {
766:       int       j;
767:       PetscBool same;

769:       /* Construct the next path in this process's tree:
770:        * if necessary, supplement with invalid path entries */
771:       depth++;
772:       if (depth > maxdepth + 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth+1 %d",depth,maxdepth+1);
773:       if (i<nTimers) {
774:         for (j=0;             j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
775:         for (j=tree[i].depth; j<depth;         j++) nstMyPath[j] = illegalEvent;
776:       } else {
777:         for (j=0;             j<depth;         j++) nstMyPath[j] = illegalEvent;
778:       }

780:       /* Communicate with other processes to obtain the next path and its depth */
781:       MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
782:       for (j=depth-1; (int) j>=0; j--) {
783:         if (nstPath[j]==illegalEvent) depth=j;
784:       }

786:       if (depth>0) {
787:         /* If the path exists */

789:         /* check whether the next path is the same as this process's next path */
790:         same = PETSC_TRUE;
791:         for (j=0; same && j<depth; j++) { same = (same &&  nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}

793:         if (same) {
794:           /* Register 'own path' */
795:           if (newTree) newTree[i+iextra] = tree[i];
796:           i++;
797:         } else {
798:           /* Register 'not an own path' */
799:           if (newTree) {
800:             newTree[i+iextra].nstEvent   = nstPath[depth-1];
801:             newTree[i+iextra].own        = PETSC_FALSE;
802:             newTree[i+iextra].depth      = depth;
803:             PetscMalloc1(depth, &newTree[i+iextra].nstPath);
804:             for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}

806:             newTree[i+iextra].dftEvent  = 0;
807:             newTree[i+iextra].dftParent = 0;
808:             newTree[i+iextra].nstParent = 0;
809:           }
810:           iextra++;
811:         }

813:       }
814:     }

816:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
817:     totalNTimers = nTimers + iextra;
818:     if (!newTree) {
819:       PetscMalloc1(totalNTimers, &newTree);
820:     }
821:   }
822:   PetscFree(nstPath);
823:   PetscFree(nstMyPath);
824:   PetscFree(tree);
825:   tree = newTree;
826:   newTree = NULL;

828:   /* Set return value and return */
829:   *p_tree    = tree;
830:   *p_nTimers = totalNTimers;
831:   return(0);
832: }

834: /*
835:  * Delete the nested timer tree
836:  */
837: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
838: {
839:   int             i;
840:   PetscErrorCode  ierr;

843:   for (i=0; i<nTimers; i++) {
844:     PetscFree(tree[i].nstPath);
845:   }
846:   PetscFree(tree);
847:   return(0);
848: }

850: /* Print the global performance: max, max/min, average and total of
851:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
852:  */
853: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer,const char *name,PetscLogDouble value,PetscLogDouble minthreshold,PetscLogDouble maxthreshold,PetscLogDouble minmaxtreshold)
854: {
855:   MPI_Comm       comm;                          /* MPI communicator in reduction */
856:   PetscMPIInt    rank;                          /* rank of this process */
857:   PetscLogDouble val_in[2], max[2], min[2];
858:   PetscLogDouble minvalue, maxvalue, tot;
859:   PetscMPIInt    size;
860:   PetscMPIInt    minLoc, maxLoc;

864:   PetscObjectGetComm((PetscObject)viewer,&comm);
865:   MPI_Comm_size(comm, &size);
866:   MPI_Comm_rank(comm, &rank);
867:   val_in[0] = value;
868:   val_in[1] = (PetscLogDouble) rank;
869:   MPIU_Allreduce(val_in, max,  1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
870:   MPIU_Allreduce(val_in, min,  1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
871:   maxvalue = max[0];
872:   maxLoc   = (PetscMPIInt) max[1];
873:   minvalue = min[0];
874:   minLoc   = (PetscMPIInt) min[1];
875:   MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);

877:   if (maxvalue<maxthreshold && minvalue>=minthreshold) {
878:     /* One call per parent or NO value: don't print */
879:   } else {
880:      PetscViewerXMLStartSection(viewer, name, NULL);
881:      if (maxvalue>minvalue*minmaxtreshold) {
882:        PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot/size, "%g");
883:        PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
884:        PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
885:        PetscViewerXMLPutInt(   viewer, "minloc"  , NULL, minLoc);
886:        PetscViewerXMLPutInt(   viewer, "maxloc"  , NULL, maxLoc);
887:      } else {
888:        PetscViewerXMLPutDouble(viewer, "value", NULL, tot/size, "%g");
889:      }
890:      PetscViewerXMLEndSection(viewer, name);
891:   }
892:   return(0);
893: }

895: #define N_COMM 8
896: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
897: {
898:   PetscLogDouble time = perfInfo.time;
899:   PetscLogDouble timeMx;
901:   MPI_Comm       comm;

904:   PetscObjectGetComm((PetscObject)viewer,&comm);
905:   MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
906:   *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
907:   if (*isPrinted) {
908:     PetscViewerXMLStartSection(viewer, "event", NULL);
909:     PetscViewerXMLPutString(viewer, "name", NULL, name);
910:     PetscPrintXMLNestedLinePerfResults(viewer, "time", time/totalTime*100.0, 0, 0, 1.02);
911:     PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount>0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
912:     PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time>=timeMx*0.001 ? 1e-6*perfInfo.flops/time : 0, 0, 0.01, 1.05);
913:     PetscPrintXMLNestedLinePerfResults(viewer, "mbps",time>=timeMx*0.001 ? perfInfo.messageLength/(1024*1024*time) : 0, 0, 0.01, 1.05);
914:     PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time>=timeMx*0.001 ? perfInfo.numReductions/time : 0, 0, 0.01, 1.05);
915:   }
916:   return(0);
917: }

919: /* Count the number of times the parent event was called */

921: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
922: {
923:   if (tree[i].depth<=1) {
924:     return 1;  /* Main event: only once */
925:   } else if (!tree[i].own) {
926:     return 1;  /* This event didn't happen in this process, but did in another */
927:   } else {
928:     int iParent;
929:     for (iParent=i-1; iParent>=0; iParent--) {
930:       if (tree[iParent].depth == tree[i].depth-1) break;
931:     }
932:     if (tree[iParent].depth != tree[i].depth-1) {
933:       /* *****  Internal error: cannot find parent */
934:       return -2;
935:     } else {
936:       PetscLogEvent dftEvent  = tree[iParent].dftEvent;
937:       return eventPerfInfo[dftEvent].count;
938:     }
939:   }
940: }

942: typedef struct {
943:   int             id;
944:   PetscLogDouble  val;
945: } PetscSortItem;

947: static int compareSortItems(const void *item1_, const void *item2_)
948: {
949:   PetscSortItem *item1 = (PetscSortItem *) item1_;
950:   PetscSortItem *item2 = (PetscSortItem *) item2_;
951:   if (item1->val > item2->val) return -1;
952:   if (item1->val < item2->val) return +1;
953:   return 0;
954: }

956: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
957: {
958:   int                depth = tree[iStart].depth;
959:   const char         *name;
960:   int                parentCount, nChildren;
961:   PetscSortItem      *children;
962:   PetscErrorCode     ierr;
963:   const int          stage = MAINSTAGE;
964:   PetscStageLog      stageLog;
965:   PetscEventRegInfo  *eventRegInfo;
966:   PetscEventPerfInfo *eventPerfInfo;
967:   PetscEventPerfInfo myPerfInfo,  otherPerfInfo, selfPerfInfo;
968:   PetscLogDouble     countsPerCall;
969:   PetscBool          wasPrinted;
970:   PetscBool          childWasPrinted;
971:   MPI_Comm           comm;

974:   /* Look up the name of the event and its PerfInfo */
975:   PetscLogGetStageLog(&stageLog);
976:   eventRegInfo  = stageLog->eventLog->eventInfo;
977:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
978:   name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
979:   PetscObjectGetComm((PetscObject)viewer,&comm);

981:   /* Count the number of child processes */
982:   nChildren = 0;
983:   {
984:     int i;
985:     for (i=iStart+1; i<nTimers; i++) {
986:       if (tree[i].depth <= depth) break;
987:       if (tree[i].depth == depth + 1) nChildren++;
988:     }
989:   }

991:   if (nChildren>0) {
992:     /* Create an array for the id-s and maxTimes of the children,
993:      *  leaving 2 spaces for self-time and other-time */
994:     int            i;
995:     PetscLogDouble *times, *maxTimes;

997:     PetscMalloc1(nChildren+2,&children);
998:     nChildren = 0;
999:     for (i=iStart+1; i<nTimers; i++) {
1000:       if (tree[i].depth<=depth) break;
1001:       if (tree[i].depth == depth + 1) {
1002:         children[nChildren].id  = i;
1003:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1004:         nChildren++;
1005:       }
1006:     }

1008:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
1009:     PetscMalloc1(nChildren,&times);
1010:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1012:     PetscMalloc1(nChildren,&maxTimes);
1013:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1014:     PetscFree(times);

1016:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1017:     PetscFree(maxTimes);
1018:   }

1020:   if (!tree[iStart].own) {
1021:   /* Set values for a timer that was not activated in this process
1022:    * (but was, in other processes of this run) */
1023:     PetscMemzero(&myPerfInfo,sizeof(myPerfInfo));

1025:     selfPerfInfo  = myPerfInfo;
1026:     otherPerfInfo = myPerfInfo;

1028:     parentCount   = 1;
1029:     countsPerCall = 0;
1030:   } else {
1031:   /* Set the values for a timer that was activated in this process */
1032:     int           i;
1033:     PetscLogEvent dftEvent   = tree[iStart].dftEvent;

1035:     parentCount    = countParents( tree, eventPerfInfo, iStart);
1036:     myPerfInfo     = eventPerfInfo[dftEvent];
1037:     countsPerCall  = (PetscLogDouble) myPerfInfo.count / (PetscLogDouble) parentCount;

1039:     selfPerfInfo                = myPerfInfo;
1040:     otherPerfInfo.time          = 0;
1041:     otherPerfInfo.flops         = 0;
1042:     otherPerfInfo.numMessages   = 0;
1043:     otherPerfInfo.messageLength = 0;
1044:     otherPerfInfo.numReductions = 0;

1046:     for (i=0; i<nChildren; i++) {
1047:       /* For all child counters: subtract the child values from self-timers */

1049:       PetscLogEvent      dftChild  = tree[children[i].id].dftEvent;
1050:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1052:       selfPerfInfo.time          -= childPerfInfo.time;
1053:       selfPerfInfo.flops         -= childPerfInfo.flops;
1054:       selfPerfInfo.numMessages   -= childPerfInfo.numMessages;
1055:       selfPerfInfo.messageLength -= childPerfInfo.messageLength;
1056:       selfPerfInfo.numReductions -= childPerfInfo.numReductions;

1058:       if ((children[i].val/totalTime) < THRESHOLD) {
1059:         /* Add them to 'other' if the time is ignored in the output */
1060:         otherPerfInfo.time          += childPerfInfo.time;
1061:         otherPerfInfo.flops         += childPerfInfo.flops;
1062:         otherPerfInfo.numMessages   += childPerfInfo.numMessages;
1063:         otherPerfInfo.messageLength += childPerfInfo.messageLength;
1064:         otherPerfInfo.numReductions += childPerfInfo.numReductions;
1065:       }
1066:     }
1067:   }

1069:   /* Main output for this timer */
1070:   PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1072:   /* Now print the lines for the children */
1073:   if (nChildren > 0) {
1074:     /* Calculate max-times for 'self' and 'other' */
1075:     int            i;
1076:     PetscLogDouble times[2], maxTimes[2];
1077:     times[0] = selfPerfInfo.time;   times[1] = otherPerfInfo.time;
1078:     MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1079:     children[nChildren+0].id = -1;
1080:     children[nChildren+0].val = maxTimes[0];
1081:     children[nChildren+1].id = -2;
1082:     children[nChildren+1].val = maxTimes[1];

1084:     /* Now sort the children (including 'self' and 'other') on total time */
1085:     qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);

1087:     /* Print (or ignore) the children in ascending order of total time */
1088:     PetscViewerXMLStartSection(viewer,"events", NULL);
1089:     for (i=0; i<nChildren+2; i++) {
1090:       if ((children[i].val/totalTime) < THRESHOLD) {
1091:         /* ignored: no output */
1092:       } else if (children[i].id==-1) {
1093:         PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1094:         if (childWasPrinted) {
1095:           PetscViewerXMLEndSection(viewer,"event");
1096:         }
1097:       } else if (children[i].id==-2) {
1098:         size_t  len;
1099:         char    *otherName;

1101:         PetscStrlen(name,&len);
1102:         PetscMalloc1(len+16,&otherName);
1103:         PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1104:         PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1105:         PetscFree(otherName);
1106:         if (childWasPrinted) {
1107:           PetscViewerXMLEndSection(viewer,"event");
1108:         }
1109:       } else {
1110:         /* Print the child with a recursive call to this function */
1111:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1112:       }
1113:     }
1114:     PetscViewerXMLEndSection(viewer,"events");
1115:     PetscFree(children);
1116:   }

1118:   if (wasPrinted) {
1119:     PetscViewerXMLEndSection(viewer, "event");
1120:   }
1121:   return(0);
1122: }

1124: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1125: {
1126:   int                i, nChildren;
1127:   PetscSortItem      *children;
1128:   PetscErrorCode     ierr;
1129:   const int          stage = MAINSTAGE;
1130:   PetscStageLog      stageLog;
1131:   PetscEventPerfInfo *eventPerfInfo;
1132:   MPI_Comm           comm;

1135:   PetscObjectGetComm((PetscObject)viewer,&comm);

1137:   /* Look up the PerfInfo */
1138:   PetscLogGetStageLog(&stageLog);
1139:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1141:   /* Count the number of child processes, and count total time */
1142:   nChildren = 0;
1143:   for (i=0; i<nTimers; i++)
1144:     if (tree[i].depth==1) nChildren++;

1146:   if (nChildren>0) {
1147:     /* Create an array for the id-s and maxTimes of the children,
1148:      *  leaving 2 spaces for self-time and other-time */
1149:     PetscLogDouble *times, *maxTimes;

1151:     PetscMalloc1(nChildren,&children);
1152:     nChildren = 0;
1153:     for (i=0; i<nTimers; i++) {
1154:       if (tree[i].depth == 1) {
1155:         children[nChildren].id  = i;
1156:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1157:         nChildren++;
1158:       }
1159:     }

1161:     /* Calculate the children's maximum times, to sort them */
1162:     PetscMalloc1(nChildren,&times);
1163:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1165:     PetscMalloc1(nChildren,&maxTimes);
1166:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1167:     PetscFree(times);

1169:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1170:     PetscFree(maxTimes);

1172:     /* Now sort the children on total time */
1173:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1174:     /* Print (or ignore) the children in ascending order of total time */
1175:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1176:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1177:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");

1179:     for (i=0; i<nChildren; i++) {
1180:       if ((children[i].val/totalTime) < THRESHOLD) {
1181:         /* ignored: no output */
1182:       } else {
1183:         /* Print the child with a recursive call to this function */
1184:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1185:       }
1186:     }
1187:     PetscViewerXMLEndSection(viewer, "timertree");
1188:     PetscFree(children);
1189:   }
1190:   return(0);
1191: }

1193: typedef struct {
1194:   char           *name;
1195:   PetscLogDouble time;
1196:   PetscLogDouble flops;
1197:   PetscLogDouble numMessages;
1198:   PetscLogDouble messageLength;
1199:   PetscLogDouble numReductions;
1200: } PetscSelfTimer;

1202: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1203: {
1204:   PetscErrorCode     ierr;
1205:   const int          stage = MAINSTAGE;
1206:   PetscStageLog      stageLog;
1207:   PetscEventRegInfo  *eventRegInfo;
1208:   PetscEventPerfInfo *eventPerfInfo;
1209:   PetscSelfTimer     *selftimes;
1210:   PetscSelfTimer     *totaltimes;
1211:   NestedEventId      *nstEvents;
1212:   int                i, j, maxDefaultTimer;
1213:   NestedEventId      nst;
1214:   PetscLogEvent      dft;
1215:   int                nstMax, nstMax_local;
1216:   MPI_Comm           comm;

1219:   PetscObjectGetComm((PetscObject)viewer,&comm);
1220:   PetscLogGetStageLog(&stageLog);
1221:   eventRegInfo  = stageLog->eventLog->eventInfo;
1222:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1224:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1225:   maxDefaultTimer =0;
1226:   for (i=0; i<nNestedEvents; i++) {
1227:     int           nParents   = nestedEvents[i].nParents;
1228:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1229:     for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1230:   }
1231:   PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1232:   for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1233:   for (i=0; i<nNestedEvents; i++) {
1234:     int           nParents   = nestedEvents[i].nParents;
1235:     NestedEventId nstEvent   = nestedEvents[i].nstEvent;
1236:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1237:     for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1238:   }

1240:   /* Calculate largest nested event-ID */
1241:   nstMax_local = 0;
1242:   for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1243:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);

1245:   /* Initialize all total-times with zero */
1246:   PetscMalloc1(nstMax+1,&selftimes);
1247:   PetscMalloc1(nstMax+1,&totaltimes);
1248:   for (nst=0; nst<=nstMax; nst++) {
1249:     totaltimes[nst].time          = 0;
1250:     totaltimes[nst].flops         = 0;
1251:     totaltimes[nst].numMessages   = 0;
1252:     totaltimes[nst].messageLength = 0;
1253:     totaltimes[nst].numReductions = 0;
1254:     totaltimes[nst].name          = NULL;
1255:   }

1257:   /* Calculate total-times */
1258:   for (i=0; i<nNestedEvents; i++) {
1259:     const int            nParents  = nestedEvents[i].nParents;
1260:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1261:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1262:     for (j=0; j<nParents; j++) {
1263:       const PetscLogEvent dftEvent = dftEvents[j];
1264:       totaltimes[nstEvent].time          += eventPerfInfo[dftEvent].time;
1265:       totaltimes[nstEvent].flops         += eventPerfInfo[dftEvent].flops;
1266:       totaltimes[nstEvent].numMessages   += eventPerfInfo[dftEvent].numMessages;
1267:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1268:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1269:     }
1270:     totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1271:   }

1273:   /* Initialize: self-times := totaltimes */
1274:   for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }

1276:   /* Subtract timed subprocesses from self-times */
1277:   for (i=0; i<nNestedEvents; i++) {
1278:     const int           nParents          = nestedEvents[i].nParents;
1279:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1280:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1281:     for (j=0; j<nParents; j++) {
1282:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1283:         const PetscLogEvent dftEvent  = dftEvents[j];
1284:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1285:         selftimes[nstParent].time          -= eventPerfInfo[dftEvent].time;
1286:         selftimes[nstParent].flops         -= eventPerfInfo[dftEvent].flops;
1287:         selftimes[nstParent].numMessages   -= eventPerfInfo[dftEvent].numMessages;
1288:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1289:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1290:       }
1291:     }
1292:   }

1294:   PetscFree(nstEvents);
1295:   PetscFree(totaltimes);

1297:   /* Set outputs */
1298:   *p_self  = selftimes;
1299:   *p_nstMax = nstMax;
1300:   return(0);
1301: }

1303: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1304: {
1305:   PetscErrorCode     ierr;
1306:   int                i;
1307:   NestedEventId      nst;
1308:   PetscSortItem      *sortSelfTimes;
1309:   PetscLogDouble     *times, *maxTimes;
1310:   PetscStageLog      stageLog;
1311:   PetscEventRegInfo  *eventRegInfo;
1312:   const int          dum_depth = 1, dum_count=1, dum_parentcount=1;
1313:   PetscBool          wasPrinted;
1314:   MPI_Comm           comm;

1317:   PetscObjectGetComm((PetscObject)viewer,&comm);
1318:   PetscLogGetStageLog(&stageLog);
1319:   eventRegInfo  = stageLog->eventLog->eventInfo;

1321:   PetscMalloc1(nstMax+1,&times);
1322:   PetscMalloc1(nstMax+1,&maxTimes);
1323:   for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1324:   MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1325:   PetscFree(times);

1327:   PetscMalloc1(nstMax+1,&sortSelfTimes);

1329:   /* Sort the self-timers on basis of the largest time needed */
1330:   for (nst=0; nst<=nstMax; nst++) {
1331:     sortSelfTimes[nst].id  = nst;
1332:     sortSelfTimes[nst].val = maxTimes[nst];
1333:   }
1334:   PetscFree(maxTimes);
1335:   qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);

1337:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1338:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1340:   for (i=0; i<=nstMax; i++) {
1341:     if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1342:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1343:       const char         *name    = eventRegInfo[(PetscLogEvent)nstEvent].name;
1344:       PetscEventPerfInfo selfPerfInfo;

1346:       selfPerfInfo.time          = selftimes[nstEvent].time ;
1347:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1348:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1349:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1350:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;

1352:       PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1353:       if (wasPrinted){
1354:         PetscViewerXMLEndSection(viewer, "event");
1355:       }
1356:     }
1357:   }
1358:   PetscViewerXMLEndSection(viewer, "selftimertable");
1359:   PetscFree(sortSelfTimes);
1360:   return(0);
1361: }

1363: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1364: {
1365:   PetscErrorCode       ierr;
1366:   PetscLogDouble       locTotalTime, globTotalTime;
1367:   PetscNestedEventTree *tree = NULL;
1368:   PetscSelfTimer       *selftimers = NULL;
1369:   int                  nTimers = 0, nstMax = 0;
1370:   MPI_Comm             comm;

1373:   PetscObjectGetComm((PetscObject)viewer,&comm);
1374:   PetscViewerInitASCII_XML(viewer);
1375:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1376:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1378:   /* Get the total elapsed time, local and global maximum */
1379:   PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
1380:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1382:   /* Print global information about this run */
1383:   PetscPrintExeSpecs(viewer);
1384:   PetscPrintGlobalPerformance(viewer, locTotalTime);
1385:   /* Collect nested timer tree info from all processes */
1386:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1387:   PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1388:   PetscLogNestedTreeDestroy(tree, nTimers);

1390:   /* Calculate self-time for all (not-nested) events */
1391:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1392:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1393:   PetscFree(selftimers);

1395:   PetscViewerXMLEndSection(viewer, "petscroot");
1396:   PetscViewerFinalASCII_XML(viewer);
1397:   return(0);
1398: }

1400: /*
1401:  * Populate an array containing the ancestors (callers) of an event.
1402:  */
1403: static PetscErrorCode GetEventAncestors(const PetscNestedEventTree *tree,int nEvents,const PetscNestedEventTree event,PetscNestedEventTree **ancestors,int *nAncestors)
1404: {
1405:   PetscErrorCode       ierr;
1406:   int                  pos=0;
1407:   PetscNestedEventTree currentEvent=event;

1410:   // Find the number of ancestors.
1411:   *nAncestors = 0;
1412:   while (currentEvent.dftParent != -1) {
1413:     for (int i=0; i<nEvents; i++) {
1414:       if (tree[i].dftEvent == currentEvent.dftParent) {
1415:         currentEvent = tree[i];
1416:         (*nAncestors)++;
1417:         break;
1418:       }
1419:     }
1420:   }
1421:   // Now populate the array.
1422:   PetscMalloc1(*nAncestors, ancestors);
1423:   currentEvent = event;
1424:   while (currentEvent.dftParent != -1) {
1425:     for (int i=0; i<nEvents; i++) {
1426:       if (tree[i].dftEvent == currentEvent.dftParent) {
1427:         currentEvent = tree[i];
1428:         (*ancestors)[pos] = currentEvent;
1429:         pos++;
1430:         break;
1431:       }
1432:     }
1433:   }
1434:   return(0);
1435: }

1437: /*
1438:  * Get the name of an event.
1439:  */
1440: static PetscErrorCode GetEventName(const PetscNestedEventTree event,char **name)
1441: {
1443:   PetscStageLog  stageLog;

1446:   PetscLogGetStageLog(&stageLog);
1447:   *name = stageLog->eventLog->eventInfo[event.nstEvent].name;
1448:   return(0);
1449: }

1451: /*
1452:  * Return the duration of an event in seconds.
1453:  */
1454: static PetscErrorCode GetEventTime(const PetscNestedEventTree event,PetscLogDouble *eventTime)
1455: {
1457:   PetscStageLog  stageLog;

1460:   PetscLogGetStageLog(&stageLog);
1461:   *eventTime = stageLog->stageInfo[MAINSTAGE].eventLog->eventInfo[event.dftEvent].time;
1462:   return(0);
1463: }

1465: /*
1466:  * Print nested logging information to a file suitable for reading into a Flame Graph.
1467:  *
1468:  * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1469:  * An example output would look like:
1470:  *   MatAssemblyBegin 1
1471:  *   MatAssemblyEnd 10
1472:  *   MatView 302
1473:  *   KSPSetUp 98
1474:  *   KSPSetUp;VecSet 5
1475:  *   KSPSolve 150
1476:  *
1477:  * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1478:  */
1479: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1480: {
1481:   int                  nEvents,nAncestors;
1482:   char                 *eventName=NULL;
1483:   PetscErrorCode       ierr;
1484:   PetscNestedEventTree *tree=NULL,*ancestors=NULL,event;
1485:   PetscLogDouble       eventTime=0,locTotalTime,globTotalTime;
1486:   MPI_Comm             comm;

1489:   // Determine the overall time for the program.
1490:   PetscTime(&locTotalTime);
1491:   locTotalTime -= petsc_BaseTime;
1492:   PetscObjectGetComm((PetscObject)viewer,&comm);
1493:   MPIU_Allreduce(&locTotalTime,&globTotalTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);

1495:   PetscLogNestedTreeCreate(viewer, &tree, &nEvents);

1497:   // Now write the events to the file.
1498:   for (int i=0; i<nEvents; i++) {
1499:     event     = tree[i];
1500:     GetEventTime(event,&eventTime);
1501:     if (eventTime/globTotalTime < THRESHOLD) continue;

1503:     // Print out the ancestor events in reverse order, starting with the oldest.
1504:     GetEventAncestors(tree,nEvents,event,&ancestors,&nAncestors);
1505:     for (int j=nAncestors-1; j>=0; j--) {
1506:       GetEventName(ancestors[j],&eventName);
1507:       PetscViewerASCIIPrintf(viewer,"%s;",eventName);
1508:     }
1509:     PetscFree(ancestors);
1510:     // Now print the actual event and duration.
1511:     // The time is written as an integer (in microseconds) so the file can be understood by tools such as Speedscope.
1512:     GetEventName(event,&eventName);
1513:     PetscViewerASCIIPrintf(viewer,"%s %" PetscInt64_FMT "\n",eventName,(PetscInt64)(eventTime*1e6));
1514:   }

1516:   PetscLogNestedTreeDestroy(tree, nEvents);
1517:   return(0);
1518: }

1520: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1521: {
1522: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1524: #endif

1527:   petsc_send_ct++;
1528: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1529:   PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_send_len);
1530: #endif
1531:   return(0);
1532: }

1534: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1535: {
1536: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1538: #endif

1541:   petsc_recv_ct++;
1542: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1543:   PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_recv_len);
1544: #endif
1545:   return(0);
1546: }

1548: PETSC_EXTERN PetscErrorCode PetscAReduce()
1549: {
1551:   petsc_allreduce_ct++;
1552:   return(0);
1553: }

1555: #endif