Fix Profiler + Minor improvements (#1548)

There is currently a bug in the profiler, that the first run recorded includes the runtime of all the warmup runs, making one ugly outlier. This is because `times[0]` is set before the warmups and is never properly reset. This fixes that and makes the surrounding code a bit cleaner (e.g. using `np.diff` instead of list comprehension). I also added some options that make the profiler more usable when it's called inside of other `tqdm` progress bars (allows setting `tqdm`'s `leave=False` and stop it from printing results). List of changes: - Fix: timing error - Add: `tqdm_leave` parameter - Add: `print_results` parameter - Cleanup: Only print `Profiling...` if `tqdm` is not used - Cleanup: use `warnings.warn()` instead of `print("WARNING: ...")` - Cleanup: remove unused variable for return value (didn't seem to be used) - Cleanup: make diff calculation more concise --------- Co-authored-by: Tal Ben-Nun <tbennun@users.noreply.github.com>
spcl · Mar 16, 2024 · e542965 · e542965
1 parent 5f9233e
commit e542965
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 22 deletions.
diff --git a/dace/builtin_hooks.py b/dace/builtin_hooks.py
@@ -15,7 +15,12 @@
 
 
 @contextmanager
-def profile(repetitions: int = 100, warmup: int = 0):
+def profile(
+    repetitions: int = 100,
+    warmup: int = 0,
+    tqdm_leave: bool = True,
+    print_results: bool = True,
+):
     """
     Context manager that enables profiling of each called DaCe program. If repetitions is greater than 1, the
     program is run multiple times and the average execution time is reported.
@@ -35,6 +40,10 @@ def profile(repetitions: int = 100, warmup: int = 0):
 
     :param repetitions: The number of times to run each DaCe program.
     :param warmup: Number of additional repetitions to run the program without measuring time.
+    :param tqdm_leave: Sets the ``leave`` parameter of the ``tqdm`` progress bar (useful
+        for nested progress bars). Ignored if tqdm progress bar is not used.
+    :param print_results: Whether or not to print the median execution time after
+        all repetitions.
     :note: Running functions multiple times may affect the results of the program.
     """
     from dace.frontend.operations import CompiledSDFGProfiler  # Avoid circular import
@@ -51,7 +60,7 @@ def profile(repetitions: int = 100, warmup: int = 0):
             yield hook
             return
 
-    profiler = CompiledSDFGProfiler(repetitions, warmup)
+    profiler = CompiledSDFGProfiler(repetitions, warmup, tqdm_leave, print_results)
 
     with on_compiled_sdfg_call(context_manager=profiler):
         yield profiler

diff --git a/dace/frontend/operations.py b/dace/frontend/operations.py
@@ -1,6 +1,7 @@
 # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
 from __future__ import print_function
 from functools import partial
+from itertools import chain, repeat
 
 from contextlib import contextmanager
 from timeit import default_timer as timer
@@ -10,6 +11,7 @@
 import sympy
 import os
 import sys
+import warnings
 
 from dace import dtypes
 from dace.config import Config
@@ -28,12 +30,20 @@ class CompiledSDFGProfiler:
 
     times: List[Tuple['SDFG', List[float]]]  #: The list of SDFGs and times for each SDFG called within the context.
 
-    def __init__(self, repetitions: int = 0, warmup: int = 0) -> None:
+    def __init__(
+        self,
+        repetitions: int = 0,
+        warmup: int = 0,
+        tqdm_leave: bool = True,
+        print_results: bool = True,
+    ) -> None:
         # Avoid import loop
         from dace.codegen.instrumentation import report
 
         self.repetitions = repetitions or int(Config.get('treps'))
         self.warmup = warmup
+        self.tqdm_leave = tqdm_leave
+        self.print_results = print_results
         if self.repetitions < 1:
             raise ValueError('Number of repetitions must be at least 1')
         if self.warmup < 0:
@@ -47,34 +57,45 @@ def __init__(self, repetitions: int = 0, warmup: int = 0) -> None:
     def __call__(self, compiled_sdfg: 'CompiledSDFG', args: Tuple[Any, ...]):
         from dace.codegen.instrumentation import report  # Avoid import loop
 
-        start = timer()
+        # zeros to overwrite start time, followed by indices for each repetition
+        iterator = chain(repeat(0, self.warmup), range(1, self.repetitions + 1))
 
-        times = [start] * (self.repetitions + 1)
-        ret = None
-        print('\nProfiling...')
-
-        iterator = range(self.warmup + self.repetitions)
         if Config.get_bool('profiling_status'):
             try:
                 from tqdm import tqdm
-                iterator = tqdm(iterator, desc="Profiling", file=sys.stdout)
+
+                iterator = tqdm(
+                    iterator,
+                    desc='Profiling',
+                    total=(self.warmup + self.repetitions),
+                    file=sys.stdout,
+                    leave=self.tqdm_leave,
+                )
             except ImportError:
-                print('WARNING: Cannot show profiling progress, missing optional '
-                      'dependency tqdm...\n\tTo see a live progress bar please install '
-                      'tqdm (`pip install tqdm`)\n\tTo disable this feature (and '
-                      'this warning) set `profiling_status` to false in the dace '
-                      'config (~/.dace.conf).')
+                warnings.warn(
+                    'Cannot show profiling progress, missing optional dependency '
+                    'tqdm...\n\tTo see a live progress bar please install tqdm '
+                    '(`pip install tqdm`)\n\tTo disable this feature (and this '
+                    'warning) set `profiling_status` to false in the dace config '
+                    '(~/.dace.conf).'
+                )
+                print('\nProfiling...')
+        else:
+            print('\nProfiling...')
 
-        offset = 1 - self.warmup
         start_time = int(time.time())
+
+        times = np.ndarray(self.repetitions + 1, dtype=np.float64)
         times[0] = timer()
+
         for i in iterator:
             # Call function
             compiled_sdfg._cfunc(compiled_sdfg._libhandle, *args)
-            if i >= self.warmup:
-                times[i + offset] = timer()
 
-        diffs = np.array([(times[i] - times[i - 1])*1e3 for i in range(1, self.repetitions + 1)])
+            times[i] = timer()
+
+        # compute pairwise differences and convert to milliseconds
+        diffs = np.diff(times) * 1e3
 
         # Add entries to the instrumentation report
         self.report.name = self.report.name or start_time
@@ -88,8 +109,9 @@ def __call__(self, compiled_sdfg: 'CompiledSDFG', args: Tuple[Any, ...]):
         self.report.durations[(0, -1, -1)][f'Python call to {compiled_sdfg.sdfg.name}'][-1].extend(diffs)
 
         # Print profiling results
-        time_msecs = np.median(diffs)
-        print(compiled_sdfg.sdfg.name, time_msecs, 'ms')
+        if self.print_results:
+            time_msecs = np.median(diffs)
+            print(compiled_sdfg.sdfg.name, time_msecs, 'ms')
 
         # Save every call separately
         self.times.append((compiled_sdfg.sdfg, diffs))
@@ -105,7 +127,7 @@ def __call__(self, compiled_sdfg: 'CompiledSDFG', args: Tuple[Any, ...]):
         # Restore state after skipping contents
         compiled_sdfg.do_not_execute = old_dne
 
-        return ret
+        return None
 
 
 def detect_reduction_type(wcr_str, openmp=False):