test/mpi: add p2p benchmarks in test/mpi/bench
Add point-to-point benchmark code in MyDef. The tests have automatic
warm-up and adjust the number of iterations for measurement accuracy.
They produce latency measurements with standard deviations and
equivalent bandwidths.

To run:
    mydef_page p2p.def  # -> p2p_latency.c p2p_bw.c
    mpicc p2p_latency.c && mpi_run -n 2 ./a.out
    mpicc p2p_bw.c      && mpi_run -n 2 ./a.out

Alternatively, use mydef_run (it reads settings from the config file):
    mydef_run p2p.def
hzhou committed Feb 12, 2024
1 parent a23225e commit ccc2757
Showing 4 changed files with 226 additions and 0 deletions.
4 changes: 4 additions & 0 deletions test/mpi/bench/config
@@ -0,0 +1,4 @@
module: c
output_dir: out
CC: mpicc
run: mpirun -n 2
88 changes: 88 additions & 0 deletions test/mpi/bench/macros/bench_frame.def
@@ -0,0 +1,88 @@
/*
 * bench_frame : boilerplate for mpi program
 * measure(iter) : measures `tf_dur` for $(iter) iterations
 * run_stat(N, var) : run N measurements and obtain (avg, std) in sum1, sum2
 * warm_up(iter, dur): repeat until measurements (iter, dur) stabilize
 * report_latency(N) : prints a line of latency results
 */

subcode: bench_frame
    $include stdio
    $include stdlib
    $include mpi

    $global grank, gsize: int

    $function main
        int errs = 0;

        MPI_Init(NULL, NULL);

        MPI_Comm_rank(MPI_COMM_WORLD, &grank);
        MPI_Comm_size(MPI_COMM_WORLD, &gsize);

        MPI_Comm comm = MPI_COMM_WORLD;
        char *buf = malloc($(MAX_MSG));

        $call @report_title
        $call main

        MPI_Finalize();

        return errs

macros:
    use_double: 1
    data: buf, size, MPI_CHAR
    MAX_MSG: 5000000

#----------------------------------------
subcode: _autoload
    $register_prefix(comm) MPI_Comm

subcode: foreach_size
    $for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2
        $(set:MSG_SIZE=size)
        BLOCK

subcode: measure(iter)
    tf_start = MPI_Wtime()
    $for 0:$(iter)
        BLOCK
    tf_dur = MPI_Wtime() - tf_start

subcode: run_stat(N, var)
    $my double sum1=0, double sum2=0
    $for 0:$(N)
        BLOCK
        sum1 += $(var)
        sum2 += $(var) * $(var)
    sum1 /= $(N)
    sum2 /= $(N)
    sum2 = sqrt(sum2 - sum1 * sum1)

subcode: warm_up(iter, dur)
    # MIN_ITER: enough iterations for one measurement to last ~1 ms
    $(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
    $(iter) = 2
    $my double last_dur = 1.0
    $my int num_best = 0
    $while num_best < 10
        BLOCK
        $if $(iter) < $(MIN_ITER)
            $(iter) = $(MIN_ITER)
            num_best = 0
            continue
        # check that $(dur) is no longer monotonically decreasing
        $if $(dur) > last_dur
            num_best++
        last_dur = $(dur)

subcode: report_latency(N)
    tf_latency = sum1 / ($(N)) * 1e6
    tf_sigma = sum2 / ($(N)) * 1e6
    $(if:MSG_SIZE)
        # bytes per microsecond, i.e. MB/s
        tf_bw = $(MSG_SIZE) / tf_latency
        printf(" %10d %10.3f %6.3f %10.3f\n", $(MSG_SIZE), tf_latency, tf_sigma, tf_bw)
    $(else)
        printf(" %10.3f %6.3f\n", tf_latency, tf_sigma)
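
For readers who don't write MyDef, here is a rough C sketch of what the measure, run_stat, and warm_up macros above expand to. The wrapper functions and the run() callback are illustrative only; the generated code inlines BLOCK directly:

    #include <math.h>
    #include <mpi.h>

    /* measure(iter) inside run_stat(N, var): returns the mean per-iteration
     * latency over N measurements; *sigma gets the standard deviation,
     * computed as sqrt(E[x^2] - E[x]^2). */
    static double run_stat_sketch(int N, int iter, double *sigma)
    {
        double sum1 = 0, sum2 = 0;
        for (int n = 0; n < N; n++) {
            double tf_start = MPI_Wtime();
            for (int i = 0; i < iter; i++) {
                /* BLOCK: the timed operation, e.g. one ping-pong */
            }
            double tf_dur = MPI_Wtime() - tf_start;
            /* per-iteration time (done by the caller in bench_p2p) */
            double tf_latency = tf_dur / iter;
            sum1 += tf_latency;
            sum2 += tf_latency * tf_latency;
        }
        sum1 /= N;
        sum2 /= N;
        *sigma = sqrt(sum2 - sum1 * sum1);
        return sum1;
    }

    /* warm_up(iter, dur): grow iter until one measurement lasts ~1 ms,
     * then keep measuring until 10 runs are no longer monotonically
     * decreasing, i.e. caches and connections are warm. */
    static int warm_up_sketch(double (*run)(int iter))
    {
        int iter = 2;
        double last_dur = 1.0;
        int num_best = 0;
        while (num_best < 10) {
            double tf_dur = run(iter);
            int min_iter = (int) (iter * 0.001 / tf_dur);
            if (iter < min_iter) {
                iter = min_iter;    /* aim for ~1 ms per measurement */
                num_best = 0;
                continue;
            }
            if (tf_dur > last_dur)
                num_best++;
            last_dur = tf_dur;
        }
        return iter;
    }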

79 changes: 79 additions & 0 deletions test/mpi/bench/macros/bench_p2p.def
@@ -0,0 +1,79 @@
/*
 * Defines the following functions:
 *     bench_p2p
 *     bench_send, bench_warmup
 *     bench_recv
 *
 * For each measurement -
 *     First, the sender tells the receiver the `iter` parameter; `iter = 0`
 *     means quit. Each iteration runs `send_side` and `recv_side`; the
 *     timing on the sender side is taken as the latency measurement.
 *
 * Caller page defines -
 *     subcode: send_side, recv_side
 *     macros:
 *         params: function parameters for bench_p2p etc.
 *         MSG_SIZE: if defined, report_latency will include bandwidth
 *         MULTIPLICITY: divisor for each measurement
 */

subcode: _autoload
    $register_name(src) int
    $register_name(dst) int
    $define TAG 0
    $define SYNC_TAG 100

subcode: report_title
    $if gsize != 2
        printf("! Test $(_pagename) requires 2 processes !\n");
        return 0
    $if grank == 0
        printf("# Test $(_pagename): msg-size avg-latency sigma avg-bandwidth\n")

fncode: bench_p2p(comm, src, dst, @params)
    int rank;
    MPI_Comm_rank(comm, &rank)

    $(if:!REPEAT)
        $(set:REPEAT=20)
    $(if:!MULTIPLICITY)
        $(set:MULTIPLICITY=1)

    $if rank == src
        iter = bench_warmup(comm, dst, $(params))
        &call run_stat, $(REPEAT), tf_latency
            tf_latency = bench_send(iter, comm, dst, $(params))
            tf_latency /= iter
        $call report_latency, $(MULTIPLICITY)
        $call send_stop
    $elif rank == dst
        bench_recv(comm, src, $(params))

    subcode: send_stop
        iter = 0;
        MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm)

#----------------------------------------
fncode: bench_send(int iter, comm, dst, @params)
    # synchronize with receiver
    MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm);

    &call measure, iter
        $call @send_side

    return tf_dur

fncode: bench_recv(comm, src, @params)
    $while 1
        int iter;
        # synchronize with sender
        MPI_Recv(&iter, 1, MPI_INT, src, SYNC_TAG, comm, MPI_STATUS_IGNORE);
        $if iter == 0
            # time to quit
            break
        $for i=0:iter
            $call @recv_side

fncode: bench_warmup(comm, dst, @params): int
    &call warm_up, iter, tf_dur
        tf_dur = bench_send(iter, comm, dst, $(params))
    return iter
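
The control flow above amounts to a small handshake protocol. A hedged C sketch, with illustrative function names and placeholder loop bodies: before each timed run the sender posts the iteration count on SYNC_TAG, and a count of 0 tells the receiver to exit.

    #include <mpi.h>

    #define SYNC_TAG 100

    /* Sketch of one sender-side run plus shutdown (bench_send + send_stop) */
    static void sender_sketch(MPI_Comm comm, int dst, int iter)
    {
        /* announce the iteration count, then run the timed send_side bodies */
        MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm);
        for (int i = 0; i < iter; i++) {
            /* send_side BLOCK */
        }
        /* after all measurements: iter = 0 tells the receiver to quit */
        int stop = 0;
        MPI_Send(&stop, 1, MPI_INT, dst, SYNC_TAG, comm);
    }

    /* Sketch of bench_recv: loop until the sender signals quit */
    static void receiver_sketch(MPI_Comm comm, int src)
    {
        while (1) {
            int iter;
            MPI_Recv(&iter, 1, MPI_INT, src, SYNC_TAG, comm, MPI_STATUS_IGNORE);
            if (iter == 0)
                break;
            for (int i = 0; i < iter; i++) {
                /* recv_side BLOCK */
            }
        }
    }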
55 changes: 55 additions & 0 deletions test/mpi/bench/p2p.def
@@ -0,0 +1,55 @@
/* Instructions:
 *     mydef_page p2p.def    # -> p2p_latency.c p2p_bw.c
 *     mpicc p2p_latency.c && mpi_run -n 2 ./a.out
 *     mpicc p2p_bw.c      && mpi_run -n 2 ./a.out
 *
 * Refer to the output C code or to bench_{frame,p2p}.def.
 */

include: macros/bench_frame.def
include: macros/bench_p2p.def

subcode: _autoload
    $register_name(buf) void *
    $register_name(size) int
    $register_name(batch_size) int

page: p2p_latency, bench_frame
    params: buf, size
    MSG_SIZE: size
    MULTIPLICITY: 2

    bench_p2p(comm, 0, 1, buf, 0)
    $for int size = 1; size < $(MAX_MSG); size *= 2
        bench_p2p(comm, 0, 1, buf, size)

    subcode: send_side
        MPI_Send($(data), dst, TAG, comm);
        MPI_Recv($(data), dst, TAG, comm, MPI_STATUS_IGNORE);

    subcode: recv_side
        MPI_Recv($(data), src, TAG, comm, MPI_STATUS_IGNORE);
        MPI_Send($(data), src, TAG, comm);

page: p2p_bw, bench_frame
    params: buf, size, batch_size
    MSG_SIZE: size
    MULTIPLICITY: batch_size
    MAX_BATCH_SIZE: 64

    $for int size = 1; size < $(MAX_MSG); size *= 2
        bench_p2p(comm, 0, 1, buf, size, 64)

    subcode: send_side
        $my MPI_Request reqs[$(MAX_BATCH_SIZE)]
        $for j=0:batch_size
            MPI_Isend($(data), dst, TAG, comm, &reqs[j])
        MPI_Waitall(batch_size, reqs, MPI_STATUSES_IGNORE)
        MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, dst, TAG, comm, MPI_STATUS_IGNORE)

    subcode: recv_side
        $my MPI_Request reqs[$(MAX_BATCH_SIZE)]
        $for j=0:batch_size
            MPI_Irecv($(data), src, TAG, comm, &reqs[j])
        MPI_Waitall(batch_size, reqs, MPI_STATUSES_IGNORE)
        MPI_Send(NULL, 0, MPI_DATATYPE_NULL, src, TAG, comm)
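
The MULTIPLICITY settings follow from what one timed iteration does: a p2p_latency iteration is a blocking ping-pong (two messages, so the measurement is halved per message), while a p2p_bw iteration moves batch_size messages plus a zero-byte ack. A C sketch of the two generated send-side bodies, with $(data) expanded per the data macro in bench_frame.def; the wrapper functions themselves are illustrative:

    #include <mpi.h>

    #define TAG 0
    #define MAX_BATCH_SIZE 64

    /* p2p_latency send_side: a blocking ping-pong, i.e. two messages per
     * timed iteration, hence MULTIPLICITY: 2 */
    static void latency_send_side(MPI_Comm comm, int dst, char *buf, int size)
    {
        MPI_Send(buf, size, MPI_CHAR, dst, TAG, comm);
        MPI_Recv(buf, size, MPI_CHAR, dst, TAG, comm, MPI_STATUS_IGNORE);
    }

    /* p2p_bw send_side: batch_size nonblocking sends completed together,
     * then a zero-byte ack, hence MULTIPLICITY: batch_size */
    static void bw_send_side(MPI_Comm comm, int dst, char *buf, int size,
                             int batch_size)
    {
        MPI_Request reqs[MAX_BATCH_SIZE];
        for (int j = 0; j < batch_size; j++)
            MPI_Isend(buf, size, MPI_CHAR, dst, TAG, comm, &reqs[j]);
        MPI_Waitall(batch_size, reqs, MPI_STATUSES_IGNORE);
        MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, dst, TAG, comm, MPI_STATUS_IGNORE);
    }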
