From 1a823bd51db86dd106d622bad5704819f5b93ac2 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 28 Feb 2023 10:31:26 -0600 Subject: [PATCH 01/13] coll: use pointer instead of array in MPIC_Waitall Because the statuses parameter can accept MPI_STATUSES_IGNORE, we need to use a pointer rather than an array, or modern compilers may complain. --- src/include/mpir_coll.h | 2 +- src/mpi/coll/helper_fns.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/mpir_coll.h b/src/include/mpir_coll.h index c7b22a602b8..6ddc6d428f7 100644 --- a/src/include/mpir_coll.h +++ b/src/include/mpir_coll.h @@ -47,7 +47,7 @@ int MPIC_Issend(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest MPIR_Comm * comm_ptr, MPIR_Request ** request, MPIR_Errflag_t * errflag); int MPIC_Irecv(void *buf, MPI_Aint count, MPI_Datatype datatype, int source, int tag, MPIR_Comm * comm_ptr, MPIR_Request ** request); -int MPIC_Waitall(int numreq, MPIR_Request * requests[], MPI_Status statuses[], +int MPIC_Waitall(int numreq, MPIR_Request * requests[], MPI_Status * statuses, MPIR_Errflag_t * errflag); int MPIR_Reduce_local(const void *inbuf, void *inoutbuf, MPI_Aint count, MPI_Datatype datatype, diff --git a/src/mpi/coll/helper_fns.c b/src/mpi/coll/helper_fns.c index e15b6fce32d..9889c9fd654 100644 --- a/src/mpi/coll/helper_fns.c +++ b/src/mpi/coll/helper_fns.c @@ -587,7 +587,7 @@ int MPIC_Irecv(void *buf, MPI_Aint count, MPI_Datatype datatype, int source, } -int MPIC_Waitall(int numreq, MPIR_Request * requests[], MPI_Status statuses[], +int MPIC_Waitall(int numreq, MPIR_Request * requests[], MPI_Status * statuses, MPIR_Errflag_t * errflag) { int mpi_errno = MPI_SUCCESS; From ef8d19d0b4bce76accfa6140ef8feebfa53b8fee Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Fri, 26 May 2023 11:20:11 -0500 Subject: [PATCH 02/13] confdb: Improve f08 support test Add a call to CFI_is_contiguous, which is needed by the f08 binding. 
Some compilers provide this prototype, but not the symbol, so we need to disable f08 if the test fails to link. Fixes pmodels/mpich#6505 --- confdb/aclocal_fc.m4 | 1 + 1 file changed, 1 insertion(+) diff --git a/confdb/aclocal_fc.m4 b/confdb/aclocal_fc.m4 index bc11e2c52ae..97897badb13 100644 --- a/confdb/aclocal_fc.m4 +++ b/confdb/aclocal_fc.m4 @@ -1092,6 +1092,7 @@ int foo_c(CFI_cdesc_t * a_desc, CFI_cdesc_t * b_desc) void test_assumed_rank_async_impl_c(CFI_cdesc_t * a_desc) { + CFI_is_contiguous(a_desc); return; } ]])],[mv conftest.$OBJEXT conftest1.$OBJEXT],[f08_works=no]) From 868ce6dbf462c577acf482952abc421e1ef39fba Mon Sep 17 00:00:00 2001 From: "rithwik.tom" Date: Fri, 24 Feb 2023 14:49:48 -0800 Subject: [PATCH 03/13] ch4/posix: Use correct CVAR to read posix tuning json file --- src/mpid/ch4/shm/posix/posix_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpid/ch4/shm/posix/posix_init.c b/src/mpid/ch4/shm/posix/posix_init.c index 9ce8c3b1a99..eb1021cc4b2 100644 --- a/src/mpid/ch4/shm/posix/posix_init.c +++ b/src/mpid/ch4/shm/posix/posix_init.c @@ -277,7 +277,7 @@ int MPIDI_POSIX_coll_init(int rank, int size) mpi_errno = MPIR_Csel_create_from_buf(MPIDI_POSIX_coll_generic_json, create_container, &MPIDI_global.shm.posix.csel_root); } else { - mpi_errno = MPIR_Csel_create_from_file(MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE, + mpi_errno = MPIR_Csel_create_from_file(MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE, create_container, &MPIDI_global.shm.posix.csel_root); } MPIR_ERR_CHECK(mpi_errno); From 4ec33ec85a09d90b22bd13b7e2f82f8bdaaa2bb9 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Tue, 30 May 2023 14:48:40 -0500 Subject: [PATCH 04/13] errhan: Add error message for canceling inactive persistent request Return an accurate error message to the user when they try to cancel an inactive persistent send or recv request. Closes pmodels/mpich#6542. 
--- src/mpi/errhan/errnames.txt | 1 + src/mpi/request/request_impl.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt index a6323e0baef..605173bf9dd 100644 --- a/src/mpi/errhan/errnames.txt +++ b/src/mpi/errhan/errnames.txt @@ -112,6 +112,7 @@ also the value at index %d deferred because of resource limits is not implemented **notgenreq:Attempt to complete a request with MPI_GREQUEST_COMPLETE that \ was not started with MPI_GREQUEST_START +**cancelinactive:Attempt to cancel an inactive persistent request **cancelunknown:Attempt to cancel an unknown type of request **permop:Cannot free permanent MPI_Op **attrsentinal:Internal fields in an attribute have been overwritten; \ diff --git a/src/mpi/request/request_impl.c b/src/mpi/request/request_impl.c index 28ba2769e6f..3221d35c8a0 100644 --- a/src/mpi/request/request_impl.c +++ b/src/mpi/request/request_impl.c @@ -95,7 +95,7 @@ int MPIR_Cancel_impl(MPIR_Request * request_ptr) MPIR_ERR_CHECK(mpi_errno); } } else { - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_REQUEST, "**requestpersistactive"); + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_REQUEST, "**cancelinactive"); } break; } @@ -106,7 +106,7 @@ int MPIR_Cancel_impl(MPIR_Request * request_ptr) mpi_errno = MPID_Cancel_recv(request_ptr->u.persist.real_request); MPIR_ERR_CHECK(mpi_errno); } else { - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_REQUEST, "**requestpersistactive"); + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_REQUEST, "**cancelinactive"); } break; } From 2bdeda77552d82768c83e002e0fb77de75f97a5e Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 24 Mar 2023 17:07:45 -0500 Subject: [PATCH 05/13] ch4/ofi: retry in MPIDI_OFI_do_iprobe We should retry the fi_trecvmsg call if it returns -FI_EAGAIN. 
--- src/mpid/ch4/netmod/ofi/ofi_impl.h | 18 ++++++++++++++++++ src/mpid/ch4/netmod/ofi/ofi_probe.h | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 46dd21de107..b352e7b9c5c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -101,6 +101,24 @@ int MPIDI_OFI_handle_cq_error(int vni, int nic, ssize_t ret); } while (_ret == -FI_EAGAIN); \ } while (0) +#define MPIDI_OFI_CALL_RETRY_RETURN(FUNC,vci_,ret) \ + do { \ + int _retry = MPIR_CVAR_CH4_OFI_MAX_EAGAIN_RETRY; \ + while (1) { \ + ret = FUNC; \ + if (likely(ret != -FI_EAGAIN)) { \ + break; \ + } \ + if (_retry > 0) { \ + _retry--; \ + MPIR_ERR_CHKANDJUMP(_retry == 0, mpi_errno, MPIX_ERR_EAGAIN, "**eagain"); \ + } \ + MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vci_); \ + mpi_errno = MPIDI_OFI_retry_progress(); \ + MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vci_); \ + } \ + } while (0) + /* per-vci macros - we'll transition into these macros once the locks are * moved down to ofi-layer */ #define MPIDI_OFI_VCI_PROGRESS(vci_) \ diff --git a/src/mpid/ch4/netmod/ofi/ofi_probe.h b/src/mpid/ch4/netmod/ofi/ofi_probe.h index 7c529995e89..d69316856d1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_probe.h +++ b/src/mpid/ch4/netmod/ofi/ofi_probe.h @@ -67,7 +67,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, if (message) { recv_flags |= FI_CLAIM; } - MPIDI_OFI_CALL_RETURN(fi_trecvmsg(MPIDI_OFI_global.ctx[ctx_idx].rx, &msg, recv_flags), ofi_err); + ofi_err = 0; + MPIDI_OFI_CALL_RETRY_RETURN(fi_trecvmsg(MPIDI_OFI_global.ctx[ctx_idx].rx, &msg, recv_flags), + vni_dst, ofi_err); if (ofi_err == -FI_ENOMSG) { *flag = 0; if (message) From 4e1999221952fceba7466a597bebd3cf03efc87f Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Wed, 31 May 2023 16:00:05 -0500 Subject: [PATCH 06/13] submodule: Update libfabric Add a patch to disable an error message from the psm3 provider in builds with 
--disable-shared. Fixes pmodels/mpich#6518. --- modules/libfabric | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/libfabric b/modules/libfabric index d1a0275a1b2..5951f1c46cb 160000 --- a/modules/libfabric +++ b/modules/libfabric @@ -1 +1 @@ -Subproject commit d1a0275a1b20833593fe81cf3c1a505e3c127ff8 +Subproject commit 5951f1c46cb98b9c4301cf48167e83710f8e54ac From 69697f2b5829b85e7f723836189e78dc7ad3a873 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Tue, 30 May 2023 10:37:54 -0500 Subject: [PATCH 07/13] ch4/coll: Fix reduce composition alpha We need to handle the case where a non-zero root uses MPI_IN_PLACE. Otherwise we could try reading from a bad address and crash. Fixes pmodels/mpich#6540. NOTE: For single node reduce operation with non-zero root, this composition incurs an extra copy from rank 0->root. --- src/mpid/ch4/src/ch4_coll_impl.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/mpid/ch4/src/ch4_coll_impl.h b/src/mpid/ch4/src/ch4_coll_impl.h index cfe834fe389..780c794332a 100644 --- a/src/mpid/ch4/src/ch4_coll_impl.h +++ b/src/mpid/ch4/src/ch4_coll_impl.h @@ -630,7 +630,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_Reduce_intra_composition_alpha(const void *se MPI_Aint true_lb = 0; MPI_Aint true_extent = 0; MPI_Aint extent = 0; - const void *inter_sendbuf; + const void *intra_sendbuf, *inter_sendbuf; void *ori_recvbuf = recvbuf; MPIR_CHKLMEM_DECL(1); @@ -648,14 +648,19 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_Reduce_intra_composition_alpha(const void *se recvbuf = (void *) ((char *) recvbuf - true_lb); } + /* non-zero root needs to send from recvbuf if using MPI_IN_PLACE */ + intra_sendbuf = (sendbuf == MPI_IN_PLACE && root != 0) ? 
recvbuf : sendbuf; + /* intranode reduce on all nodes */ if (comm->node_comm != NULL) { #ifndef MPIDI_CH4_DIRECT_NETMOD - mpi_errno = MPIDI_SHM_mpi_reduce(sendbuf, recvbuf, count, datatype, op, 0, comm->node_comm, - errflag); + mpi_errno = + MPIDI_SHM_mpi_reduce(intra_sendbuf, recvbuf, count, datatype, op, 0, comm->node_comm, + errflag); #else - mpi_errno = MPIDI_NM_mpi_reduce(sendbuf, recvbuf, count, datatype, op, 0, comm->node_comm, - errflag); + mpi_errno = + MPIDI_NM_mpi_reduce(intra_sendbuf, recvbuf, count, datatype, op, 0, comm->node_comm, + errflag); #endif /* MPIDI_CH4_DIRECT_NETMOD */ if (mpi_errno) { From 83ee4708be54193b63bd195940f13f8538d9725e Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 26 May 2023 08:40:23 -0500 Subject: [PATCH 08/13] test/coll: add test allred_float This test checks whether MPI_Allreduce produces identical results on all ranks with floating point datatype. --- test/mpi/coll/Makefile.am | 1 + test/mpi/coll/allred_float.c | 98 ++++++++++++++++++++++++++++++++++++ test/mpi/coll/testlist.in | 1 + 3 files changed, 100 insertions(+) create mode 100644 test/mpi/coll/allred_float.c diff --git a/test/mpi/coll/Makefile.am b/test/mpi/coll/Makefile.am index 42508cc78f2..edb76abd0a2 100644 --- a/test/mpi/coll/Makefile.am +++ b/test/mpi/coll/Makefile.am @@ -24,6 +24,7 @@ noinst_PROGRAMS = \ allred5 \ allred6 \ allred_derived \ + allred_float \ allredmany \ alltoall1 \ alltoallv \ diff --git a/test/mpi/coll/allred_float.c b/test/mpi/coll/allred_float.c new file mode 100644 index 00000000000..4bfa1662b76 --- /dev/null +++ b/test/mpi/coll/allred_float.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) by Argonne National Laboratory + * See COPYRIGHT in top-level directory + */ + +#include "mpitest.h" +#include <stdio.h> +#include <stdlib.h> + +/* MPI_Allreduce needs to produce identical results on all ranks. This is + * particularly challenging for floating point datatypes since computer + * floating point arithmetic does not follow the associative law. 
This means + * certain algorithms that work for integers need to be excluded for + * floating point. + * + * This test checks when an inappropriate algorithm is used for floating + * point reduction. + */ + +/* single-precision float has roughly a precision of 7 decimal digits */ +#define BIG 1e6 +#define TINY 1e-2 + +#define N 8 + +float buf[N]; + +static void init_buf(int rank, int pos1, int pos2) +{ + /* Mix a pair of (BIG, -BIG) and TINY, the sum of array will be the sum of + * all TINYs if we add (BIG, -BIG) first, but different results following + * different associativity. A valid algorithm needs to produce consistent + * results on all ranks. + */ + for (int i = 0; i < N; i++) { + if (rank == pos1) { + buf[i] = BIG; + } else if (rank == pos2) { + buf[i] = -BIG; + } else { + buf[i] = TINY; + } + } +} + +int main(int argc, char **argv) +{ + int errs = 0; + + MTest_Init(&argc, &argv); + + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + if (size < 3) { + printf("At least 3 processes required. More (e.g. 
10) is recommended.\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int pos1 = 0; pos1 < size; pos1++) { + for (int pos2 = pos1 + 1; pos2 < size; pos2++) { + init_buf(rank, pos1, pos2); + + MPI_Allreduce(MPI_IN_PLACE, buf, N, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + + float *check_buf; + if (rank == 0) { + check_buf = malloc(N * size * sizeof(float)); + } + MPI_Gather(buf, N, MPI_FLOAT, check_buf, N, MPI_FLOAT, 0, MPI_COMM_WORLD); + + if (rank == 0) { + MTestPrintfMsg(1, "BIG positions = (%d, %d), result = [", pos1, pos2); + for (int j = 0; j < N; j++) { + MTestPrintfMsg(1, "%f ", buf[j]); + } + MTestPrintfMsg(1, "]\n"); + + for (int i = 0; i < size; i++) { + for (int j = 0; j < N; j++) { + if (memcmp(&check_buf[i * N + j], &buf[j], sizeof(float)) != 0) { + if (errs < 10) { + printf("(%d - %d) Result [%d] from rank %d mismatch: %f != %f\n", + pos1, pos2, j, i, check_buf[i * N + j], buf[j]); + } + errs++; + } + } + } + free(check_buf); + } + } + } + + MTest_Finalize(errs); + return MTestReturnValue(errs); +} diff --git a/test/mpi/coll/testlist.in b/test/mpi/coll/testlist.in index 091a3d448eb..09def909043 100644 --- a/test/mpi/coll/testlist.in +++ b/test/mpi/coll/testlist.in @@ -10,6 +10,7 @@ allred5 10 allred6 4 allred6 7 allred_derived 4 +allred_float 10 reduce 5 reduce 10 reduce_local 2 From ba0ee687652bbf7a2a5822adc5d0ccecdd61e7f2 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 26 May 2023 12:07:43 -0500 Subject: [PATCH 09/13] test: add allred_float to collective cvars tests --- test/mpi/maint/coll_cvars.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/mpi/maint/coll_cvars.txt b/test/mpi/maint/coll_cvars.txt index b9ee146e4ab..daca406b99a 100644 --- a/test/mpi/maint/coll_cvars.txt +++ b/test/mpi/maint/coll_cvars.txt @@ -57,6 +57,7 @@ tests: allred5 5 allred6 4 allred6 7 + allred_float 10 persistent: p_allred 7 alltoall: From 870270d43804277fb19792731c9a5c450b7a5293 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 28 May 2023 13:07:18 -0500 Subject: 
[PATCH 10/13] coll/allreduce: remove the sort in allreduce_intra_recexch The ranks should be already in order from MPII_Recexchalgo_get_neighbors. --- src/mpi/coll/allreduce/allreduce_intra_recexch.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/mpi/coll/allreduce/allreduce_intra_recexch.c b/src/mpi/coll/allreduce/allreduce_intra_recexch.c index 8c8ca62bef4..0c51f57ce13 100644 --- a/src/mpi/coll/allreduce/allreduce_intra_recexch.c +++ b/src/mpi/coll/allreduce/allreduce_intra_recexch.c @@ -194,12 +194,6 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, /* step2 */ - if (!is_commutative && in_step2 && count > 0) { - /* sort the neighbor list so that receives can be posted in order */ - for (phase = 0; phase < step2_nphases; phase++) - qsort(step2_nbrs[phase], k - 1, sizeof(int), MPII_Algo_compare_int); - } - /* step2 sends and reduces */ for (phase = 0; phase < step2_nphases && in_step2; phase++) { buf = 0; From 158f44cc9720522bdf5981da020f80957fe6da76 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 28 May 2023 13:14:46 -0500 Subject: [PATCH 11/13] coll: add MPIR_Datatype_is_float --- src/include/mpir_datatype.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/include/mpir_datatype.h b/src/include/mpir_datatype.h index d73a9b8c61c..676286e8d65 100644 --- a/src/include/mpir_datatype.h +++ b/src/include/mpir_datatype.h @@ -196,6 +196,16 @@ void MPIR_Datatype_get_flattened(MPI_Datatype type, void **flattened, int *flatt basic_type_ = MPI_DATATYPE_NULL; \ } while (0) +#define MPIR_Datatype_is_float(a, is_float) do { \ + MPI_Datatype basic_type; \ + MPIR_Datatype_get_basic_type(a, basic_type); \ + if (basic_type == MPI_FLOAT || basic_type == MPI_DOUBLE) { \ + is_float = true; \ + } else { \ + is_float = false; \ + } \ +} while (0) + #define MPIR_Datatype_get_ptr(a,ptr) MPIR_Getb_ptr(Datatype,DATATYPE,a,0x000000ff,ptr) /* Note: Probably there is some clever way to build all of these from a macro. 
From 4f5accd5104811f3d9fd5706e2bbb65cbd91a5db Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 28 May 2023 16:11:58 -0500 Subject: [PATCH 12/13] coll/allreduce: fix allreduce_intra_recexch for float If the basic datatype is a floating point, we need make sure to do the local reduction following the same associativity on all ranks, or different rank will result in non-identical results due to rounding. --- .../coll/allreduce/allreduce_intra_recexch.c | 174 +++++++----------- 1 file changed, 71 insertions(+), 103 deletions(-) diff --git a/src/mpi/coll/allreduce/allreduce_intra_recexch.c b/src/mpi/coll/allreduce/allreduce_intra_recexch.c index 0c51f57ce13..c4e50118f78 100644 --- a/src/mpi/coll/allreduce/allreduce_intra_recexch.c +++ b/src/mpi/coll/allreduce/allreduce_intra_recexch.c @@ -9,6 +9,9 @@ #include "recexchalgo.h" #include "algo_common.h" +static int find_myidx(int *nbrs, int k, int rank); +static int do_reduce(void **bufs, void *recvbuf, int n, int idx, + MPI_Aint count, MPI_Datatype datatype, MPI_Op op); int MPIR_Allreduce_intra_recexch(const void *sendbuf, void *recvbuf, @@ -35,6 +38,9 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, nranks = comm->local_size; is_commutative = MPIR_Op_is_commutative(op); + bool is_float; + MPIR_Datatype_is_float(datatype, is_float); + /* if there is only 1 rank, copy data from sendbuf * to recvbuf and exit */ if (nranks == 1) { @@ -217,7 +223,6 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, } send_nreq = 0; - myidx = 0; /* send data to all the neighbors */ for (i = 0; i < k - 1; i++) { nbr = step2_nbrs[phase][i]; @@ -232,7 +237,6 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } if (rank > nbr) { - myidx = i + 1; } } @@ -240,64 +244,30 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIR_ERR_POP(mpi_errno); - buf = myidx - 1; + mpi_errno = MPIC_Waitall((k - 1), recv_reqs, MPI_STATUSES_IGNORE, 
errflag); if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIR_ERR_POP(mpi_errno); - for (i = myidx - 1; i >= 0 && count > 0; i--, buf--) { - mpi_errno = MPIR_Reduce_local(nbr_buffer[buf], recvbuf, count, datatype, op); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } + if (is_commutative && !is_float) { + myidx = k - 1; + } else { + myidx = find_myidx(step2_nbrs[phase], k, rank); } - - buf = myidx; - for (i = myidx; i < k - 1 && count > 0; i++, buf++) { - if (is_commutative) { - mpi_errno = MPIR_Reduce_local(nbr_buffer[buf], recvbuf, count, datatype, op); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } - } else { - mpi_errno = MPIR_Reduce_local(recvbuf, nbr_buffer[buf], count, datatype, op); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } - - mpi_errno = - MPIR_Localcopy(nbr_buffer[buf], count, datatype, recvbuf, count, datatype); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } - } + mpi_errno = do_reduce(nbr_buffer, recvbuf, k, myidx, count, datatype, op); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } if (single_phase_recv == false) { /* post sends and do reduction for the 2nd phase */ phase++; if (phase < step2_nphases) { send_nreq = 0; - myidx = 0; /* send data to all the neighbors */ for (i = 0; i < k - 1; i++) { nbr = step2_nbrs[phase][i]; @@ -313,9 +283,6 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } - if (rank > nbr) { - myidx = i + 1; - } } mpi_errno = MPIC_Waitall(send_nreq, send_reqs, MPI_STATUSES_IGNORE, errflag); @@ -327,58 +294,19 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIR_ERR_POP(mpi_errno); - buf = (k - 1) + myidx - 1; - for (i = myidx - 1; i >= 0 && count > 0; i--, buf--) { - mpi_errno = MPIR_Reduce_local(nbr_buffer[buf], recvbuf, count, datatype, op); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } + if (is_commutative && !is_float) { + myidx = k - 1; + } else { + myidx = find_myidx(step2_nbrs[phase], k, rank); } - - buf = (k - 1) + myidx; - for (i = myidx; i < k - 1 && count > 0; i++, buf++) { - if (is_commutative) { - mpi_errno = - MPIR_Reduce_local(nbr_buffer[buf], recvbuf, count, datatype, op); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : - MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } - } else { - mpi_errno = - MPIR_Reduce_local(recvbuf, nbr_buffer[buf], count, datatype, op); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : - MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } - mpi_errno = - MPIR_Localcopy(nbr_buffer[buf], count, datatype, recvbuf, count, - datatype); - if (mpi_errno) { - /* for communication errors, just record the error but continue */ - *errflag = - MPIX_ERR_PROC_FAILED == - MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : - MPIR_ERR_OTHER; - MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); - MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); - } - } + mpi_errno = do_reduce(nbr_buffer + k - 1, recvbuf, k, myidx, count, datatype, op); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } } @@ -452,3 +380,43 @@ int MPIR_Allreduce_intra_recexch(const void *sendbuf, fn_fail: goto fn_exit; } + +static int find_myidx(int *nbrs, int k, int rank) +{ + for (int i = 0; i < k - 1; i++) { + if (nbrs[i] > rank) { + return i; + } + } + return k - 1; +} + +static int do_reduce(void **bufs, void *recvbuf, int k, int idx, + MPI_Aint count, MPI_Datatype datatype, MPI_Op op) +{ + int mpi_errno = MPI_SUCCESS; + + for (int i = 0; i < idx - 1; i++) { + mpi_errno = MPIR_Reduce_local(bufs[i], bufs[i + 1], count, datatype, op); + MPIR_ERR_CHECK(mpi_errno); + } + if (idx > 0) { + mpi_errno = MPIR_Reduce_local(bufs[idx - 1], recvbuf, count, datatype, op); + MPIR_ERR_CHECK(mpi_errno); + } + if (idx < k - 1) { + mpi_errno = MPIR_Reduce_local(recvbuf, bufs[idx], count, datatype, op); + MPIR_ERR_CHECK(mpi_errno); + + for (int i = idx; i < k - 2; i++) { + mpi_errno = MPIR_Reduce_local(bufs[i], bufs[i + 1], count, datatype, op); + MPIR_ERR_CHECK(mpi_errno); + } + + mpi_errno = MPIR_Localcopy(bufs[k - 2], count, datatype, recvbuf, count, datatype); + MPIR_ERR_CHECK(mpi_errno); + } + + fn_fail: + return mpi_errno; +} From e802e44e40c82c61e70588b423f0e681abee6409 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Mon, 5 Jun 2023 14:48:06 -0500 Subject: [PATCH 13/13] Update CHANGES again for 4.1.2 --- CHANGES | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 17cf756609e..448d9f00d13 100644 --- a/CHANGES +++ b/CHANGES @@ -9,10 +9,17 @@ # Fix compiler wrapper scripts to be compatible with CUDA memory hooks -# Fix MPI_WAITALL_ENQUEUE to make a copy of the input request array +# Fix MPIX_WAITALL_ENQUEUE to make a copy of the input request array + +# Fix bug in MPI_ALLREDUCE that could result in ranks receiving + different floating point values # Fix potential deadlock when progressing RMA windows +# Fix 
potential crash in MPI_REDUCE with non-zero root and MPI_IN_PLACE + +# Fix potential crash during probe with libfabric CXI provider + # Fix MPI_PARRIVED when the partitioned request is inactive # Fix potential bug when an attribute delete callback deletes another @@ -20,6 +27,8 @@ # Fix build issue in ROMIO Lustre driver +# Improve Fortran 2008 binding support detection during configure + # Report an error if collective tuning json file fails to open # Several fixes for testsuite programs and build configuration