Skip to content

Commit

Permalink
ch4: deprecate MPIDI_OFI_MAX_NICS restriction
Browse files Browse the repository at this point in the history
This patch removes the MPIDI_OFI_MAX_NICS variable which restricts the
max number of NICs that can be used by MPICH.

This change requires changing usages of MPIDI_OFI_MAX_NICS and
converting statically allocated arrays to dynamic arrays. As a result,
malloc and free procedures are added to the init and finalize hooks.

This patch retains the current behavior of MPIR_CVAR_CH4_OFI_MAX_NICS,
which by default uses all NICs available from the OFI provider; the
application can restrict the NIC count by setting this CVAR.

Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
  • Loading branch information
wenduwan committed Oct 9, 2023
1 parent a675f64 commit b4fbfcf
Show file tree
Hide file tree
Showing 11 changed files with 243 additions and 148 deletions.
15 changes: 7 additions & 8 deletions doc/mpich/tuning_parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,7 @@ disabled, the application cannot use hashing on any communicator.
`MPIR_CVAR_CH4_OFI_MAX_NICS`: This CVAR determines the number of physical NICs to use. The default
is -1 which means utilizing all available NICs. A value strictly less than -1 or equal to 0 will be
mapped to using one NIC which is the first provider in the list of providers returned by
`fi_getinfo()`. There is an upper bound for this value: the compile-time constant,
`MPIDI_OFI_MAX_NICS`.
`fi_getinfo()`.

`MPIR_CVAR_DEBUG_SUMMARY`: Prints out lots of debug information at initialization time to help find
problems with OFI provider selection.
Expand Down Expand Up @@ -212,8 +211,8 @@ The new PVARs to track the amount of bytes sent and received are shown Table V.
+ ================================ + ================ + ================= + ==== + ======= +
| Variable Name | Class | Handle | Type | Storage |
+ ================================ + ================ + ================= + ==== + ======= +
| nic_sent_bytes_count | Counter (Array) | Non-continuous | SUM | Static |
| nic_recvd_bytes_count | Counter (Array) | Non-continuous | SUM | Static |
| nic_sent_bytes_count | Counter (Array) | Non-continuous | SUM | Dynamic |
| nic_recvd_bytes_count | Counter (Array) | Non-continuous | SUM | Dynamic |
+ -------------------------------- + ---------------- + ------------------------ + ------- +
```

Expand All @@ -228,8 +227,8 @@ Hence, the sum of the two PVARS in Table VI represent the total amount of data t
+ ================================ + ================ + ================= + ==== + ======= +
| Variable Name | Class | Handle | Type | Storage |
+ ================================ + ================ + ================= + ==== + ======= +
| striped_nic_sent_bytes_count | Counter (Array) | Non-continuous | SUM | Static |
| striped_nic_recvd_bytes_count | Counter (Array) | Non-continuous | SUM | Static |
| striped_nic_sent_bytes_count | Counter (Array) | Non-continuous | SUM | Dynamic |
| striped_nic_recvd_bytes_count | Counter (Array) | Non-continuous | SUM | Dynamic |
+ -------------------------------- + ---------------- + ------------------------ + ------- +
```

Expand All @@ -241,8 +240,8 @@ The new PVARs to track number of bytes sent and received through RMA calls are s
+ ================================ + ================ + ================= + ==== + ======= +
| Variable Name | Class | Handle | Type | Storage |
+ ================================ + ================ + ================= + ==== + ======= +
| rma_pref_phy_nic_put_bytes_count | Counter (Array) | Non-continuous | SUM | Static |
| rma_pref_phy_nic_get_bytes_count | Counter (Array) | Non-continuous | SUM | Static |
| rma_pref_phy_nic_put_bytes_count | Counter (Array) | Non-continuous | SUM | Dynamic |
| rma_pref_phy_nic_get_bytes_count | Counter (Array) | Non-continuous | SUM | Dynamic |
+ -------------------------------- + ---------------- + ------------------------ + ------- +
```

Expand Down
3 changes: 3 additions & 0 deletions src/mpid/ch4/ch4_api.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#

Non Native API:
init_avt: int
NM : table
init_local: int
NM : tag_bits
SHM : tag_bits
Expand Down Expand Up @@ -520,6 +522,7 @@ PARAM:
src_vci: int
sreq: MPIR_Request *
status: MPI_Status *
table: MPIDI_av_table_t *
tag: int
tag_bits: int *
target: int
Expand Down
16 changes: 6 additions & 10 deletions src/mpid/ch4/netmod/ofi/globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,12 @@
#include "ofi_impl.h"
MPIDI_OFI_global_t MPIDI_OFI_global;

unsigned long long PVAR_COUNTER_nic_sent_bytes_count[MPIDI_OFI_MAX_NICS] ATTRIBUTE((unused));
unsigned long long PVAR_COUNTER_nic_recvd_bytes_count[MPIDI_OFI_MAX_NICS] ATTRIBUTE((unused));
unsigned long long PVAR_COUNTER_striped_nic_sent_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
unsigned long long PVAR_COUNTER_striped_nic_recvd_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
unsigned long long PVAR_COUNTER_rma_pref_phy_nic_put_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
unsigned long long PVAR_COUNTER_rma_pref_phy_nic_get_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
unsigned long long *PVAR_COUNTER_nic_sent_bytes_count = NULL;
unsigned long long *PVAR_COUNTER_nic_recvd_bytes_count = NULL;
unsigned long long *PVAR_COUNTER_striped_nic_sent_bytes_count = NULL;
unsigned long long *PVAR_COUNTER_striped_nic_recvd_bytes_count = NULL;
unsigned long long *PVAR_COUNTER_rma_pref_phy_nic_put_bytes_count = NULL;
unsigned long long *PVAR_COUNTER_rma_pref_phy_nic_get_bytes_count = NULL;

MPIDI_OFI_capabilities_t MPIDI_OFI_caps_list[MPIDI_OFI_NUM_SETS] =
/* Initialize a runtime version of all of the capability sets defined in
Expand Down
17 changes: 6 additions & 11 deletions src/mpid/ch4/netmod/ofi/ofi_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,12 @@
#include "mpidch4r.h"
#include "ch4_impl.h"

extern unsigned long long PVAR_COUNTER_nic_sent_bytes_count[MPIDI_OFI_MAX_NICS] ATTRIBUTE((unused));
extern unsigned long long PVAR_COUNTER_nic_recvd_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
extern unsigned long long PVAR_COUNTER_striped_nic_sent_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
extern unsigned long long PVAR_COUNTER_striped_nic_recvd_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
extern unsigned long long PVAR_COUNTER_rma_pref_phy_nic_put_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
extern unsigned long long PVAR_COUNTER_rma_pref_phy_nic_get_bytes_count[MPIDI_OFI_MAX_NICS]
ATTRIBUTE((unused));
extern unsigned long long *PVAR_COUNTER_nic_sent_bytes_count;
extern unsigned long long *PVAR_COUNTER_nic_recvd_bytes_count;
extern unsigned long long *PVAR_COUNTER_striped_nic_sent_bytes_count;
extern unsigned long long *PVAR_COUNTER_striped_nic_recvd_bytes_count;
extern unsigned long long *PVAR_COUNTER_rma_pref_phy_nic_put_bytes_count;
extern unsigned long long *PVAR_COUNTER_rma_pref_phy_nic_get_bytes_count;


#define MPIDI_OFI_ENAVAIL -1 /* OFI resource not available */
Expand Down
204 changes: 147 additions & 57 deletions src/mpid/ch4/netmod/ofi/ofi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,70 +506,109 @@ static void dump_global_settings(void);
static void dump_dynamic_settings(void);
static int create_vci_context(int vci, int nic);
static int destroy_vci_context(int vci, int nic);
static int ofi_pvar_init(void);
static int ofi_pvar_init(int num_nics);

static int ofi_am_init(int vci);
static int ofi_am_post_recv(int vci, int nic);

static void *host_alloc(uintptr_t size);
static void host_free(void *ptr);

static int ofi_pvar_init(void)
static int ofi_pvar_init(int num_nics)
{
int mpi_errno = MPI_SUCCESS;
MPIR_T_PVAR_COUNTER_ARRAY_REGISTER_STATIC(MULTINIC,
MPI_UNSIGNED_LONG_LONG,
nic_sent_bytes_count,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM), "CH4",
"number of bytes sent through a particular NIC");

MPIR_T_PVAR_COUNTER_ARRAY_REGISTER_STATIC(MULTINIC,
MPI_UNSIGNED_LONG_LONG,
nic_recvd_bytes_count,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM), "CH4",
"number of bytes received through a particular NIC");

MPIR_T_PVAR_COUNTER_ARRAY_REGISTER_STATIC(MULTINIC,
MPI_UNSIGNED_LONG_LONG,
striped_nic_sent_bytes_count,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM), "CH4",
"number of striped bytes sent through a particular NIC");

MPIR_T_PVAR_COUNTER_ARRAY_REGISTER_STATIC(MULTINIC,
MPI_UNSIGNED_LONG_LONG,
striped_nic_recvd_bytes_count,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM), "CH4",
"number of striped bytes received through a particular NIC");

MPIR_T_PVAR_COUNTER_ARRAY_REGISTER_STATIC(MULTINIC,
MPI_UNSIGNED_LONG_LONG,
rma_pref_phy_nic_put_bytes_count,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM), "CH4",
"number of bytes sent through preferred physical NIC using RMA");

MPIR_T_PVAR_COUNTER_ARRAY_REGISTER_STATIC(MULTINIC,
MPI_UNSIGNED_LONG_LONG,
rma_pref_phy_nic_get_bytes_count,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM), "CH4",
"number of bytes received through preferred physical NIC using RMA");
assert(num_nics > 0);

MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
MULTINIC,
MPI_UNSIGNED_LONG_LONG,
nic_sent_bytes_count,
PVAR_COUNTER_nic_sent_bytes_count,
num_nics,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM),
NULL,
NULL,
"CH4",
"number of bytes sent through a particular NIC");

MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
MULTINIC,
MPI_UNSIGNED_LONG_LONG,
nic_recvd_bytes_count,
PVAR_COUNTER_nic_recvd_bytes_count,
num_nics,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM),
NULL,
NULL,
"CH4",
"number of bytes received through a particular NIC");

MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
MULTINIC,
MPI_UNSIGNED_LONG_LONG,
striped_nic_sent_bytes_count,
PVAR_COUNTER_striped_nic_sent_bytes_count,
num_nics,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM),
NULL,
NULL,
"CH4",
"number of striped bytes sent through a particular NIC");

MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
MULTINIC,
MPI_UNSIGNED_LONG_LONG,
striped_nic_recvd_bytes_count,
PVAR_COUNTER_striped_nic_recvd_bytes_count,
num_nics,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM),
NULL,
NULL,
"CH4",
"number of striped bytes received through a particular NIC");

MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
MULTINIC,
MPI_UNSIGNED_LONG_LONG,
rma_pref_phy_nic_put_bytes_count,
PVAR_COUNTER_rma_pref_phy_nic_put_bytes_count,
num_nics,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM),
NULL,
NULL,
"CH4",
"number of bytes sent through preferred physical NIC using RMA");

MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
MULTINIC,
MPI_UNSIGNED_LONG_LONG,
rma_pref_phy_nic_get_bytes_count,
PVAR_COUNTER_rma_pref_phy_nic_get_bytes_count,
num_nics,
MPI_T_VERBOSITY_USER_DETAIL,
MPI_T_BIND_NO_OBJECT,
(MPIR_T_PVAR_FLAG_READONLY |
MPIR_T_PVAR_FLAG_SUM),
NULL,
NULL,
"CH4",
"number of bytes received through preferred physical NIC using RMA");

return mpi_errno;
}

Expand Down Expand Up @@ -600,6 +639,46 @@ static void set_sep_counters(int nic)
}
}

/*
 * Allocate the per-entry OFI destination address storage of an AV table.
 *
 * For every entry in `table`, netmod.ofi.dest is set to an array of
 * MPIDI_OFI_global.num_nics row pointers, and each row is an array of
 * MPIDI_CH4_MAX_VCIS fi_addr_t slots. The slots themselves are left
 * uninitialized.
 *
 * MPIDI_OFI_global.num_nics must already be set (> 0) when this is called.
 *
 * Returns MPI_SUCCESS, or an MPI_ERR_OTHER ("**nomem") error code if an
 * allocation fails. On failure everything allocated by this call is released
 * and every entry's dest pointer is left NULL, so a later teardown pass that
 * checks for NULL does not touch stale pointers.
 */
int MPIDI_OFI_init_avt(MPIDI_av_table_t *table)
{
    int mpi_errno = MPI_SUCCESS;
    const int num_nics = MPIDI_OFI_global.num_nics;

    MPIR_Assert(num_nics > 0);

    /* Start from a known state: an entry whose dest is NULL owns nothing.
     * NOTE(review): assumes this is only called on tables whose dest pointers
     * have not been allocated yet (the original overwrote them as well). */
    for (int i = 0; i < table->size; ++i) {
        table->table[i].netmod.ofi.dest = NULL;
    }

    for (int i = 0; i < table->size; ++i) {
        /* one row pointer per NIC */
        table->table[i].netmod.ofi.dest =
            MPL_malloc(sizeof(fi_addr_t *) * num_nics, MPL_MEM_ADDRESS);
        MPIR_ERR_CHKANDJUMP(!table->table[i].netmod.ofi.dest, mpi_errno, MPI_ERR_OTHER,
                            "**nomem");

        /* MPL_malloc does not clear memory; NULL the rows before any of them
         * can fail so the rollback below never frees garbage. */
        for (int j = 0; j < num_nics; ++j) {
            table->table[i].netmod.ofi.dest[j] = NULL;
        }

        for (int j = 0; j < num_nics; ++j) {
            /* each row stores fi_addr_t values, not pointers */
            table->table[i].netmod.ofi.dest[j] =
                MPL_malloc(sizeof(fi_addr_t) * MPIDI_CH4_MAX_VCIS, MPL_MEM_ADDRESS);
            MPIR_ERR_CHKANDJUMP(!table->table[i].netmod.ofi.dest[j], mpi_errno, MPI_ERR_OTHER,
                                "**nomem");
        }
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    /* Roll back: release every row and row array allocated so far. */
    for (int i = 0; i < table->size; ++i) {
        if (table->table[i].netmod.ofi.dest) {
            for (int j = 0; j < num_nics; ++j) {
                if (table->table[i].netmod.ofi.dest[j]) {
                    MPL_free(table->table[i].netmod.ofi.dest[j]);
                }
            }
            MPL_free(table->table[i].netmod.ofi.dest);
            table->table[i].netmod.ofi.dest = NULL;
        }
    }
    goto fn_exit;
}

/*
 * Release the per-entry OFI destination address storage of one AV table.
 *
 * For every entry whose netmod.ofi.dest is non-NULL, frees each non-NULL row
 * (MPIDI_OFI_global.num_nics rows are examined) and then the row array
 * itself. The dest pointer is reset to NULL afterwards so that calling this
 * again on the same table is harmless instead of a double free.
 *
 * A NULL `table` is ignored.
 * NOTE(review): the only visible caller passes av_tables[i] slots directly;
 * confirm whether those slots can be NULL for released tables.
 *
 * MPIDI_OFI_global.num_nics must still hold the value that was in effect
 * when the rows were allocated.
 */
static void destroy_avt(MPIDI_av_table_t *table)
{
    if (!table) {
        return;
    }

    for (int i = 0; i < table->size; ++i) {
        if (!table->table[i].netmod.ofi.dest) {
            continue;
        }

        for (int j = 0; j < MPIDI_OFI_global.num_nics; ++j) {
            if (table->table[i].netmod.ofi.dest[j]) {
                MPL_free(table->table[i].netmod.ofi.dest[j]);
            }
        }

        MPL_free(table->table[i].netmod.ofi.dest);
        /* do not leave a dangling pointer behind */
        table->table[i].netmod.ofi.dest = NULL;
    }
}

/*
 * Run destroy_avt() over every slot of MPIDI_global.avt_mgr.av_tables, from
 * index 0 up to (but not including) MPIDIU_get_n_avts().
 */
static void destroy_global_avts(void)
{
    int avtid = 0;

    /* the bound is re-read on every pass, exactly as a for-loop condition would */
    while (avtid < MPIDIU_get_n_avts()) {
        destroy_avt(MPIDI_global.avt_mgr.av_tables[avtid]);
        ++avtid;
    }
}

int MPIDI_OFI_init_local(int *tag_bits)
{
int mpi_errno = MPI_SUCCESS;
Expand Down Expand Up @@ -642,7 +721,6 @@ int MPIDI_OFI_init_local(int *tag_bits)
MPIDI_OFI_global.num_comms_enabled_striping = 0;
MPIDI_OFI_global.num_comms_enabled_hashing = 0;

mpi_errno = ofi_pvar_init();
MPIR_ERR_CHECK(mpi_errno);

/* -------------------------------- */
Expand All @@ -658,17 +736,23 @@ int MPIDI_OFI_init_local(int *tag_bits)
mpi_errno = MPIDI_OFI_find_provider(&prov);
MPIR_ERR_CHECK(mpi_errno);

/* init multi-nic and populates MPIDI_OFI_global.prov_use[] */
/* init multi-nic and populates MPIDI_OFI_global */
mpi_errno = MPIDI_OFI_init_multi_nic(prov);
MPIR_ERR_CHECK(mpi_errno);

/* init ofi address vectors in global av table0 */
mpi_errno = MPIDI_OFI_init_avt(MPIDI_global.avt_mgr.av_table0);
MPIR_ERR_CHECK(mpi_errno);

mpi_errno = update_global_limits(MPIDI_OFI_global.prov_use[0]);
MPIR_ERR_CHECK(mpi_errno);

if (MPIR_CVAR_DEBUG_SUMMARY && MPIR_Process.rank == 0) {
dump_global_settings();
}

mpi_errno = ofi_pvar_init(MPIDI_OFI_global.num_nics);

/* Finally open the fabric */
MPIDI_OFI_CALL(fi_fabric(MPIDI_OFI_global.prov_use[0]->fabric_attr,
&MPIDI_OFI_global.fabric, NULL), fabric);
Expand Down Expand Up @@ -1030,6 +1114,8 @@ int MPIDI_OFI_mpi_finalize_hook(void)
fi_freeinfo(MPIDI_OFI_global.prov_use[i]);
}

destroy_global_avts();

MPIDIU_map_destroy(MPIDI_OFI_global.win_map);

if (MPIDI_OFI_ENABLE_AM) {
Expand All @@ -1055,6 +1141,10 @@ int MPIDI_OFI_mpi_finalize_hook(void)
}
}

MPL_free(MPIDI_OFI_global.ctx);
MPL_free(MPIDI_OFI_global.nic_info);
MPL_free(MPIDI_OFI_global.prov_use);

int err;
MPID_Thread_mutex_destroy(&MPIDI_OFI_THREAD_UTIL_MUTEX, &err);
MPIR_Assert(err == 0);
Expand Down
2 changes: 1 addition & 1 deletion src/mpid/ch4/netmod/ofi/ofi_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ int MPIDI_OFI_match_provider(struct fi_info *prov,
void MPIDI_OFI_update_global_settings(struct fi_info *prov);

/* Determine if NIC has already been included in others */
bool MPIDI_OFI_nic_already_used(const struct fi_info *prov, struct fi_info **others, int nic_count);
bool MPIDI_OFI_nic_already_used(const struct fi_info *prov, struct fi_info *others);

int MPIDI_OFI_addr_exchange_root_ctx(void);
int MPIDI_OFI_addr_exchange_all_ctx(void);
Expand Down

0 comments on commit b4fbfcf

Please sign in to comment.