Skip to content

Commit

Permalink
prov/rxm: Add option to auto detect hmem iface of user buffers
Browse files Browse the repository at this point in the history
A new option FI_OFI_RXM_DETECT_HMEM_IFACE is added. With the option
turned on, RxM tries to detect the HMEM iface of user buffers if
no mr_desc is supplied. This allows data to be copied successfully
between user buffers located in device memory and internal bounce
buffers, and allows such user buffers to be registered in the RxM
Rendezvous protocol.

Note that this doesn't allow such buffers to be used in the RxM
direct send mode if the core provider requires FI_MR_HMEM because
no memory registration is introduced by the option.

Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
  • Loading branch information
j-xiong committed Mar 13, 2024
1 parent feae05f commit d553031
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 23 deletions.
6 changes: 6 additions & 0 deletions man/fi_rxm.7.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,12 @@ with (default: 256).
consecutively read across progress calls without checking to see if the
CM progress interval has been reached (default: 128)

*FI_OFI_RXM_DETECT_HMEM_IFACE*
: Set this to 1 to allow automatic detection of the HMEM iface of user buffers
when such information is not supplied. This feature allows such buffers to be
copied or registered (e.g. in Rendezvous) internally by RxM. Note that no
extra memory registration is performed with this option. (default: false)

# Tuning

## Bandwidth
Expand Down
18 changes: 15 additions & 3 deletions prov/rxm/src/rxm.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ extern size_t rxm_cq_eq_fairness;
extern int rxm_passthru;
extern int force_auto_progress;
extern int rxm_use_write_rndv;
extern int rxm_detect_hmem_iface;
extern enum fi_wait_obj def_wait_obj, def_tcp_wait_obj;

struct rxm_ep;
Expand Down Expand Up @@ -309,11 +310,22 @@ struct rxm_mr {
};

static inline enum fi_hmem_iface
rxm_mr_desc_to_hmem_iface_dev(void **desc, size_t count, uint64_t *device)
rxm_iov_desc_to_hmem_iface_dev(const struct iovec *iov, void **desc,
size_t count, uint64_t *device)
{
if (!count || !desc || !desc[0]) {
enum fi_hmem_iface iface = FI_HMEM_SYSTEM;

if (!count) {
*device = 0;
return FI_HMEM_SYSTEM;
return iface;
}

if (!desc || !desc[0]) {
if (rxm_detect_hmem_iface)
iface = ofi_get_hmem_iface(iov[0].iov_base, device, NULL);
else
*device = 0;
return iface;
}

*device = ((struct rxm_mr *) desc[0])->device;
Expand Down
14 changes: 8 additions & 6 deletions prov/rxm/src/rxm_atomic.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
datatype_sz);
buf_len = ofi_total_iov_len(buf_iov, msg->iov_count);

buf_iface = rxm_mr_desc_to_hmem_iface_dev(msg->desc,
msg->iov_count,
&buf_device);
buf_iface = rxm_iov_desc_to_hmem_iface_dev(buf_iov,
msg->desc,
msg->iov_count,
&buf_device);
}

if (op == ofi_op_atomic_compare) {
Expand All @@ -136,9 +137,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count);
assert(buf_len == cmp_len);

cmp_iface = rxm_mr_desc_to_hmem_iface_dev(compare_desc,
compare_iov_count,
&cmp_device);
cmp_iface = rxm_iov_desc_to_hmem_iface_dev(cmp_iov,
compare_desc,
compare_iov_count,
&cmp_device);
}

data_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr);
Expand Down
22 changes: 13 additions & 9 deletions prov/rxm/src/rxm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -378,9 +378,10 @@ static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
uint64_t device;
ssize_t done_len;

iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc,
rx_buf->recv_entry->rxm_iov.count,
&device);
iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov,
rx_buf->recv_entry->rxm_iov.desc,
rx_buf->recv_entry->rxm_iov.count,
&device);

done_len = ofi_copy_to_hmem_iov(iface, device,
rx_buf->recv_entry->rxm_iov.iov,
Expand Down Expand Up @@ -629,6 +630,7 @@ void rxm_handle_eager(struct rxm_rx_buf *rx_buf)
rx_buf->recv_entry->rxm_iov.desc, rx_buf->data,
rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov,
rx_buf->recv_entry->rxm_iov.count, 0);

assert((size_t) done_len == rx_buf->pkt.hdr.size);

rxm_finish_recv(rx_buf, done_len);
Expand All @@ -640,9 +642,10 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf)
uint64_t device;
ssize_t done_len;

iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc,
rx_buf->recv_entry->rxm_iov.count,
&device);
iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov,
rx_buf->recv_entry->rxm_iov.desc,
rx_buf->recv_entry->rxm_iov.count,
&device);

done_len = ofi_copy_to_hmem_iov(iface, device,
rx_buf->recv_entry->rxm_iov.iov,
Expand Down Expand Up @@ -1247,9 +1250,10 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
" msg_id: 0x%" PRIx64 "\n", rx_buf->pkt.hdr.op,
rx_buf->pkt.ctrl_hdr.msg_id);

iface = rxm_mr_desc_to_hmem_iface_dev(tx_buf->atomic_result.desc,
tx_buf->atomic_result.count,
&device);
iface = rxm_iov_desc_to_hmem_iface_dev(tx_buf->atomic_result.iov,
tx_buf->atomic_result.desc,
tx_buf->atomic_result.count,
&device);

assert(!(rx_buf->comp_flags & ~(FI_RECV | FI_REMOTE_CQ_DATA)));

Expand Down
18 changes: 15 additions & 3 deletions prov/rxm/src/rxm_domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -474,12 +474,24 @@ int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf,
size_t len, uint64_t acs, uint64_t flags, struct fid_mr **mr)
{
int ret, tries = 0;
struct iovec iov = {
.iov_base = (void *)buf,
.iov_len = len,
};
struct fi_mr_attr attr = {
.mr_iov = &iov,
.iov_count = 1,
.access = acs,
.iface = FI_HMEM_SYSTEM,
};

if (rxm_detect_hmem_iface)
attr.iface = ofi_get_hmem_iface(buf, &attr.device.reserved, NULL);

/* If we can't get a key within 1024 tries, give up */
do {
ret = fi_mr_reg(rxm_domain->msg_domain, buf, len, acs, 0,
rxm_domain->mr_key++ | (1UL << 31),
flags, mr, NULL);
attr.requested_key = rxm_domain->mr_key++ | (1UL << 31);
ret = fi_mr_regattr(rxm_domain->msg_domain, &attr, flags, mr);
} while (ret == -FI_ENOKEY && tries++ < 1024);

return ret;
Expand Down
8 changes: 8 additions & 0 deletions prov/rxm/src/rxm_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ size_t rxm_packet_size;
int rxm_passthru = 0; /* disable by default, need to analyze performance */
int force_auto_progress;
int rxm_use_write_rndv;
int rxm_detect_hmem_iface;
enum fi_wait_obj def_wait_obj = FI_WAIT_FD, def_tcp_wait_obj = FI_WAIT_UNSPEC;

char *rxm_proto_state_str[] = {
Expand Down Expand Up @@ -700,6 +701,11 @@ RXM_INI
"to the tcp provider, depending on the capabilities "
"requested by the application.");

fi_param_define(&rxm_prov, "detect_hmem_iface", FI_PARAM_BOOL,
"Detect iface for user buffers with NULL desc passed "
"in. This allows such buffers be copied or registered "
"internally by RxM. (default: false).");

/* passthru supported disabled - to re-enable would need to fix call to
* fi_cq_read to pass in the correct data structure. However, passthru
* will not be needed at all with in-work tcp changes.
Expand All @@ -725,6 +731,8 @@ RXM_INI
"(FI_OFI_RXM_DATA_AUTO_PROGRESS = 1), domain threading "
"level would be set to FI_THREAD_SAFE\n");

fi_param_get_bool(&rxm_prov, "detect_hmem_iface", &rxm_detect_hmem_iface);

#if HAVE_RXM_DL
ofi_mem_init();
ofi_hmem_init();
Expand Down
4 changes: 2 additions & 2 deletions prov/rxm/src/rxm_msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ rxm_send_sar(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
ssize_t ret;

assert(segs_cnt >= 2);
iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
iface = rxm_iov_desc_to_hmem_iface_dev(iov, desc, count, &device);

first_tx_buf = rxm_init_segment(rxm_ep, rxm_conn, context,
data_len, rxm_buffer_size,
Expand Down Expand Up @@ -709,7 +709,7 @@ rxm_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
(data_len > rxm_ep->rxm_info->tx_attr->inject_size)) ||
(data_len <= rxm_ep->rxm_info->tx_attr->inject_size));

iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
iface = rxm_iov_desc_to_hmem_iface_dev(iov, desc, count, &device);
if (iface == FI_HMEM_ZE || iface == FI_HMEM_SYNAPSEAI)
goto rndv_send;

Expand Down

0 comments on commit d553031

Please sign in to comment.