Skip to content

Commit

Permalink
prov/efa: Implement FI_OPT_SHARED_MEMORY_PERMITTED
Browse files Browse the repository at this point in the history
This commit implements FI_OPT_SHARED_MEMORY_PERMITTED for the EFA
provider. All SHM provider resources are disabled when
FI_OPT_SHARED_MEMORY_PERMITTED is set to false.

Signed-off-by: Sai Sunku <sunkusa@amazon.com>
  • Loading branch information
sunkuamzn authored and shijin-aws committed Jan 23, 2024
1 parent d4d0433 commit 62be5f1
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 2 deletions.
6 changes: 6 additions & 0 deletions man/fi_efa.7.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,12 @@ These OFI runtime parameters apply only to the RDM endpoint.
[`ptrace protection`](https://wiki.ubuntu.com/SecurityTeam/Roadmap/KernelHardening#ptrace_Protection)
is turned on. You can turn it off to enable shm transfer.

FI_EFA_ENABLE_SHM_TRANSFER is parsed during the fi_domain call and interacts with the FI_OPT_SHARED_MEMORY_PERMITTED endpoint option.
If FI_EFA_ENABLE_SHM_TRANSFER is set to true, the FI_OPT_SHARED_MEMORY_PERMITTED endpoint
option overrides it: setting the option to false disables shm transfer for that endpoint.
If FI_EFA_ENABLE_SHM_TRANSFER is set to false and FI_OPT_SHARED_MEMORY_PERMITTED is set to true,
the fi_setopt call for FI_OPT_SHARED_MEMORY_PERMITTED fails with -FI_EINVAL.

*FI_EFA_SHM_AV_SIZE*
: Defines the maximum number of entries in SHM provider's address vector.

Expand Down
1 change: 1 addition & 0 deletions prov/efa/src/rdm/efa_rdm_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ struct efa_rdm_ep {
int hmem_p2p_opt; /* what to do for hmem transfers */
struct fid_ep *peer_srx_ep; /* support sharing receive context with peer providers */
bool cuda_api_permitted; /**< whether end point is permitted to call CUDA API */
bool shm_permitted; /* Whether the endpoint is allowed to use shared memory for intra-node communication */

/* use_device_rdma:
Can be set via fi_setopt in API >= 1.18.
Expand Down
42 changes: 40 additions & 2 deletions prov/efa/src/rdm/efa_rdm_ep_fiops.c
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,
efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size;
efa_rdm_ep->efa_device_iov_limit = efa_domain->device->rdm_info->tx_attr->iov_limit;
efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version);
efa_rdm_ep->shm_permitted = true;

cq_attr.size = MAX(efa_rdm_ep->rx_size + efa_rdm_ep->tx_size,
efa_env.cq_size);
Expand Down Expand Up @@ -999,9 +1000,10 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep)
* AWS Neuron and Habana Synapse, have no SHM provider
* support anyways, so disabling SHM will not impact them.
*/
if ((ep->user_info->caps & FI_HMEM)
if (((ep->user_info->caps & FI_HMEM)
&& hmem_ops[FI_HMEM_CUDA].initialized
&& !ep->cuda_api_permitted) {
&& !ep->cuda_api_permitted)
|| !ep->shm_permitted) {
use_shm = false;
}

Expand Down Expand Up @@ -1175,6 +1177,35 @@ static int efa_rdm_ep_set_cuda_api_permitted(struct efa_rdm_ep *ep, bool cuda_ap
return 0;
}

/**
 * @brief record the FI_OPT_SHARED_MEMORY_PERMITTED value handed to efa_rdm_ep_setopt
 *
 * Turning the option off is always accepted. Turning it on is accepted only
 * when efa_env.enable_shm_transfer is set; otherwise the request is rejected
 * and ep->shm_permitted is left untouched.
 *
 * @param[in,out]	ep		endpoint whose shm_permitted flag is updated
 * @param[in]		shm_permitted	value requested by the application
 * @return	0 on success,
 * 		-FI_EINVAL if shm is requested but the FI_EFA_ENABLE_SHM_TRANSFER
 * 		environment variable is set to false
 * @related efa_rdm_ep
 */
static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool shm_permitted)
{
	if (shm_permitted) {
		/* The environment variable acts as a hard "off" switch: the
		 * endpoint option cannot turn shm back on against it. */
		if (!efa_env.enable_shm_transfer) {
			EFA_WARN(FI_LOG_EP_CTRL,
				 "FI_OPT_SHARED_MEMORY_PERMITTED endpoint option set "
				 "to true but FI_EFA_ENABLE_SHM_TRANSFER environment "
				 "variable is set to false.");
			return -FI_EINVAL;
		}

		ep->shm_permitted = true;
		return 0;
	}

	/* Disabling shm needs no validation. */
	EFA_WARN(FI_LOG_EP_CTRL,
		 "FI_OPT_SHARED_MEMORY_PERMITTED set to false");
	ep->shm_permitted = false;
	return FI_SUCCESS;
}

/**
* @brief set use_device_rdma flag in efa_rdm_ep.
*
Expand Down Expand Up @@ -1365,6 +1396,13 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname,
if (ret)
return ret;
break;
case FI_OPT_SHARED_MEMORY_PERMITTED:
if (optlen != sizeof(bool))
return -FI_EINVAL;
ret = efa_rdm_ep_set_shared_memory_permitted(efa_rdm_ep, *(bool *)optval);
if (ret)
return ret;
break;
case FI_OPT_EFA_USE_DEVICE_RDMA:
if (optlen != sizeof(bool))
return -FI_EINVAL;
Expand Down
21 changes: 21 additions & 0 deletions prov/efa/test/efa_unit_test_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -581,3 +581,24 @@ void test_efa_rdm_ep_getopt_oversized_optlen(struct efa_resource **state)
{
test_efa_rdm_ep_getopt(state, 16, FI_SUCCESS);
}

/**
 * @brief Check that an endpoint with FI_OPT_SHARED_MEMORY_PERMITTED set to
 * false has no shm endpoint after it is enabled.
 *
 * The option is set on a constructed-but-not-enabled RDM endpoint, the
 * endpoint is then enabled, and ep->shm_ep is expected to be NULL.
 *
 * @param[in]	state	cmocka state holding the struct efa_resource under test
 */
void test_efa_rdm_ep_setopt_shared_memory_permitted(struct efa_resource **state)
{
	struct efa_resource *res = *state;
	struct efa_rdm_ep *rdm_ep;
	bool shm_allowed = false;
	int rc;

	efa_unit_test_resource_construct_ep_not_enabled(res, FI_EP_RDM);

	rdm_ep = container_of(res->ep, struct efa_rdm_ep,
			      base_ep.util_ep.ep_fid);

	/* Forbid shared memory before the endpoint is enabled. */
	rc = fi_setopt(&res->ep->fid, FI_OPT_ENDPOINT,
		       FI_OPT_SHARED_MEMORY_PERMITTED, &shm_allowed,
		       sizeof(shm_allowed));
	assert_int_equal(rc, 0);

	rc = fi_enable(res->ep);
	assert_int_equal(rc, 0);

	/* With shm forbidden, no shm endpoint should be present. */
	assert_null(rdm_ep->shm_ep);
}
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ int main(void)
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_handshake_receive_without_peer_host_id_and_do_not_send_local_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_getopt_undersized_optlen, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_getopt_oversized_optlen, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_setopt_shared_memory_permitted, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_pkt_pool_flags, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_pkt_pool_page_alignment, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ void test_efa_rdm_ep_dc_atomic_error_handling();
void test_efa_rdm_ep_send_with_shm_no_copy();
void test_efa_rdm_ep_rma_without_caps();
void test_efa_rdm_ep_atomic_without_caps();
void test_efa_rdm_ep_setopt_shared_memory_permitted();
void test_dgram_cq_read_empty_cq();
void test_ibv_cq_ex_read_empty_cq();
void test_ibv_cq_ex_read_failed_poll();
Expand Down

0 comments on commit 62be5f1

Please sign in to comment.