Skip to content

Commit

Permalink
prov/opx: Add GPU support to expected TID
Browse files Browse the repository at this point in the history
Signed-off-by: Ben Lynam <Ben.Lynam@cornelisnetworks.com>
  • Loading branch information
belynam authored and j-xiong committed Mar 14, 2024
1 parent 7376f5c commit 863a3d7
Show file tree
Hide file tree
Showing 14 changed files with 664 additions and 644 deletions.
63 changes: 27 additions & 36 deletions prov/opx/include/fi_opx_tid.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2023 Cornelis Networks.
* Copyright (C) 2022-2024 Cornelis Networks.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -111,20 +111,11 @@
ctx->__hfi_tidexpcnt) */
#define OPX_MAX_TID_COUNT 2048

#define OPX_TID_VADDR(tid_reuse_cache) (tid_reuse_cache->tid_vaddr)
#define OPX_TID_LENGTH(tid_reuse_cache) (tid_reuse_cache->tid_length)
#define OPX_TID_NINFO(tid_reuse_cache) (tid_reuse_cache->ninfo)
#define OPX_TID_INFO(tid_reuse_cache, idx) (tid_reuse_cache->info[idx])
#define OPX_TID_NPAIRS(tid_reuse_cache) (tid_reuse_cache->npairs)
#define OPX_TID_PAIR(tid_reuse_cache, idx) (tid_reuse_cache->pairs[idx])
#define OPX_TID_IS_INVALID(tid_reuse_cache) (tid_reuse_cache->invalid)
#define OPX_TID_INVALID(tid_reuse_cache) (tid_reuse_cache->invalid = 1)
#define OPX_TID_VALID(tid_reuse_cache) (tid_reuse_cache->invalid = 0)
#define OPX_TID_NPAGES(tid_reuse_cache, npages) \
do { \
npages = 0; \
const uint32_t *tids = &OPX_TID_INFO(tid_reuse_cache, 0); \
const uint32_t ntids = OPX_TID_NINFO(tid_reuse_cache); \
const uint32_t *tids = &tid_reuse_cache->info[0]; \
const uint32_t ntids = tid_reuse_cache->ninfo; \
for (int i = 0; i < ntids; ++i) { \
npages += (int)FI_OPX_EXP_TID_GET(tids[i], LEN); \
FI_DBG(fi_opx_global.prov, FI_LOG_MR, \
Expand Down Expand Up @@ -211,10 +202,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
__func__, __LINE__, \
string, \
tid_vaddr, tid_vaddr + tid_length, tid_length, \
OPX_TID_VADDR(tid_reuse_cache), \
OPX_TID_VADDR(tid_reuse_cache) + \
OPX_TID_LENGTH(tid_reuse_cache), \
OPX_TID_LENGTH(tid_reuse_cache), count); \
tid_reuse_cache->tid_vaddr, \
tid_reuse_cache->tid_vaddr + \
tid_reuse_cache->tid_length, \
tid_reuse_cache->tid_length, count); \
last_vaddr = tid_vaddr; \
last_length = tid_length; \
count = 0; \
Expand All @@ -226,10 +217,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
"tid vaddr [%#lx - %#lx] length %lu\n", \
string, tid_vaddr, \
tid_vaddr + tid_length, tid_length, \
OPX_TID_VADDR(tid_reuse_cache), \
OPX_TID_VADDR(tid_reuse_cache) + \
OPX_TID_LENGTH(tid_reuse_cache), \
OPX_TID_LENGTH(tid_reuse_cache)); \
tid_reuse_cache->tid_vaddr, \
tid_reuse_cache->tid_vaddr + \
tid_reuse_cache->tid_length, \
tid_reuse_cache->tid_length); \
} while (0)
#else
/* noisier regular debug logging */
Expand All @@ -240,10 +231,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
"tid vaddr [%#lx - %#lx] length %lu\n", \
string, tid_vaddr, \
tid_vaddr + tid_length, tid_length, \
OPX_TID_VADDR(tid_reuse_cache), \
OPX_TID_VADDR(tid_reuse_cache) + \
OPX_TID_LENGTH(tid_reuse_cache), \
OPX_TID_LENGTH(tid_reuse_cache));
tid_reuse_cache->tid_vaddr, \
tid_reuse_cache->tid_vaddr + \
tid_reuse_cache->tid_length, \
tid_reuse_cache->tid_length);
#endif

/* Special debug for expected receive data ONLY */
Expand All @@ -253,22 +244,22 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
static int count = 0; \
static uint64_t last_vaddr = 0UL; \
static int32_t last_length = 0; \
if ((last_vaddr != OPX_TID_VADDR(tid_reuse_cache)) || \
(last_length != OPX_TID_LENGTH(tid_reuse_cache))) { \
if ((last_vaddr != tid_reuse_cache->tid_vaddr) || \
(last_length != tid_reuse_cache->tid_length)) { \
fprintf(stderr, \
"## %s:%u OPX_TID_CACHE_VERBOSE_DEBUG %s TIDs " \
"input vaddr [%#lx - %#lx] length %lu, " \
"tid vaddr [%#lx - %#lx] length %u, " \
"last count %u\n", \
__func__, __LINE__, \
string, \
OPX_TID_VADDR(tid_reuse_cache), \
OPX_TID_VADDR(tid_reuse_cache) + \
OPX_TID_LENGTH(tid_reuse_cache), \
OPX_TID_LENGTH(tid_reuse_cache), last_vaddr, \
tid_reuse_cache->tid_vaddr, \
tid_reuse_cache->tid_vaddr + \
tid_reuse_cache->tid_length, \
tid_reuse_cache->tid_length, last_vaddr, \
last_vaddr + last_length, last_length, count); \
last_vaddr = OPX_TID_VADDR(tid_reuse_cache); \
last_length = OPX_TID_LENGTH(tid_reuse_cache); \
last_vaddr = tid_reuse_cache->tid_vaddr; \
last_length = tid_reuse_cache->tid_length; \
count = 0; \
} \
++count; \
Expand All @@ -279,10 +270,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...)
"OPX_TID_CACHE_VERBOSE_DEBUG %s TIDs " \
"tid vaddr [%#lx - %#lx] length %lu\n", \
string, \
OPX_TID_VADDR(tid_reuse_cache), \
OPX_TID_VADDR(tid_reuse_cache) + \
OPX_TID_LENGTH(tid_reuse_cache), \
OPX_TID_LENGTH(tid_reuse_cache))
tid_reuse_cache->tid_vaddr, \
tid_reuse_cache->tid_vaddr + \
tid_reuse_cache->tid_length, \
tid_reuse_cache->tid_length)
#endif

#endif /* _FI_PROV_OPX_TID_H_ */
6 changes: 4 additions & 2 deletions prov/opx/include/fi_opx_tid_cache.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2023 Cornelis Networks.
* Copyright (C) 2022-2024 Cornelis Networks.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -96,6 +96,8 @@ struct fi_opx_hfi1_rx_rzv_rts_params;
* returns non-zero on failure (fallback to Eager rendezvous)
*/
int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params,
const uint64_t tid_vaddr, const uint64_t tid_length);
const uint64_t tid_vaddr, const uint64_t tid_length,
const enum fi_hmem_iface tid_iface,
const uint64_t tid_device);

#endif /* _FI_PROV_OPX_TID_CACHE_H_ */
8 changes: 4 additions & 4 deletions prov/opx/include/opa_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
GPL LICENSE SUMMARY
Copyright(c) 2015 Intel Corporation.
Copyright(c) 2021-2023 Cornelis Networks.
Copyright(c) 2021-2024 Cornelis Networks.
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
Expand All @@ -23,7 +23,7 @@
BSD LICENSE
Copyright(c) 2015 Intel Corporation.
Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2021-2024 Cornelis Networks.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -111,8 +111,8 @@ enum OPX_HFI_CMD {
OPX_HFI_CMD_TID_INVAL_READ, /* read TID cache invalidations */
OPX_HFI_CMD_GET_VERS, /* get the version of the user cdev */

#ifdef PSM_CUDA
OPX_HFI_CMD_TID_UPDATE_V2 = 28,
#ifdef OPX_HMEM
OPX_HFI_CMD_TID_UPDATE_V3,
#endif
OPX_HFI_CMD_LAST,
};
Expand Down
59 changes: 34 additions & 25 deletions prov/opx/include/opa_user_gen1.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
GPL LICENSE SUMMARY
Copyright(c) 2015 Intel Corporation.
Copyright(c) 2021-2023 Cornelis Networks.
Copyright(c) 2021-2024 Cornelis Networks.
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
Expand All @@ -23,7 +23,7 @@
BSD LICENSE
Copyright(c) 2015 Intel Corporation.
Copyright(c) 2021-2022 Cornelis Networks.
Copyright(c) 2021-2024 Cornelis Networks.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -542,13 +542,17 @@ static __inline__ void opx_hfi_hdrset_seq(__le32 *rbuf, uint32_t val)
See full description at declaration */
static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl,
uint64_t vaddr, uint32_t *length,
uint64_t tidlist, uint32_t *tidcnt, uint16_t flags)
uint64_t tidlist, uint32_t *tidcnt,
uint64_t flags)
{
struct hfi1_cmd cmd;

#ifdef OPX_HMEM
struct hfi1_tid_info_v3 tidinfo;
#else
struct hfi1_tid_info tidinfo;
#ifdef PSM_CUDA
struct hfi1_tid_info_v2 tidinfov2;
#endif

int err;

tidinfo.vaddr = vaddr; /* base address for this send to map */
Expand All @@ -557,25 +561,20 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl,
tidinfo.tidlist = tidlist; /* driver copies tids back directly */
tidinfo.tidcnt = 0; /* clear to zero */

FI_DBG(&fi_opx_provider, FI_LOG_MR,"OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %u)\n", (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096);

#ifdef OPX_HMEM
cmd.type = OPX_HFI_CMD_TID_UPDATE_V3;
tidinfo.flags = flags;
tidinfo.context = 0ull;
#else
cmd.type = OPX_HFI_CMD_TID_UPDATE; /* HFI1_IOCTL_TID_UPDATE */
#endif
FI_DBG(&fi_opx_provider, FI_LOG_MR,
"OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %u)\n",
(void*)vaddr, (void*) (vaddr + *length), *length, (*length) / 4096);

cmd.len = sizeof(tidinfo);
cmd.addr = (__u64) &tidinfo;
#ifdef PSM_CUDA
if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
/* Copy values to v2 struct */
tidinfov2.vaddr = tidinfo.vaddr;
tidinfov2.length = tidinfo.length;
tidinfov2.tidlist = tidinfo.tidlist;
tidinfov2.tidcnt = tidinfo.tidcnt;
tidinfov2.flags = flags;

cmd.type = OPX_HFI_CMD_TID_UPDATE_V2;
cmd.len = sizeof(tidinfov2);
cmd.addr = (__u64) &tidinfov2;
}
#endif

errno = 0;
err = opx_hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
__attribute__((__unused__)) int saved_errno = errno;
Expand All @@ -584,15 +583,25 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl,
struct hfi1_tid_info *rettidinfo =
(struct hfi1_tid_info *)cmd.addr;
if ((rettidinfo->length != *length) || (rettidinfo->tidcnt == 0) ) {
FI_WARN(&fi_opx_provider, FI_LOG_MR,"PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %u), OUTPUTS vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", saved_errno, strerror(saved_errno), (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096, (void*)vaddr,(void*)(vaddr + rettidinfo->length), rettidinfo->length, rettidinfo->length/4096, rettidinfo->tidcnt);
FI_WARN(&fi_opx_provider, FI_LOG_MR,
"PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %u), OUTPUTS vaddr [%p - %p] length %u (pages %u), tidcnt %u\n",
saved_errno, strerror(saved_errno), (void*)vaddr,
(void*)(vaddr + *length), *length, (*length)/4096,
(void*)rettidinfo->vaddr,(void*)(rettidinfo->vaddr + rettidinfo->length),
rettidinfo->length, rettidinfo->length/4096,
rettidinfo->tidcnt);
}
/* Always update outputs, even on soft errors */
*length = rettidinfo->length;
*tidcnt = rettidinfo->tidcnt;
FI_DBG(&fi_opx_provider, FI_LOG_MR,"OPX_DEBUG_EXIT OUTPUTS errno %d \"%s\" vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", saved_errno, strerror(saved_errno), (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096, *tidcnt);

FI_DBG(&fi_opx_provider, FI_LOG_MR,
"OPX_DEBUG_EXIT OUTPUTS errno %d \"%s\" vaddr [%p - %p] length %u (pages %u), tidcnt %u\n",
saved_errno, strerror(saved_errno), (void*)vaddr,
(void*)(vaddr + *length), *length, (*length)/4096, *tidcnt);
} else {
FI_WARN(&fi_opx_provider, FI_LOG_MR,"FAILED ERR %d errno %d \"%s\"\n", err, saved_errno, strerror(saved_errno));
FI_WARN(&fi_opx_provider, FI_LOG_MR,
"FAILED ERR %d errno %d \"%s\"\n",
err, saved_errno, strerror(saved_errno));
/* Hard error, we can't trust these */
*length = 0;
*tidcnt = 0;
Expand Down
49 changes: 42 additions & 7 deletions prov/opx/include/rdma/opx/fi_opx_debug_counters.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2023 Cornelis Networks.
* Copyright (C) 2021-2024 Cornelis Networks.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -169,12 +169,22 @@ struct fi_opx_debug_counters {
} sdma;

struct {
uint64_t total_requests;
uint64_t tid_updates;
uint64_t tid_resource_limit;
uint64_t tid_resource_limit_length_chunk_short;
uint64_t tid_resource_limit_length_chunk_long;
uint64_t tid_resource_limit_tidcnt_chunk_zero;
uint64_t tid_invalidate_needed;
uint64_t tid_replays;
uint64_t rts_fallback_eager;
uint64_t tid_rcv_pkts;
uint64_t tid_rcv_pkts_replays;
uint64_t rts_tid_ineligible;
uint64_t rts_tid_eligible;
uint64_t rts_fallback_eager_immediate;
uint64_t rts_fallback_eager_misaligned_thrsh;
uint64_t rts_fallback_eager_reg_rzv;
uint64_t rts_tid_setup_retries;
uint64_t rts_tid_setup_retry_success;
uint64_t rts_tid_setup_success;
uint64_t tid_buckets[4];
uint64_t first_tidpair_minlen;
uint64_t first_tidpair_maxlen;
Expand Down Expand Up @@ -245,6 +255,9 @@ struct fi_opx_debug_counters {
uint64_t rma_atomic_fetch_intranode;
uint64_t rma_atomic_cmp_fetch_hfi;
uint64_t rma_atomic_cmp_fetch_intranode;

uint64_t tid_update;
uint64_t tid_recv;
} hmem;
};

Expand Down Expand Up @@ -331,12 +344,31 @@ void fi_opx_debug_counters_print(struct fi_opx_debug_counters *counters)
#endif

#ifdef OPX_DEBUG_COUNTERS_EXPECTED_RECEIVE
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.total_requests);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_updates);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit_length_chunk_short);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit_tidcnt_chunk_zero);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_invalidate_needed);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_replays);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_rcv_pkts);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_rcv_pkts_replays);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_ineligible);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_eligible);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_immediate);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_misaligned_thrsh);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_reg_rzv);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_retries);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_retry_success);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_success);
uint64_t rts_sum = counters->expected_receive.rts_fallback_eager_immediate +
counters->expected_receive.rts_fallback_eager_misaligned_thrsh +
counters->expected_receive.rts_fallback_eager_reg_rzv +
counters->expected_receive.rts_tid_setup_success;
if (rts_sum != counters->expected_receive.rts_tid_eligible) {
fprintf(stderr,
"(%d) ### WARN: rts_tid_eligible (%lu) != SUM(rts_tid_setup_success + rts_fallback*) (%lu)! Accounting error?\n\n",
pid,
counters->expected_receive.rts_tid_eligible, rts_sum);
}
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER_ARR(pid, expected_receive.tid_buckets, 4);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.first_tidpair_minlen);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.first_tidpair_maxlen);
Expand Down Expand Up @@ -433,6 +465,9 @@ void fi_opx_debug_counters_print(struct fi_opx_debug_counters *counters)
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_fetch_hfi);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_cmp_fetch_intranode);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_cmp_fetch_hfi);

FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.tid_update);
FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.tid_recv);
#endif
}

Expand Down
3 changes: 2 additions & 1 deletion prov/opx/include/rdma/opx/fi_opx_endpoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -2136,6 +2136,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep,
break;
case FI_OPX_HFI_DPUT_OPCODE_RZV_TID:
{
FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts);
struct fi_opx_rzv_completion * rzv_comp = (struct fi_opx_rzv_completion *)(hdr->dput.target.rzv.completion_vaddr);
union fi_opx_context *target_context = rzv_comp->context;
assert(target_context);
Expand Down Expand Up @@ -2182,7 +2183,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep,
} else {
memcpy(rbuf_qws, sbuf_qws, bytes);
}
FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_replays);
FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts_replays);
}
#ifndef NDEBUG
else { /* Debug, tracking where the TID wrote even though we don't memcpy here */
Expand Down

0 comments on commit 863a3d7

Please sign in to comment.