Skip to content

Commit

Permalink
rdma: use separate freelists for ctrl and eager rx buffers
Browse files Browse the repository at this point in the history
For default eager max size, ctrl buffers are much smaller than eager
buffers.

Signed-off-by: Eric Raut <eraut@amazon.com>
  • Loading branch information
rauteric committed Jan 11, 2025
1 parent adc46d7 commit 9e833ce
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 20 deletions.
12 changes: 8 additions & 4 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -734,12 +734,16 @@ struct nccl_net_ofi_rdma_ep {
/* Pending requests queue */
nccl_ofi_deque_t *pending_reqs_queue;

/* Free list of rx buffers */
nccl_ofi_freelist_t *rx_buff_fl;
/* Free list of ctrl rx buffers */
nccl_ofi_freelist_t *ctrl_rx_buff_fl;
/* Free list of eager rx buffers */
nccl_ofi_freelist_t *eager_rx_buff_fl;
/* Free list of rx buffer requests */
nccl_ofi_freelist_t *rx_buff_reqs_fl;
/* Size of rx buffers */
size_t rx_buff_size;
/* Size of ctrl rx buffers */
size_t ctrl_rx_buff_size;
/* Size of eager rx buffers */
size_t eager_rx_buff_size;

/* true if the current endpoint is a endpoint_per_communicator
receive communicator */
Expand Down
51 changes: 35 additions & 16 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -2306,7 +2306,7 @@ static inline int free_eager_rx_buff_req(nccl_net_ofi_rdma_req_t *req,
nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep;
/* Free buffer */
if (rx_buff_data->rx_buff_fl_elem) {
nccl_ofi_freelist_entry_free(ep->rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
nccl_ofi_freelist_entry_free(ep->eager_rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
}
return free_base_req(NULL, ep->rx_buff_reqs_fl, req, false);
}
Expand All @@ -2325,7 +2325,7 @@ static inline nccl_net_ofi_rdma_req_t *eager_rx_buff_req_alloc(nccl_net_ofi_rdma
rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(req);

nccl_ofi_freelist_elem_t *rx_buff_fl_elem =
nccl_ofi_freelist_entry_alloc(ep->rx_buff_fl);
nccl_ofi_freelist_entry_alloc(ep->eager_rx_buff_fl);
if (!rx_buff_fl_elem) {
NCCL_OFI_WARN("Failed to allocate rx_buff_fl_elem");
req->free(req, false);
Expand All @@ -2334,7 +2334,7 @@ static inline nccl_net_ofi_rdma_req_t *eager_rx_buff_req_alloc(nccl_net_ofi_rdma
assert(NCCL_OFI_IS_PTR_ALIGNED(rx_buff_fl_elem->ptr, EAGER_RX_BUFFER_ALIGNMENT));

rx_buff_data->rx_buff_fl_elem = rx_buff_fl_elem;
rx_buff_data->buff_len = ep->rx_buff_size;
rx_buff_data->buff_len = ep->eager_rx_buff_size;
rx_buff_data->rail = rail;
rx_buff_data->ep = ep;
return req;
Expand All @@ -2348,7 +2348,7 @@ static inline int ctrl_rx_buff_req_free(nccl_net_ofi_rdma_req_t *req,
nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep;
/* Free buffer */
if (rx_buff_data->rx_buff_fl_elem) {
nccl_ofi_freelist_entry_free(ep->rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
nccl_ofi_freelist_entry_free(ep->ctrl_rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
}
return free_base_req(NULL, ep->rx_buff_reqs_fl, req, false);
}
Expand All @@ -2367,16 +2367,15 @@ static inline nccl_net_ofi_rdma_req_t *ctrl_rx_buff_req_alloc(nccl_net_ofi_rdma_
rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(req);

nccl_ofi_freelist_elem_t *rx_buff_fl_elem =
nccl_ofi_freelist_entry_alloc(ep->rx_buff_fl);
nccl_ofi_freelist_entry_alloc(ep->ctrl_rx_buff_fl);
if (!rx_buff_fl_elem) {
NCCL_OFI_WARN("Failed to allocate rx_buff_fl_elem");
req->free(req, false);
return NULL;
}
assert(NCCL_OFI_IS_PTR_ALIGNED(rx_buff_fl_elem->ptr, EAGER_RX_BUFFER_ALIGNMENT));

rx_buff_data->rx_buff_fl_elem = rx_buff_fl_elem;
rx_buff_data->buff_len = ep->rx_buff_size;
rx_buff_data->buff_len = ep->ctrl_rx_buff_size;
rx_buff_data->rail = rail;
rx_buff_data->ep = ep;
return req;
Expand Down Expand Up @@ -5515,8 +5514,9 @@ static int post_rx_buffer(nccl_net_ofi_rdma_req_t *req,
* accessible but undefined to cover cases where the buffer
* gets re-posted */
nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep;
nccl_ofi_freelist_entry_set_undefined(ep->rx_buff_fl,
rx_buff_fl_elem->ptr);
nccl_ofi_freelist_t *fl = (req->type == NCCL_OFI_RDMA_EAGER_RX_BUFF ?
ep->eager_rx_buff_fl : ep->ctrl_rx_buff_fl);
nccl_ofi_freelist_entry_set_undefined(fl, rx_buff_fl_elem->ptr);

iov.iov_base = rx_buff_fl_elem->ptr;
iov.iov_len = rx_buff_data->buff_len;
Expand Down Expand Up @@ -6194,17 +6194,28 @@ static inline int init_rx_buffers(nccl_net_ofi_rdma_ep_t *ep)
return ret;
}

ret = nccl_ofi_freelist_init_mr(ep->rx_buff_size,
ret = nccl_ofi_freelist_init_mr(ep->ctrl_rx_buff_size,
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
ep, EAGER_RX_BUFFER_ALIGNMENT, &ep->rx_buff_fl);
ep, 1, &ep->ctrl_rx_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init rx_buff_fl");
NCCL_OFI_WARN("Failed to init ctrl_rx_buff_fl");
if (nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl))
NCCL_OFI_WARN("Also failed to freelist_fini rx_buff_reqs_fl");
return ret;
}

ret = nccl_ofi_freelist_init_mr(ep->eager_rx_buff_size,
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
ep, EAGER_RX_BUFFER_ALIGNMENT, &ep->eager_rx_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init eager_rx_buff_fl");
nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl);
nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl);
return ret;
}

/*
* The *_rx_buff_posted limits are used in the progress engine to
* determine if the receive queue is hydrated with sufficient buffers.
Expand Down Expand Up @@ -6254,9 +6265,15 @@ static inline int fini_rx_buffers(nccl_net_ofi_rdma_ep_t *ep)
int ret = 0;
nccl_net_ofi_ep_rail_t *rail;

ret = nccl_ofi_freelist_fini(ep->rx_buff_fl);
ret = nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini ctrl_rx_buff_fl");
return ret;
}

ret = nccl_ofi_freelist_fini(ep->eager_rx_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini rx_buff_fl");
NCCL_OFI_WARN("Failed to fini eager_rx_buff_fl");
return ret;
}

Expand Down Expand Up @@ -7218,8 +7235,10 @@ static int nccl_net_ofi_rdma_domain_create_endpoint(nccl_net_ofi_domain_t *base_
goto error;
}

ep->rx_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), eager_max_size),
sizeof(nccl_ofi_rdma_connection_info_t));
ep->ctrl_rx_buff_size = NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t),
NCCL_OFI_MAX(sizeof(nccl_ofi_rdma_connection_info_t),
sizeof(nccl_net_ofi_rdma_close_msg_t)));
ep->eager_rx_buff_size = eager_max_size;

ep->is_endpoint_per_communicator_ep = false;

Expand Down

0 comments on commit 9e833ce

Please sign in to comment.