Skip to content

Commit eeb3e7d

Browse files
committed
rdma: use separate freelists for ctrl and eager rx buffers
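With the default eager max size, ctrl buffers are much smaller than eager buffers, so a single shared freelist sized for the largest message over-allocates every ctrl buffer. Use a separate freelist for each buffer type, each with its own buffer size.

Signed-off-by: Eric Raut <eraut@amazon.com>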
1 parent aa56671 commit eeb3e7d

File tree

2 files changed

+44
-20
lines changed

2 files changed

+44
-20
lines changed

include/nccl_ofi_rdma.h

+8-4
Original file line numberDiff line numberDiff line change
@@ -734,12 +734,16 @@ struct nccl_net_ofi_rdma_ep {
734734
/* Pending requests queue */
735735
nccl_ofi_deque_t *pending_reqs_queue;
736736

737-
/* Free list of rx buffers */
738-
nccl_ofi_freelist_t *rx_buff_fl;
737+
/* Free list of ctrl rx buffers */
738+
nccl_ofi_freelist_t *ctrl_rx_buff_fl;
739+
/* Free list of eager rx buffers */
740+
nccl_ofi_freelist_t *eager_rx_buff_fl;
739741
/* Free list of rx buffer requests */
740742
nccl_ofi_freelist_t *rx_buff_reqs_fl;
741-
/* Size of rx buffers */
742-
size_t rx_buff_size;
743+
/* Size of ctrl rx buffers */
744+
size_t ctrl_rx_buff_size;
745+
/* Size of eager rx buffers */
746+
size_t eager_rx_buff_size;
743747

744748
/* true if the current endpoint is an endpoint_per_communicator
745749
receive communicator */

src/nccl_ofi_rdma.c

+36-16
Original file line numberDiff line numberDiff line change
@@ -2306,7 +2306,7 @@ static inline int free_eager_rx_buff_req(nccl_net_ofi_rdma_req_t *req,
23062306
nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep;
23072307
/* Free buffer */
23082308
if (rx_buff_data->rx_buff_fl_elem) {
2309-
nccl_ofi_freelist_entry_free(ep->rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
2309+
nccl_ofi_freelist_entry_free(ep->eager_rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
23102310
}
23112311
return free_base_req(NULL, ep->rx_buff_reqs_fl, req, false);
23122312
}
@@ -2325,7 +2325,7 @@ static inline nccl_net_ofi_rdma_req_t *eager_rx_buff_req_alloc(nccl_net_ofi_rdma
23252325
rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(req);
23262326

23272327
nccl_ofi_freelist_elem_t *rx_buff_fl_elem =
2328-
nccl_ofi_freelist_entry_alloc(ep->rx_buff_fl);
2328+
nccl_ofi_freelist_entry_alloc(ep->eager_rx_buff_fl);
23292329
if (!rx_buff_fl_elem) {
23302330
NCCL_OFI_WARN("Failed to allocate rx_buff_fl_elem");
23312331
req->free(req, false);
@@ -2334,7 +2334,7 @@ static inline nccl_net_ofi_rdma_req_t *eager_rx_buff_req_alloc(nccl_net_ofi_rdma
23342334
assert(NCCL_OFI_IS_PTR_ALIGNED(rx_buff_fl_elem->ptr, EAGER_RX_BUFFER_ALIGNMENT));
23352335

23362336
rx_buff_data->rx_buff_fl_elem = rx_buff_fl_elem;
2337-
rx_buff_data->buff_len = ep->rx_buff_size;
2337+
rx_buff_data->buff_len = ep->eager_rx_buff_size;
23382338
rx_buff_data->rail = rail;
23392339
rx_buff_data->ep = ep;
23402340
return req;
@@ -2348,7 +2348,7 @@ static inline int ctrl_rx_buff_req_free(nccl_net_ofi_rdma_req_t *req,
23482348
nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep;
23492349
/* Free buffer */
23502350
if (rx_buff_data->rx_buff_fl_elem) {
2351-
nccl_ofi_freelist_entry_free(ep->rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
2351+
nccl_ofi_freelist_entry_free(ep->ctrl_rx_buff_fl, rx_buff_data->rx_buff_fl_elem);
23522352
}
23532353
return free_base_req(NULL, ep->rx_buff_reqs_fl, req, false);
23542354
}
@@ -2367,16 +2367,15 @@ static inline nccl_net_ofi_rdma_req_t *ctrl_rx_buff_req_alloc(nccl_net_ofi_rdma_
23672367
rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(req);
23682368

23692369
nccl_ofi_freelist_elem_t *rx_buff_fl_elem =
2370-
nccl_ofi_freelist_entry_alloc(ep->rx_buff_fl);
2370+
nccl_ofi_freelist_entry_alloc(ep->ctrl_rx_buff_fl);
23712371
if (!rx_buff_fl_elem) {
23722372
NCCL_OFI_WARN("Failed to allocate rx_buff_fl_elem");
23732373
req->free(req, false);
23742374
return NULL;
23752375
}
2376-
assert(NCCL_OFI_IS_PTR_ALIGNED(rx_buff_fl_elem->ptr, EAGER_RX_BUFFER_ALIGNMENT));
23772376

23782377
rx_buff_data->rx_buff_fl_elem = rx_buff_fl_elem;
2379-
rx_buff_data->buff_len = ep->rx_buff_size;
2378+
rx_buff_data->buff_len = ep->ctrl_rx_buff_size;
23802379
rx_buff_data->rail = rail;
23812380
rx_buff_data->ep = ep;
23822381
return req;
@@ -5517,8 +5516,9 @@ static int post_rx_buffer(nccl_net_ofi_rdma_req_t *req,
55175516
* accessible but undefined to cover cases where the buffer
55185517
* gets re-posted */
55195518
nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep;
5520-
nccl_ofi_freelist_entry_set_undefined(ep->rx_buff_fl,
5521-
rx_buff_fl_elem->ptr);
5519+
nccl_ofi_freelist_t *fl = (req->type == NCCL_OFI_RDMA_EAGER_RX_BUFF ?
5520+
ep->eager_rx_buff_fl : ep->ctrl_rx_buff_fl);
5521+
nccl_ofi_freelist_entry_set_undefined(fl, rx_buff_fl_elem->ptr);
55225522

55235523
iov.iov_base = rx_buff_fl_elem->ptr;
55245524
iov.iov_len = rx_buff_data->buff_len;
@@ -6195,17 +6195,28 @@ static inline int init_rx_buffers(nccl_net_ofi_rdma_ep_t *ep)
61956195
return ret;
61966196
}
61976197

6198-
ret = nccl_ofi_freelist_init_mr(ep->rx_buff_size,
6198+
ret = nccl_ofi_freelist_init_mr(ep->ctrl_rx_buff_size,
61996199
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
62006200
freelist_regmr_host_fn, freelist_deregmr_host_fn,
6201-
ep, EAGER_RX_BUFFER_ALIGNMENT, &ep->rx_buff_fl);
6201+
ep, 1, &ep->ctrl_rx_buff_fl);
62026202
if (ret != 0) {
6203-
NCCL_OFI_WARN("Failed to init rx_buff_fl");
6203+
NCCL_OFI_WARN("Failed to init ctrl_rx_buff_fl");
62046204
if (nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl))
62056205
NCCL_OFI_WARN("Also failed to freelist_fini rx_buff_reqs_fl");
62066206
return ret;
62076207
}
62086208

6209+
ret = nccl_ofi_freelist_init_mr(ep->eager_rx_buff_size,
6210+
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
6211+
freelist_regmr_host_fn, freelist_deregmr_host_fn,
6212+
ep, EAGER_RX_BUFFER_ALIGNMENT, &ep->eager_rx_buff_fl);
6213+
if (ret != 0) {
6214+
NCCL_OFI_WARN("Failed to init eager_rx_buff_size");
6215+
nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl);
6216+
nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl);
6217+
return ret;
6218+
}
6219+
62096220
/*
62106221
* The *_rx_buff_posted limits are used in the progress engine to
62116222
* determine if the receive queue is hydrated with sufficient buffers.
@@ -6255,9 +6266,15 @@ static inline int fini_rx_buffers(nccl_net_ofi_rdma_ep_t *ep)
62556266
int ret = 0;
62566267
nccl_net_ofi_ep_rail_t *rail;
62576268

6258-
ret = nccl_ofi_freelist_fini(ep->rx_buff_fl);
6269+
ret = nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl);
6270+
if (ret != 0) {
6271+
NCCL_OFI_WARN("Failed to fini ctrl_rx_buff_fl");
6272+
return ret;
6273+
}
6274+
6275+
ret = nccl_ofi_freelist_fini(ep->eager_rx_buff_fl);
62596276
if (ret != 0) {
6260-
NCCL_OFI_WARN("Failed to fini rx_buff_fl");
6277+
NCCL_OFI_WARN("Failed to fini eager_rx_buff_fl");
62616278
return ret;
62626279
}
62636280

@@ -7219,8 +7236,11 @@ static int nccl_net_ofi_rdma_domain_create_endpoint(nccl_net_ofi_domain_t *base_
72197236
goto error;
72207237
}
72217238

7222-
ep->rx_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), eager_max_size),
7223-
sizeof(nccl_ofi_rdma_connection_info_t));
7239+
ep->ctrl_rx_buff_size =
7240+
NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t),
7241+
NCCL_OFI_MAX(sizeof(nccl_ofi_rdma_connection_info_t),
7242+
sizeof(nccl_net_ofi_rdma_close_msg_t)));
7243+
ep->eager_rx_buff_size = eager_max_size;
72247244

72257245
ep->is_endpoint_per_communicator_ep = false;
72267246

0 commit comments

Comments
 (0)