diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h index b3cf25d1d..ccd5f6612 100644 --- a/include/nccl_ofi_param.h +++ b/include/nccl_ofi_param.h @@ -239,14 +239,27 @@ OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0); OFI_NCCL_PARAM_UINT(round_robin_threshold, "ROUND_ROBIN_THRESHOLD", (256 * 1024)); /* - * Minimum bounce buffers posted per rail. The plugin will attempt to post + * Minimum ctrl recv buffers posted per rail. The plugin will attempt to post + * more buffers if we dip below this threshold, allocating new buffers if needed. + */ +OFI_NCCL_PARAM_INT(rdma_min_posted_ctrl_recv_buffers, "RDMA_MIN_POSTED_CTRL_RECV_BUFFERS", 64); + +/* + * Maximum ctrl recv buffers posted per rail. The plugin will not attempt to + * post more buffers if we reach this threshold, returning available buffers to + * the free list if needed + */ +OFI_NCCL_PARAM_INT(rdma_max_posted_ctrl_recv_buffers, "RDMA_MAX_POSTED_CTRL_RECV_BUFFERS", 128); + +/* + * Minimum (eager) bounce buffers posted per rail. The plugin will attempt to post * more bounce buffers if we dip below this threshold, allocating new bounce * buffers if needed. */ OFI_NCCL_PARAM_INT(rdma_min_posted_bounce_buffers, "RDMA_MIN_POSTED_BOUNCE_BUFFERS", 16); /* - * Maximum bounce buffers posted per rail. The plugin will not attempt to + * Maximum (eager) bounce buffers posted per rail. The plugin will not attempt to * post more bounce buffers if we reach this threshold, returning available * buffers to the free list if needed */ diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index b89f69279..896cab6ea 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -5720,12 +5720,13 @@ static inline nccl_net_ofi_rdma_send_comm_t *calloc_rdma_send_comm(int num_rails * non-zero, on error */ static inline int init_bounce_buffers_rail(nccl_net_ofi_ep_rail_t *ep_rail, nccl_net_ofi_rdma_ep_t *ep, - size_t buff_size, size_t entry_alignment) + size_t buff_size, size_t entry_alignment, + size_t min_posted_count, size_t max_posted_count) { int ret = 0; ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t), - ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, + max_posted_count, 16, 0, &ep_rail->bounce_buff_reqs_fl); if (ret != 0) { NCCL_OFI_WARN("Failed to init bounce_buff_reqs_fl"); @@ -5734,7 +5735,7 @@ static inline int init_bounce_buffers_rail(nccl_net_ofi_ep_rail_t *ep_rail, nccl ep_rail->buff_size = buff_size; ret = nccl_ofi_freelist_init_mr(buff_size, - ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, + max_posted_count, 16, 0, freelist_regmr_host_fn, freelist_deregmr_host_fn, ep, 0, entry_alignment, &ep_rail->bounce_buff_fl); if (ret != 0) { @@ -5742,10 +5743,12 @@ static inline int init_bounce_buffers_rail(nccl_net_ofi_ep_rail_t *ep_rail, nccl goto error; } - ep_rail->min_bounce_posted = ofi_nccl_rdma_min_posted_bounce_buffers(); - ep_rail->max_bounce_posted = ofi_nccl_rdma_max_posted_bounce_buffers(); + ep_rail->min_bounce_posted = min_posted_count; + ep_rail->max_bounce_posted = max_posted_count; ep_rail->num_bounce_posted = 0; + assert(ep_rail->max_bounce_posted >= ep_rail->min_bounce_posted); + ret = nccl_net_ofi_mutex_init(&ep_rail->bounce_mutex, NULL); if (ret != 0) { goto error; @@ -5803,7 +5806,9 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) sizeof(nccl_ofi_rdma_connection_info_t)), sizeof(nccl_net_ofi_rdma_close_msg_t)); ret = init_bounce_buffers_rail(&ep->control_rail, ep, - buff_size, BOUNCE_BUFFER_ALIGNMENT); + buff_size, BOUNCE_BUFFER_ALIGNMENT, + ofi_nccl_rdma_min_posted_ctrl_recv_buffers(), + ofi_nccl_rdma_max_posted_ctrl_recv_buffers()); if (ret != 0) { return ret; } @@ -5817,7 +5822,9 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id); ret = init_bounce_buffers_rail(rail, ep, buff_size, - BOUNCE_BUFFER_ALIGNMENT); + BOUNCE_BUFFER_ALIGNMENT, + ofi_nccl_rdma_min_posted_bounce_buffers(), + ofi_nccl_rdma_max_posted_bounce_buffers()); if (ret != 0) { /* Cleanup previously-established rails */