net/smc: make wr buffer count configurable

Think of SMC_WR_BUF_CNT_SEND := SMC_WR_BUF_CNT as used in the send context
and SMC_WR_BUF_CNT_RECV := 3 * SMC_WR_BUF_CNT as used in the recv context.
Those get replaced with lgr->max_send_wr and lgr->max_recv_wr respectively.

Please note that although qp_attr.cap.max_send_wr == qp_attr.cap.max_recv_wr
still holds with the default sysctl values, it cannot be assumed to be
generally true any more. I see no downside to that, but my confidence level
is rather modest.

Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
Link: https://patch.msgid.link/20251027224856.2970019-2-pasic@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
pull/1354/merge
Halil Pasic 2025-10-27 23:48:55 +01:00 committed by Paolo Abeni
parent ea7d0d60eb
commit aef3cdb47b
9 changed files with 91 additions and 22 deletions

View File

@ -71,3 +71,39 @@ smcr_max_conns_per_lgr - INTEGER
acceptable value ranges from 16 to 255. Only for SMC-R v2.1 and later.
Default: 255
smcr_max_send_wr - INTEGER
So-called work request buffers are SMCR link (and RDMA queue pair) level
resources necessary for performing RDMA operations. Since up to 255
connections can share a link group and thus also a link and the number
of the work request buffers is decided when the link is allocated,
depending on the workload it can be a bottleneck in the sense that threads
have to wait for work request buffers to become available. Before the
introduction of this control the maximal number of work request buffers
available on the send path used to be hard coded to 16. With this control
it becomes configurable. The acceptable range is between 2 and 2048.
Please be aware that all the buffers need to be allocated as a physically
contiguous array in which each element is a single buffer and has the size
of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails we give up much
like before having this control.
Default: 16
smcr_max_recv_wr - INTEGER
So-called work request buffers are SMCR link (and RDMA queue pair) level
resources necessary for performing RDMA operations. Since up to 255
connections can share a link group and thus also a link and the number
of the work request buffers is decided when the link is allocated,
depending on the workload it can be a bottleneck in the sense that threads
have to wait for work request buffers to become available. Before the
introduction of this control the maximal number of work request buffers
available on the receive path used to be hard coded to 16. With this control
it becomes configurable. The acceptable range is between 2 and 2048.
Please be aware that all the buffers need to be allocated as a physically
contiguous array in which each element is a single buffer and has the size
of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails we give up much
like before having this control.
Default: 48

View File

@ -24,5 +24,7 @@ struct netns_smc {
int sysctl_rmem;
int sysctl_max_links_per_lgr;
int sysctl_max_conns_per_lgr;
unsigned int sysctl_smcr_max_send_wr;
unsigned int sysctl_smcr_max_recv_wr;
};
#endif

View File

@ -34,6 +34,8 @@
* distributions may modify it to a value between
* 16-255 as needed.
*/
#define SMCR_MAX_SEND_WR_DEF 16 /* Default number of work requests per send queue */
#define SMCR_MAX_RECV_WR_DEF 48 /* Default number of work requests per recv queue */
struct smc_lgr_list { /* list of link group definition */
struct list_head list;
@ -366,6 +368,10 @@ struct smc_link_group {
/* max conn can be assigned to lgr */
u8 max_links;
/* max links can be added in lgr */
u16 max_send_wr;
/* number of WR buffers on send */
u16 max_recv_wr;
/* number of WR buffers on recv */
};
struct { /* SMC-D */
struct smcd_gid peer_gid;

View File

@ -669,11 +669,6 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
.recv_cq = lnk->smcibdev->roce_cq_recv,
.srq = NULL,
.cap = {
/* include unsolicited rdma_writes as well,
* there are max. 2 RDMA_WRITE per 1 WR_SEND
*/
.max_send_wr = SMC_WR_BUF_CNT * 3,
.max_recv_wr = SMC_WR_BUF_CNT * 3,
.max_send_sge = SMC_IB_MAX_SEND_SGE,
.max_recv_sge = lnk->wr_rx_sge_cnt,
.max_inline_data = 0,
@ -683,6 +678,11 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
};
int rc;
/* include unsolicited rdma_writes as well,
* there are max. 2 RDMA_WRITE per 1 WR_SEND
*/
qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr;
qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr;
lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
if (IS_ERR(lnk->roce_qp))

View File

@ -2157,6 +2157,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
init_waitqueue_head(&lgr->llc_msg_waiter);
init_rwsem(&lgr->llc_conf_mutex);
lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time);
lgr->max_send_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_send_wr));
lgr->max_recv_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_recv_wr));
}
/* called after lgr was removed from lgr_list */

View File

@ -29,6 +29,8 @@ static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN;
static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX;
static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN;
static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX;
static unsigned int smcr_max_wr_min = 2;
static unsigned int smcr_max_wr_max = 2048;
static struct ctl_table smc_table[] = {
{
@ -99,6 +101,24 @@ static struct ctl_table smc_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "smcr_max_send_wr",
.data = &init_net.smc.sysctl_smcr_max_send_wr,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &smcr_max_wr_min,
.extra2 = &smcr_max_wr_max,
},
{
.procname = "smcr_max_recv_wr",
.data = &init_net.smc.sysctl_smcr_max_recv_wr,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &smcr_max_wr_min,
.extra2 = &smcr_max_wr_max,
},
};
int __net_init smc_sysctl_net_init(struct net *net)
@ -130,6 +150,8 @@ int __net_init smc_sysctl_net_init(struct net *net)
WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER;
net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER;
net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF;
net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF;
/* disable handshake limitation by default */
net->smc.limit_smc_hs = 0;

View File

@ -25,6 +25,8 @@ static inline int smc_sysctl_net_init(struct net *net)
net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER;
net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER;
net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF;
net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF;
return 0;
}

View File

@ -547,9 +547,9 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
IB_QP_DEST_QPN,
&init_attr);
lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
lnk->wr_tx_cnt = min_t(size_t, lnk->lgr->max_send_wr,
lnk->qp_attr.cap.max_send_wr);
lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
lnk->wr_rx_cnt = min_t(size_t, lnk->lgr->max_recv_wr,
lnk->qp_attr.cap.max_recv_wr);
}
@ -741,50 +741,51 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
int smc_wr_alloc_link_mem(struct smc_link *link)
{
/* allocate link related memory */
link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
link->wr_tx_bufs = kcalloc(link->lgr->max_send_wr,
SMC_WR_BUF_SIZE, GFP_KERNEL);
if (!link->wr_tx_bufs)
goto no_mem;
link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen,
link->wr_rx_bufs = kcalloc(link->lgr->max_recv_wr, link->wr_rx_buflen,
GFP_KERNEL);
if (!link->wr_rx_bufs)
goto no_mem_wr_tx_bufs;
link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
GFP_KERNEL);
link->wr_tx_ibs = kcalloc(link->lgr->max_send_wr,
sizeof(link->wr_tx_ibs[0]), GFP_KERNEL);
if (!link->wr_tx_ibs)
goto no_mem_wr_rx_bufs;
link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
link->wr_rx_ibs = kcalloc(link->lgr->max_recv_wr,
sizeof(link->wr_rx_ibs[0]),
GFP_KERNEL);
if (!link->wr_rx_ibs)
goto no_mem_wr_tx_ibs;
link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
link->wr_tx_rdmas = kcalloc(link->lgr->max_send_wr,
sizeof(link->wr_tx_rdmas[0]),
GFP_KERNEL);
if (!link->wr_tx_rdmas)
goto no_mem_wr_rx_ibs;
link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
link->wr_tx_rdma_sges = kcalloc(link->lgr->max_send_wr,
sizeof(link->wr_tx_rdma_sges[0]),
GFP_KERNEL);
if (!link->wr_tx_rdma_sges)
goto no_mem_wr_tx_rdmas;
link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
link->wr_tx_sges = kcalloc(link->lgr->max_send_wr, sizeof(link->wr_tx_sges[0]),
GFP_KERNEL);
if (!link->wr_tx_sges)
goto no_mem_wr_tx_rdma_sges;
link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
link->wr_rx_sges = kcalloc(link->lgr->max_recv_wr,
sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt,
GFP_KERNEL);
if (!link->wr_rx_sges)
goto no_mem_wr_tx_sges;
link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
link->wr_tx_mask = bitmap_zalloc(link->lgr->max_send_wr, GFP_KERNEL);
if (!link->wr_tx_mask)
goto no_mem_wr_rx_sges;
link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
link->wr_tx_pends = kcalloc(link->lgr->max_send_wr,
sizeof(link->wr_tx_pends[0]),
GFP_KERNEL);
if (!link->wr_tx_pends)
goto no_mem_wr_tx_mask;
link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
link->wr_tx_compl = kcalloc(link->lgr->max_send_wr,
sizeof(link->wr_tx_compl[0]),
GFP_KERNEL);
if (!link->wr_tx_compl)
@ -905,7 +906,7 @@ int smc_wr_create_link(struct smc_link *lnk)
goto dma_unmap;
}
smc_wr_init_sge(lnk);
bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
bitmap_zero(lnk->wr_tx_mask, lnk->lgr->max_send_wr);
init_waitqueue_head(&lnk->wr_tx_wait);
rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL);
if (rc)

View File

@ -19,8 +19,6 @@
#include "smc.h"
#include "smc_core.h"
#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */