[PATCH v2] smb: server: allocate enough space for RW WRs and ib_drain_qp()

Stefan Metzmacher metze at samba.org
Thu Oct 16 12:18:39 UTC 2025


Sorry for the spam!

I'll send a v3 that avoids passing struct ib_qp_cap *cap
from smb_direct_init_params() to smb_direct_create_qpair().

We can do everything in smb_direct_create_qpair(),
which simplifies my further changes towards using common
code in 6.19.
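
Roughly, v3 would look something like this (just a sketch of the
intended interface change, not the final patch): struct ib_qp_cap
becomes a local variable of smb_direct_create_qpair() instead of
being passed in:

    static int smb_direct_init_params(struct smbdirect_socket *sc);
    static int smb_direct_create_qpair(struct smbdirect_socket *sc);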

metze

Am 16.10.25 um 13:20 schrieb Stefan Metzmacher:
> Make use of rdma_rw_mr_factor() to calculate the number of rw
> credits and the number of pages per RDMA RW operation.
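> 
> Condensed to its core, the new calculation looks like this (a sketch
> of the hunk in smb_direct_init_params() further down):
> 
>     maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
> 
>     /* number of MRs (= RW credits) the device needs for maxpages */
>     sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
>                                               sc->rdma.cm_id->port_num,
>                                               maxpages);
> 
>     /* pages that fit into a single credit */
>     sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages,
>                                                sc->rw_io.credits.max);
> 
>     /* one extra credit to handle unaligned payloads */
>     sc->rw_io.credits.max += 1;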
> 
> We get the same numbers as before for iWarp connections, tested
> with siw.ko and irdma.ko (in iWarp mode).
> 
> siw:
> 
> CIFS: max_qp_rd_atom=128, max_fast_reg_page_list_len = 256
> CIFS: max_sgl_rd=0, max_sge_rd=1
> CIFS: responder_resources=32 max_frmr_depth=256 mr_io.type=0
> CIFS: max_send_wr 384, device reporting max_cqe 3276800 max_qp_wr 32768
> ksmbd: max_fast_reg_page_list_len = 256, max_sgl_rd=0, max_sge_rd=1
> ksmbd: device reporting max_cqe 3276800 max_qp_wr 32768
> ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256
> ksmbd: New sc->rw_io.credits: max = 9, num_pages = 256, maxpages=2048
> ksmbd: Info: rdma_send_wr 27 + max_send_wr 256 = 283
> 
> irdma (in iWarp mode):
> 
> CIFS: max_qp_rd_atom=127, max_fast_reg_page_list_len = 262144
> CIFS: max_sgl_rd=0, max_sge_rd=13
> CIFS: responder_resources=32 max_frmr_depth=2048 mr_io.type=0
> CIFS: max_send_wr 384, device reporting max_cqe 1048574 max_qp_wr 4063
> ksmbd: max_fast_reg_page_list_len = 262144, max_sgl_rd=0, max_sge_rd=13
> ksmbd: device reporting max_cqe 1048574 max_qp_wr 4063
> ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256
> ksmbd: New sc->rw_io.credits: max = 9, num_pages = 256, maxpages=2048
> ksmbd: rdma_send_wr 27 + max_send_wr 256 = 283
> 
> For RoCE, on the other hand, we now get different (and correct)
> numbers, tested with rdma_rxe.ko and irdma.ko (in RoCEv2 mode);
> a breakdown of where these numbers come from follows the listings
> below.
> 
> rxe:
> 
> CIFS: max_qp_rd_atom=128, max_fast_reg_page_list_len = 512
> CIFS: max_sgl_rd=0, max_sge_rd=32
> CIFS: responder_resources=32 max_frmr_depth=512 mr_io.type=0
> CIFS: max_send_wr 384, device reporting max_cqe 32767 max_qp_wr 1048576
> ksmbd: max_fast_reg_page_list_len = 512, max_sgl_rd=0, max_sge_rd=32
> ksmbd: device reporting max_cqe 32767 max_qp_wr 1048576
> ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256
> ksmbd: New sc->rw_io.credits: max = 65, num_pages = 32, maxpages=2048
> ksmbd: rdma_send_wr 65 + max_send_wr 256 = 321
> 
> irdma (in RoCEv2 mode):
> 
> CIFS: max_qp_rd_atom=127, max_fast_reg_page_list_len = 262144,
> CIFS: max_sgl_rd=0, max_sge_rd=13
> CIFS: responder_resources=32 max_frmr_depth=2048 mr_io.type=0
> CIFS: max_send_wr 384, device reporting max_cqe 1048574 max_qp_wr 4063
> ksmbd: max_fast_reg_page_list_len = 262144, max_sgl_rd=0, max_sge_rd=13
> ksmbd: device reporting max_cqe 1048574 max_qp_wr 4063
> ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256,
> ksmbd: New sc->rw_io.credits: max = 159, num_pages = 13, maxpages=2048
> ksmbd: rdma_send_wr 159 + max_send_wr 256 = 415
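> 
> To illustrate where those numbers come from (assuming the default
> max_read_write_size of 8 MiB and 4 KiB pages, i.e. maxpages = 2048,
> which matches the "maxpages=2048" lines above):
> 
>     device          factor   credits.max     num_pages
>     siw   (iWarp)        8     8+1 =   9     2048/8   = 256
>     irdma (iWarp)        8     8+1 =   9     2048/8   = 256
>     rxe   (RoCE)        64    64+1 =  65     2048/64  =  32
>     irdma (RoCEv2)     158   158+1 = 159     2048/158 =  13 (rounded up)
> 
> On iWarp each RW credit needs 3 send WRs (MR registration, the RDMA
> operation itself and the local invalidation), on RoCE it needs just 1,
> which matches the rdma_send_wr values above (9 * 3 = 27 on iWarp, 65
> and 159 on RoCE).  max_send_wr 256 is the default send_credit_target
> of 255 plus one WR for ib_drain_qp().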
> 
> And rely on rdma_rw_init_qp() to set up the MR pool (via
> ib_mr_pool_init()) for the RW MRs.
> 
> It seems the code was implemented before the rdma_rw_* layer
> was fully established in the kernel.
> 
> While there, also add additional space for ib_drain_qp().
> 
> This should make sure ib_post_send() never fails
> because the send queue is full.
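> 
> ib_drain_qp() flushes the queues by posting one extra work request on
> each of the send and receive queues, which is where the "+ 1" in the
> new cap setup comes from (sketch of the hunk below):
> 
>     /* +1 for the drain WR posted by ib_drain_{sq,rq}() */
>     cap->max_send_wr = sp->send_credit_target + 1;
>     cap->max_recv_wr = sp->recv_credit_max + 1;
> 
> rdma_create_qp() then calls rdma_rw_init_qp(), which adds the WRs
> needed for the RDMA RW contexts on top of max_send_wr.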
> 
> Fixes: ddbdc861e37c ("ksmbd: smbd: introduce read/write credits for RDMA read/write")
> Fixes: 4c564f03e23b ("smb: server: make use of common smbdirect_socket")
> Fixes: 177368b99243 ("smb: server: make use of common smbdirect_socket_parameters")
> Fixes: 95475d8886bd ("smb: server: make use smbdirect_socket.rw_io.credits")
> Cc: Namjae Jeon <linkinjeon at kernel.org>
> Cc: Steve French <smfrench at gmail.com>
> Cc: Tom Talpey <tom at talpey.com>
> Cc: linux-cifs at vger.kernel.org
> Cc: samba-technical at lists.samba.org
> Signed-off-by: Stefan Metzmacher <metze at samba.org>
> ---
>   fs/smb/server/transport_rdma.c | 204 ++++++++++++++++++++-------------
>   1 file changed, 126 insertions(+), 78 deletions(-)
> 
> diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
> index 94851ff25a02..3c0719777891 100644
> --- a/fs/smb/server/transport_rdma.c
> +++ b/fs/smb/server/transport_rdma.c
> @@ -1870,20 +1870,12 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
>   	return ret;
>   }
>   
> -static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc)
> -{
> -	return min_t(unsigned int,
> -		     sc->ib.dev->attrs.max_fast_reg_page_list_len,
> -		     256);
> -}
> -
>   static int smb_direct_init_params(struct smbdirect_socket *sc,
>   				  struct ib_qp_cap *cap)
>   {
>   	struct smbdirect_socket_parameters *sp = &sc->parameters;
> -	struct ib_device *device = sc->ib.dev;
> -	int max_send_sges, max_rw_wrs, max_send_wrs;
> -	unsigned int max_sge_per_wr, wrs_per_credit;
> +	int max_send_sges;
> +	unsigned int maxpages;
>   
>   	/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
>   	 * SMB2 response could be mapped.
> @@ -1894,63 +1886,29 @@ static int smb_direct_init_params(struct smbdirect_socket *sc,
>   		return -EINVAL;
>   	}
>   
> -	/* Calculate the number of work requests for RDMA R/W.
> -	 * The maximum number of pages which can be registered
> -	 * with one Memory region can be transferred with one
> -	 * R/W credit. And at least 4 work requests for each credit
> -	 * are needed for MR registration, RDMA R/W, local & remote
> -	 * MR invalidation.
> -	 */
> -	sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc);
> -	sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size,
> -					 (sc->rw_io.credits.num_pages - 1) *
> -					 PAGE_SIZE);
> -
> -	max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
> -			       device->attrs.max_sge_rd);
> -	max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
> -			       max_send_sges);
> -	wrs_per_credit = max_t(unsigned int, 4,
> -			       DIV_ROUND_UP(sc->rw_io.credits.num_pages,
> -					    max_sge_per_wr) + 1);
> -	max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit;
> -
> -	max_send_wrs = sp->send_credit_target + max_rw_wrs;
> -	if (max_send_wrs > device->attrs.max_cqe ||
> -	    max_send_wrs > device->attrs.max_qp_wr) {
> -		pr_err("consider lowering send_credit_target = %d\n",
> -		       sp->send_credit_target);
> -		pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
> -		       device->attrs.max_cqe, device->attrs.max_qp_wr);
> -		return -EINVAL;
> -	}
> -
> -	if (sp->recv_credit_max > device->attrs.max_cqe ||
> -	    sp->recv_credit_max > device->attrs.max_qp_wr) {
> -		pr_err("consider lowering receive_credit_max = %d\n",
> -		       sp->recv_credit_max);
> -		pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
> -		       device->attrs.max_cqe, device->attrs.max_qp_wr);
> -		return -EINVAL;
> -	}
> -
> -	if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) {
> -		pr_err("warning: device max_send_sge = %d too small\n",
> -		       device->attrs.max_send_sge);
> -		return -EINVAL;
> -	}
> -	if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
> -		pr_err("warning: device max_recv_sge = %d too small\n",
> -		       device->attrs.max_recv_sge);
> -		return -EINVAL;
> -	}
> +	maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
> +	sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
> +						  sc->rdma.cm_id->port_num,
> +						  maxpages);
> +	sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
> +	/* add one extra in order to handle unaligned pages */
> +	sc->rw_io.credits.max += 1;
>   
>   	sc->recv_io.credits.target = 1;
>   
>   	atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
>   
> -	cap->max_send_wr = max_send_wrs;
> -	cap->max_recv_wr = sp->recv_credit_max;
> +	/*
> +	 * Note that {rdma,ib}_create_qp() will call
> +	 * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0.
> +	 * It will adjust cap->max_send_wr to the required
> +	 * number of additional WRs for the RDMA RW operations.
> +	 * It will cap cap->max_send_wr to the device limit.
> +	 *
> +	 * +1 for ib_drain_qp
> +	 */
> +	cap->max_send_wr = sp->send_credit_target + 1;
> +	cap->max_recv_wr = sp->recv_credit_max + 1;
>   	cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
>   	cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
>   	cap->max_inline_data = 0;
> @@ -2028,13 +1986,108 @@ static int smb_direct_create_pools(struct smbdirect_socket *sc)
>   	return -ENOMEM;
>   }
>   
> +static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, struct ib_qp_init_attr *attr)
> +{
> +	/*
> +	 * This could be split out of rdma_rw_init_qp()
> +	 * and be a helper function next to rdma_rw_mr_factor()
> +	 *
> +	 * We can't check unlikely(rdma_rw_force_mr) here,
> +	 * but that is most likely 0 anyway.
> +	 */
> +	u32 factor;
> +
> +	WARN_ON_ONCE(attr->port_num == 0);
> +
> +	/*
> +	 * Each context needs at least one RDMA READ or WRITE WR.
> +	 *
> +	 * For some hardware we might need more, eventually we should ask the
> +	 * HCA driver for a multiplier here.
> +	 */
> +	factor = 1;
> +
> +	/*
> +	 * If the device needs MRs to perform RDMA READ or WRITE operations,
> +	 * we'll need two additional MRs for the registrations and the
> +	 * invalidation.
> +	 */
> +	if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
> +		factor += 2;	/* inv + reg */
> +
> +	return factor * attr->cap.max_rdma_ctxs;
> +}
> +
>   static int smb_direct_create_qpair(struct smbdirect_socket *sc,
>   				   struct ib_qp_cap *cap)
>   {
>   	struct smbdirect_socket_parameters *sp = &sc->parameters;
>   	int ret;
>   	struct ib_qp_init_attr qp_attr;
> -	int pages_per_rw;
> +	u32 max_send_wr;
> +	u32 rdma_send_wr;
> +
> +	/*
> +	 * Find out the number of max_send_wr
> +	 * after rdma_rw_init_qp() adjusted it.
> +	 *
> +	 * We only do it on a temporary variable,
> +	 * as rdma_create_qp() will trigger
> +	 * rdma_rw_init_qp() again.
> +	 */
> +	memset(&qp_attr, 0, sizeof(qp_attr));
> +	qp_attr.cap = *cap;
> +	qp_attr.port_num = sc->rdma.cm_id->port_num;
> +	rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
> +	max_send_wr = qp_attr.cap.max_send_wr + rdma_send_wr;
> +
> +	if (cap->max_send_wr > sc->ib.dev->attrs.max_cqe ||
> +	    cap->max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
> +		pr_err("Possible CQE overrun: max_send_wr %d, "
> +		       "device reporting max_cqe %d max_qp_wr %d\n",
> +		       cap->max_send_wr,
> +		       sc->ib.dev->attrs.max_cqe,
> +		       sc->ib.dev->attrs.max_qp_wr);
> +		pr_err("consider lowering send_credit_target = %d\n",
> +		       sp->send_credit_target);
> +		return -EINVAL;
> +	}
> +
> +	if (cap->max_rdma_ctxs &&
> +	    (max_send_wr >= sc->ib.dev->attrs.max_cqe ||
> +	     max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
> +		pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d, "
> +		       "device reporting max_cqe %d max_qp_wr %d\n",
> +		       rdma_send_wr, cap->max_send_wr, max_send_wr,
> +		       sc->ib.dev->attrs.max_cqe,
> +		       sc->ib.dev->attrs.max_qp_wr);
> +		pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n",
> +		       sp->send_credit_target, cap->max_rdma_ctxs);
> +		return -EINVAL;
> +	}
> +
> +	if (cap->max_recv_wr > sc->ib.dev->attrs.max_cqe ||
> +	    cap->max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
> +		pr_err("Possible CQE overrun: max_recv_wr %d, "
> +		       "device reporting max_cqe %d max_qp_wr %d\n",
> +		       cap->max_recv_wr,
> +		       sc->ib.dev->attrs.max_cqe,
> +		       sc->ib.dev->attrs.max_qp_wr);
> +		pr_err("consider lowering receive_credit_max = %d\n",
> +		       sp->recv_credit_max);
> +		return -EINVAL;
> +	}
> +
> +	if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) {
> +		pr_err("warning: device max_send_sge = %d too small\n",
> +		       sc->ib.dev->attrs.max_send_sge);
> +		return -EINVAL;
> +	}
> +	if (sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
> +		pr_err("warning: device max_recv_sge = %d too small\n",
> +		       sc->ib.dev->attrs.max_recv_sge);
> +		return -EINVAL;
> +	}
>   
>   	sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
>   	if (IS_ERR(sc->ib.pd)) {
> @@ -2045,8 +2098,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
>   	}
>   
>   	sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
> -					 sp->send_credit_target +
> -					 cap->max_rdma_ctxs,
> +					 max_send_wr,
>   					 IB_POLL_WORKQUEUE);
>   	if (IS_ERR(sc->ib.send_cq)) {
>   		pr_err("Can't create RDMA send CQ\n");
> @@ -2056,7 +2108,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
>   	}
>   
>   	sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
> -					 sp->recv_credit_max,
> +					 cap->max_recv_wr,
>   					 IB_POLL_WORKQUEUE);
>   	if (IS_ERR(sc->ib.recv_cq)) {
>   		pr_err("Can't create RDMA recv CQ\n");
> @@ -2065,6 +2117,14 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
>   		goto err;
>   	}
>   
> +	/*
> +	 * We reset completely here!
> +	 * As the above use was just temporary
> +	 * to calc max_send_wr and rdma_send_wr.
> +	 *
> +	 * rdma_create_qp() will trigger rdma_rw_init_qp()
> +	 * again if max_rdma_ctxs is not 0.
> +	 */
>   	memset(&qp_attr, 0, sizeof(qp_attr));
>   	qp_attr.event_handler = smb_direct_qpair_handler;
>   	qp_attr.qp_context = sc;
> @@ -2084,18 +2144,6 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
>   	sc->ib.qp = sc->rdma.cm_id->qp;
>   	sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
>   
> -	pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1;
> -	if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) {
> -		ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs,
> -				      sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG,
> -				      sc->rw_io.credits.num_pages, 0);
> -		if (ret) {
> -			pr_err("failed to init mr pool count %zu pages %zu\n",
> -			       sc->rw_io.credits.max, sc->rw_io.credits.num_pages);
> -			goto err;
> -		}
> -	}
> -
>   	return 0;
>   err:
>   	if (sc->ib.qp) {
