Discussion:
mlx5(4) jumbo receive
(too old to reply)
Konstantin Belousov
2018-04-24 08:55:53 UTC
Permalink
Hello,
the patch below is of interest for people who use Mellanox Connect-X/4
and 5 ethernet adapters and configure it for jumbo frames. The patch
should improve the system behavior on low or fragmented memory situations.
See the commit message for detailed description. Also, the patch should
not cause performance degradation for the normal 1500 MTU by keeping the
same configuration as currently unpatched driver.

I am interested in hearing about regressions (or not) caused by the
change.

commit f25c0c624177a1ca06ca051b0adb842acb66ec11
Author: Konstantin Belousov <***@mellanox.com>
Date: Wed Nov 22 20:14:53 2017 +0200

mlx5en: Handle jumbo frames without requiring big clusters.

The scatter list is formed by the chunks of MCLBYTES each, and larger
than default packets are returned to the stack as the mbuf chain.

This is an improvements over the original patch by ***@mellanox.com
which solves some busdma issues and sizes WQEs scatter list to only
have the minimal sufficient number of elements. In particular, for
the default MTU 1500 bytes the receive queue format is not changed
comparing to the unpatched driver, avoiding a reported performance
regression.

Issue: 1065432
Issue: 1184316
Change-Id: Ib63a5ef55abd6ef1ec9b296e2cef88e4604bbd29
Signed-off-by: Konstantin Belousov <***@mellanox.com>

diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
index b5000c32eaf..ff5ef446e34 100644
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -80,8 +80,19 @@
#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa
#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xe

-/* freeBSD HW LRO is limited by 16KB - the size of max mbuf */
+#define MLX5E_MAX_RX_SEGS 7
+
+#ifndef MLX5E_MAX_RX_BYTES
+#define MLX5E_MAX_RX_BYTES MCLBYTES
+#endif
+
+#if (MLX5E_MAX_RX_SEGS == 1)
+/* FreeBSD HW LRO is limited by 16KB - the size of max mbuf */
#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ MJUM16BYTES
+#else
+#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \
+ MIN(65535, MLX5E_MAX_RX_SEGS * MLX5E_MAX_RX_BYTES)
+#endif
#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10
#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3
#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS 0x20
@@ -530,6 +541,7 @@ struct mlx5e_rq {
struct mtx mtx;
bus_dma_tag_t dma_tag;
u32 wqe_sz;
+ u32 nsegs;
struct mlx5e_rq_mbuf *mbuf;
struct ifnet *ifp;
struct mlx5e_rq_stats stats;
@@ -795,9 +807,12 @@ struct mlx5e_tx_wqe {

struct mlx5e_rx_wqe {
struct mlx5_wqe_srq_next_seg next;
- struct mlx5_wqe_data_seg data;
+ struct mlx5_wqe_data_seg data[];
};

+/* the size of the structure above must be power of two */
+CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe)));
+
struct mlx5e_eeprom {
int lock_bit;
int i2c_addr;
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index c2c4b0d7744..25544e561e3 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -33,9 +33,12 @@
#ifndef ETH_DRIVER_VERSION
#define ETH_DRIVER_VERSION "3.4.1"
#endif
+
char mlx5e_version[] = "Mellanox Ethernet driver"
" (" ETH_DRIVER_VERSION ")";

+static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
+
struct mlx5e_channel_param {
struct mlx5e_rq_param rq;
struct mlx5e_sq_param sq;
@@ -810,6 +813,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
int wq_sz;
int err;
int i;
+ u32 nsegs, wqe_sz;
+
+ err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
+ if (err != 0)
+ goto done;

/* Create DMA descriptor TAG */
if ((err = -bus_dma_tag_create(
@@ -819,9 +827,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
- MJUM16BYTES, /* maxsize */
- 1, /* nsegments */
- MJUM16BYTES, /* maxsegsize */
+ nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */
+ nsegs, /* nsegments */
+ nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&rq->dma_tag)))
@@ -834,23 +842,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,

rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];

- if (priv->params.hw_lro_en) {
- rq->wqe_sz = priv->params.lro_wqe_sz;
- } else {
- rq->wqe_sz = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
- }
- if (rq->wqe_sz > MJUM16BYTES) {
- err = -ENOMEM;
+ err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
+ if (err != 0)
goto err_rq_wq_destroy;
- } else if (rq->wqe_sz > MJUM9BYTES) {
- rq->wqe_sz = MJUM16BYTES;
- } else if (rq->wqe_sz > MJUMPAGESIZE) {
- rq->wqe_sz = MJUM9BYTES;
- } else if (rq->wqe_sz > MCLBYTES) {
- rq->wqe_sz = MJUMPAGESIZE;
- } else {
- rq->wqe_sz = MCLBYTES;
- }

wq_sz = mlx5_wq_ll_get_size(&rq->wq);

@@ -861,7 +855,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
for (i = 0; i != wq_sz; i++) {
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
+#if (MLX5E_MAX_RX_SEGS == 1)
uint32_t byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
+#else
+ int j;
+#endif

err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
if (err != 0) {
@@ -869,8 +867,15 @@ mlx5e_create_rq(struct mlx5e_channel *c,
bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
goto err_rq_mbuf_free;
}
- wqe->data.lkey = c->mkey_be;
- wqe->data.byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
+
+ /* set value for constant fields */
+#if (MLX5E_MAX_RX_SEGS == 1)
+ wqe->data[0].lkey = c->mkey_be;
+ wqe->data[0].byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
+#else
+ for (j = 0; j < rq->nsegs; j++)
+ wqe->data[j].lkey = c->mkey_be;
+#endif
}

rq->ifp = c->ifp;
@@ -1809,16 +1814,51 @@ mlx5e_close_channel_wait(struct mlx5e_channel *volatile *pp)
free(c, M_MLX5EN);
}

+static int
+mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
+{
+ u32 r, n;
+
+ r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz :
+ MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
+ if (r > MJUM16BYTES)
+ return (-ENOMEM);
+
+ if (r > MJUM9BYTES)
+ r = MJUM16BYTES;
+ else if (r > MJUMPAGESIZE)
+ r = MJUM9BYTES;
+ else if (r > MCLBYTES)
+ r = MJUMPAGESIZE;
+ else
+ r = MCLBYTES;
+
+ /*
+ * n + 1 must be a power of two, because stride size must be.
+ * Stride size is 16 * (n + 1), as the first segment is
+ * control.
+ */
+ for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++)
+ ;
+
+ *wqe_sz = r;
+ *nsegs = n;
+ return (0);
+}
+
static void
mlx5e_build_rq_param(struct mlx5e_priv *priv,
struct mlx5e_rq_param *param)
{
void *rqc = param->rqc;
void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+ u32 wqe_sz, nsegs;

+ mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
- MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe)));
+ MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
+ nsegs * sizeof(struct mlx5_wqe_data_seg)));
MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
MLX5_SET(wq, wq, pd, priv->pdn);

diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
index fb14be43b32..5868b0a2f55 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -32,21 +32,47 @@ static inline int
mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
struct mlx5e_rx_wqe *wqe, u16 ix)
{
- bus_dma_segment_t segs[1];
+ bus_dma_segment_t segs[rq->nsegs];
struct mbuf *mb;
int nsegs;
int err;
-
+#if (MLX5E_MAX_RX_SEGS != 1)
+ struct mbuf *mb_head;
+ int i;
+#endif
if (rq->mbuf[ix].mbuf != NULL)
return (0);

+#if (MLX5E_MAX_RX_SEGS == 1)
mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
if (unlikely(!mb))
return (-ENOMEM);

- /* set initial mbuf length */
mb->m_pkthdr.len = mb->m_len = rq->wqe_sz;
+#else
+ mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
+ MLX5E_MAX_RX_BYTES);
+ if (unlikely(mb == NULL))
+ return (-ENOMEM);
+
+ mb->m_len = MLX5E_MAX_RX_BYTES;
+ mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES;

+ for (i = 1; i < rq->nsegs; i++) {
+ if (mb_head->m_pkthdr.len >= rq->wqe_sz)
+ break;
+ mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0,
+ MLX5E_MAX_RX_BYTES);
+ if (unlikely(mb == NULL)) {
+ m_freem(mb_head);
+ return (-ENOMEM);
+ }
+ mb->m_len = MLX5E_MAX_RX_BYTES;
+ mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES;
+ }
+ /* rewind to first mbuf in chain */
+ mb = mb_head;
+#endif
/* get IP header aligned */
m_adj(mb, MLX5E_NET_IP_ALIGN);

@@ -54,12 +80,26 @@ mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
mb, segs, &nsegs, BUS_DMA_NOWAIT);
if (err != 0)
goto err_free_mbuf;
- if (unlikely(nsegs != 1)) {
+ if (unlikely(nsegs == 0)) {
bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map);
err = -ENOMEM;
goto err_free_mbuf;
}
- wqe->data.addr = cpu_to_be64(segs[0].ds_addr);
+#if (MLX5E_MAX_RX_SEGS == 1)
+ wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
+#else
+ wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
+ wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len |
+ MLX5_HW_START_PADDING);
+ for (i = 1; i != nsegs; i++) {
+ wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr);
+ wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len);
+ }
+ for (; i < rq->nsegs; i++) {
+ wqe->data[i].addr = 0;
+ wqe->data[i].byte_count = 0;
+ }
+#endif

rq->mbuf[ix].mbuf = mb;
rq->mbuf[ix].data = mb->m_data;
@@ -214,6 +254,9 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
{
struct ifnet *ifp = rq->ifp;
struct mlx5e_channel *c;
+#if (MLX5E_MAX_RX_SEGS != 1)
+ struct mbuf *mb_head;
+#endif
int lro_num_seg; /* HW LRO session aggregated packets counter */
uint64_t tstmp;

@@ -224,7 +267,26 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
rq->stats.lro_bytes += cqe_bcnt;
}

+#if (MLX5E_MAX_RX_SEGS == 1)
mb->m_pkthdr.len = mb->m_len = cqe_bcnt;
+#else
+ mb->m_pkthdr.len = cqe_bcnt;
+ for (mb_head = mb; mb != NULL; mb = mb->m_next) {
+ if (mb->m_len > cqe_bcnt)
+ mb->m_len = cqe_bcnt;
+ cqe_bcnt -= mb->m_len;
+ if (likely(cqe_bcnt == 0)) {
+ if (likely(mb->m_next != NULL)) {
+ /* trim off empty mbufs */
+ m_freem(mb->m_next);
+ mb->m_next = NULL;
+ }
+ break;
+ }
+ }
+ /* rewind to first mbuf in chain */
+ mb = mb_head;
+#endif
/* check if a Toeplitz hash was computed */
if (cqe->rss_hash_type != 0) {
mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result);
@@ -447,7 +509,10 @@ mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq)
int i = 0;

#ifdef HAVE_PER_CQ_EVENT_PACKET
- struct mbuf *mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
+#if (MHLEN < 15)
+#error "MHLEN is too small"
+#endif
+ struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA);

if (mb != NULL) {
/* this code is used for debugging purpose only */
Ben RUBSON
2018-04-24 09:55:38 UTC
Permalink
On 24 Apr, Konstantin Belousov wrote:

> Hello,
> the patch below is of interest for people who use Mellanox Connect-X/4
> and 5 ethernet adapters and configure it for jumbo frames.

Hi Konstantin,

Good news !
Do you think your work could be easily ported to Connect-X/3 devices (mlx4
driver) ?

Thank you very much,

Ben

For completeness regarding mlx4, some work has been started on it :
https://reviews.freebsd.org/D11560
There's a discussion :
https://lists.freebsd.org/pipermail/freebsd-net/2017-June/048293.html
Which continues here :
https://lists.freebsd.org/pipermail/freebsd-net/2017-September/048906.html
Konstantin Belousov
2018-04-24 11:10:50 UTC
Permalink
On Tue, Apr 24, 2018 at 11:55:38AM +0200, Ben RUBSON wrote:
> On 24 Apr, Konstantin Belousov wrote:
>
> > Hello,
> > the patch below is of interest for people who use Mellanox Connect-X/4
> > and 5 ethernet adapters and configure it for jumbo frames.
>
> Hi Konstantin,
>
> Good news !
> Do you think your work could be easily ported to Connect-X/3 devices (mlx4
> driver) ?
No, it cannot be easily ported. The port might happen, but I do not
promise this.

>
> Thank you very much,
>
> Ben
>
> For completeness regarding mlx4, some work has been started on it :
> https://reviews.freebsd.org/D11560
> There's a discussion :
> https://lists.freebsd.org/pipermail/freebsd-net/2017-June/048293.html
> Which continues here :
> https://lists.freebsd.org/pipermail/freebsd-net/2017-September/048906.html
Rodney W. Grimes
2018-04-24 13:44:24 UTC
Permalink
> Hello,
> the patch below is of interest for people who use Mellanox Connect-X/4
> and 5 ethernet adapters and configure it for jumbo frames. The patch
> should improve the system behavior on low or fragmented memory situations.
> See the commit message for detailed description. Also, the patch should
> not cause performance degradation for the normal 1500 MTU by keeping the
> same configuration as currently unpatched driver.
>
> I am interested in hearing about regressions (or not) caused by the
> change.

I'll hopefully get to test this early next week, but my schedule
is very full until then.

I do have a fist full of sytle(9) and other comments though.

You can probably ignore my nits on capitalize start of comments,
it seems that this may be the style of this code....

> commit f25c0c624177a1ca06ca051b0adb842acb66ec11
> Author: Konstantin Belousov <***@mellanox.com>
> Date: Wed Nov 22 20:14:53 2017 +0200
>
> mlx5en: Handle jumbo frames without requiring big clusters.
>
> The scatter list is formed by the chunks of MCLBYTES each, and larger
> than default packets are returned to the stack as the mbuf chain.
>
> This is an improvements over the original patch by ***@mellanox.com
> which solves some busdma issues and sizes WQEs scatter list to only
> have the minimal sufficient number of elements. In particular, for
> the default MTU 1500 bytes the receive queue format is not changed
> comparing to the unpatched driver, avoiding a reported performance
> regression.
>
> Issue: 1065432
> Issue: 1184316
> Change-Id: Ib63a5ef55abd6ef1ec9b296e2cef88e4604bbd29
> Signed-off-by: Konstantin Belousov <***@mellanox.com>
>
> diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
> index b5000c32eaf..ff5ef446e34 100644
> --- a/sys/dev/mlx5/mlx5_en/en.h
> +++ b/sys/dev/mlx5/mlx5_en/en.h
> @@ -80,8 +80,19 @@
> #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa
> #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xe
>
> -/* freeBSD HW LRO is limited by 16KB - the size of max mbuf */
Though it is tempting to fix spelling errors and stylish things
while working on code it usually helps to sepeate those into
either a pre or post commit to reduce the lines of code change
that need to be looked at when reviewing a diff. This also helps
diff(1) to chunk things a bit better.

> +#define MLX5E_MAX_RX_SEGS 7
> +
> +#ifndef MLX5E_MAX_RX_BYTES
> +#define MLX5E_MAX_RX_BYTES MCLBYTES
> +#endif
> +
> +#if (MLX5E_MAX_RX_SEGS == 1)
> +/* FreeBSD HW LRO is limited by 16KB - the size of max mbuf */
> #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ MJUM16BYTES
> +#else
> +#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \
> + MIN(65535, MLX5E_MAX_RX_SEGS * MLX5E_MAX_RX_BYTES)
> +#endif
> #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10
> #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3
> #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS 0x20
> @@ -530,6 +541,7 @@ struct mlx5e_rq {
> struct mtx mtx;
> bus_dma_tag_t dma_tag;
> u32 wqe_sz;
> + u32 nsegs;

Alphabetic order, and should just use modify the current u32 line:
u32 nsegs, wqe_sz;

> struct mlx5e_rq_mbuf *mbuf;
> struct ifnet *ifp;
> struct mlx5e_rq_stats stats;
> @@ -795,9 +807,12 @@ struct mlx5e_tx_wqe {
>
> struct mlx5e_rx_wqe {
> struct mlx5_wqe_srq_next_seg next;
> - struct mlx5_wqe_data_seg data;
> + struct mlx5_wqe_data_seg data[];

None standard way to declare a pointer?

> };
>
> +/* the size of the structure above must be power of two */
the should be The.

> +CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe)));
> +
> struct mlx5e_eeprom {
> int lock_bit;
> int i2c_addr;
> diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> index c2c4b0d7744..25544e561e3 100644
> --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> @@ -33,9 +33,12 @@
> #ifndef ETH_DRIVER_VERSION
> #define ETH_DRIVER_VERSION "3.4.1"
> #endif
> +
Adding white space is a style change, and actually unwanted right here
as the next line is tightly coupled to the previous line.

> char mlx5e_version[] = "Mellanox Ethernet driver"
> " (" ETH_DRIVER_VERSION ")";
>
> +static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
> +

Could you move the function before use and eliminate the need
for the forward declare?

> struct mlx5e_channel_param {
> struct mlx5e_rq_param rq;
> struct mlx5e_sq_param sq;
> @@ -810,6 +813,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> int wq_sz;
> int err;
> int i;
> + u32 nsegs, wqe_sz;
> +
> + err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
> + if (err != 0)
> + goto done;
>
> /* Create DMA descriptor TAG */
> if ((err = -bus_dma_tag_create(
> @@ -819,9 +827,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> BUS_SPACE_MAXADDR, /* lowaddr */
> BUS_SPACE_MAXADDR, /* highaddr */
> NULL, NULL, /* filter, filterarg */
> - MJUM16BYTES, /* maxsize */
> - 1, /* nsegments */
> - MJUM16BYTES, /* maxsegsize */
> + nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */
> + nsegs, /* nsegments */
> + nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */
> 0, /* flags */
> NULL, NULL, /* lockfunc, lockfuncarg */
> &rq->dma_tag)))
> @@ -834,23 +842,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
>
> rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
>
> - if (priv->params.hw_lro_en) {
> - rq->wqe_sz = priv->params.lro_wqe_sz;
> - } else {
> - rq->wqe_sz = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> - }
> - if (rq->wqe_sz > MJUM16BYTES) {
> - err = -ENOMEM;
> + err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
> + if (err != 0)
> goto err_rq_wq_destroy;
> - } else if (rq->wqe_sz > MJUM9BYTES) {
> - rq->wqe_sz = MJUM16BYTES;
> - } else if (rq->wqe_sz > MJUMPAGESIZE) {
> - rq->wqe_sz = MJUM9BYTES;
> - } else if (rq->wqe_sz > MCLBYTES) {
> - rq->wqe_sz = MJUMPAGESIZE;
> - } else {
> - rq->wqe_sz = MCLBYTES;
> - }
>
> wq_sz = mlx5_wq_ll_get_size(&rq->wq);
>
> @@ -861,7 +855,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
> for (i = 0; i != wq_sz; i++) {
> struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
> +#if (MLX5E_MAX_RX_SEGS == 1)
> uint32_t byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
> +#else
> + int j;
> +#endif
>
> err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
> if (err != 0) {
> @@ -869,8 +867,15 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
> goto err_rq_mbuf_free;
> }
> - wqe->data.lkey = c->mkey_be;
> - wqe->data.byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
> +
> + /* set value for constant fields */
> +#if (MLX5E_MAX_RX_SEGS == 1)
> + wqe->data[0].lkey = c->mkey_be;
> + wqe->data[0].byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
> +#else
> + for (j = 0; j < rq->nsegs; j++)
> + wqe->data[j].lkey = c->mkey_be;
> +#endif
> }
>
> rq->ifp = c->ifp;
> @@ -1809,16 +1814,51 @@ mlx5e_close_channel_wait(struct mlx5e_channel *volatile *pp)
> free(c, M_MLX5EN);
> }
>
> +static int
> +mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
> +{
> + u32 r, n;
Alpah order on variables:
u32 n, r;

> +
> + r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz :
> + MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> + if (r > MJUM16BYTES)
> + return (-ENOMEM);
> +
> + if (r > MJUM9BYTES)
> + r = MJUM16BYTES;
> + else if (r > MJUMPAGESIZE)
> + r = MJUM9BYTES;
> + else if (r > MCLBYTES)
> + r = MJUMPAGESIZE;
> + else
> + r = MCLBYTES;
> +
> + /*
> + * n + 1 must be a power of two, because stride size must be.
> + * Stride size is 16 * (n + 1), as the first segment is
> + * control.
> + */
> + for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++)
> + ;
> +
> + *wqe_sz = r;
> + *nsegs = n;
> + return (0);
> +}
> +
> static void
> mlx5e_build_rq_param(struct mlx5e_priv *priv,
> struct mlx5e_rq_param *param)
> {
> void *rqc = param->rqc;
> void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
> + u32 wqe_sz, nsegs;
Variable order: nsegs, wqe_sz;

>
> + mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
> MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
> MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
> - MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe)));
> + MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
> + nsegs * sizeof(struct mlx5_wqe_data_seg)));
> MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
> MLX5_SET(wq, wq, pd, priv->pdn);
>
> diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> index fb14be43b32..5868b0a2f55 100644
> --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> @@ -32,21 +32,47 @@ static inline int
> mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
> struct mlx5e_rx_wqe *wqe, u16 ix)
> {
> - bus_dma_segment_t segs[1];
> + bus_dma_segment_t segs[rq->nsegs];
This code line compiles? Or another odd pointer declare?

> struct mbuf *mb;
> int nsegs;
> int err;
> -
> +#if (MLX5E_MAX_RX_SEGS != 1)
> + struct mbuf *mb_head;
> + int i;
> +#endif
> if (rq->mbuf[ix].mbuf != NULL)
> return (0);
>
> +#if (MLX5E_MAX_RX_SEGS == 1)
> mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
> if (unlikely(!mb))
> return (-ENOMEM);
>
> - /* set initial mbuf length */
> mb->m_pkthdr.len = mb->m_len = rq->wqe_sz;
> +#else
> + mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
> + MLX5E_MAX_RX_BYTES);
> + if (unlikely(mb == NULL))
> + return (-ENOMEM);
> +
> + mb->m_len = MLX5E_MAX_RX_BYTES;
> + mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES;
>
> + for (i = 1; i < rq->nsegs; i++) {
> + if (mb_head->m_pkthdr.len >= rq->wqe_sz)
> + break;
> + mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0,
> + MLX5E_MAX_RX_BYTES);
> + if (unlikely(mb == NULL)) {
> + m_freem(mb_head);
> + return (-ENOMEM);
> + }
> + mb->m_len = MLX5E_MAX_RX_BYTES;
> + mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES;
> + }
> + /* rewind to first mbuf in chain */
> + mb = mb_head;
> +#endif
> /* get IP header aligned */
> m_adj(mb, MLX5E_NET_IP_ALIGN);
>
> @@ -54,12 +80,26 @@ mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
> mb, segs, &nsegs, BUS_DMA_NOWAIT);
> if (err != 0)
> goto err_free_mbuf;
> - if (unlikely(nsegs != 1)) {
> + if (unlikely(nsegs == 0)) {
> bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map);
> err = -ENOMEM;
> goto err_free_mbuf;
> }
> - wqe->data.addr = cpu_to_be64(segs[0].ds_addr);
> +#if (MLX5E_MAX_RX_SEGS == 1)
> + wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> +#else
> + wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> + wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len |
> + MLX5_HW_START_PADDING);
> + for (i = 1; i != nsegs; i++) {
> + wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr);
> + wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len);
> + }
> + for (; i < rq->nsegs; i++) {
> + wqe->data[i].addr = 0;
> + wqe->data[i].byte_count = 0;
> + }
> +#endif
>
> rq->mbuf[ix].mbuf = mb;
> rq->mbuf[ix].data = mb->m_data;
> @@ -214,6 +254,9 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
> {
> struct ifnet *ifp = rq->ifp;
> struct mlx5e_channel *c;
> +#if (MLX5E_MAX_RX_SEGS != 1)
> + struct mbuf *mb_head;
> +#endif
> int lro_num_seg; /* HW LRO session aggregated packets counter */
> uint64_t tstmp;
>
> @@ -224,7 +267,26 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
> rq->stats.lro_bytes += cqe_bcnt;
> }
>
> +#if (MLX5E_MAX_RX_SEGS == 1)
> mb->m_pkthdr.len = mb->m_len = cqe_bcnt;
> +#else
> + mb->m_pkthdr.len = cqe_bcnt;
> + for (mb_head = mb; mb != NULL; mb = mb->m_next) {
> + if (mb->m_len > cqe_bcnt)
> + mb->m_len = cqe_bcnt;
> + cqe_bcnt -= mb->m_len;
> + if (likely(cqe_bcnt == 0)) {
> + if (likely(mb->m_next != NULL)) {
> + /* trim off empty mbufs */
Trim

> + m_freem(mb->m_next);
> + mb->m_next = NULL;
> + }
> + break;
> + }
> + }
> + /* rewind to first mbuf in chain */
Rewind

> + mb = mb_head;
> +#endif
> /* check if a Toeplitz hash was computed */
> if (cqe->rss_hash_type != 0) {
> mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result);
> @@ -447,7 +509,10 @@ mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq)
> int i = 0;
>
> #ifdef HAVE_PER_CQ_EVENT_PACKET
> - struct mbuf *mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
> +#if (MHLEN < 15)
> +#error "MHLEN is too small"
> +#endif
> + struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA);
>
> if (mb != NULL) {
> /* this code is used for debugging purpose only */
> _______________________________________________
> freebsd-***@freebsd.org mailing list
> https://lists.freebsd.org/mailman/listinfo/freebsd-net
> To unsubscribe, send any mail to "freebsd-net-***@freebsd.org"
>

--
Rod Grimes ***@freebsd.org
Konstantin Belousov
2018-04-24 13:59:59 UTC
Permalink
On Tue, Apr 24, 2018 at 06:44:24AM -0700, Rodney W. Grimes wrote:
> > Hello,
> > the patch below is of interest for people who use Mellanox Connect-X/4
> > and 5 ethernet adapters and configure it for jumbo frames. The patch
> > should improve the system behavior on low or fragmented memory situations.
> > See the commit message for detailed description. Also, the patch should
> > not cause performance degradation for the normal 1500 MTU by keeping the
> > same configuration as currently unpatched driver.
> >
> > I am interested in hearing about regressions (or not) caused by the
> > change.
>
> I'll hopefully get to test this early next week, but my schedule
> is very full until then.
>
> I do have a fist full of sytle(9) and other comments though.
>
> You can probably ignore my nits on capitalize start of comments,
> it seems that this may be the style of this code....
>
> > commit f25c0c624177a1ca06ca051b0adb842acb66ec11
> > Author: Konstantin Belousov <***@mellanox.com>
> > Date: Wed Nov 22 20:14:53 2017 +0200
> >
> > mlx5en: Handle jumbo frames without requiring big clusters.
> >
> > The scatter list is formed by the chunks of MCLBYTES each, and larger
> > than default packets are returned to the stack as the mbuf chain.
> >
> > This is an improvements over the original patch by ***@mellanox.com
> > which solves some busdma issues and sizes WQEs scatter list to only
> > have the minimal sufficient number of elements. In particular, for
> > the default MTU 1500 bytes the receive queue format is not changed
> > comparing to the unpatched driver, avoiding a reported performance
> > regression.
> >
> > Issue: 1065432
> > Issue: 1184316
> > Change-Id: Ib63a5ef55abd6ef1ec9b296e2cef88e4604bbd29
> > Signed-off-by: Konstantin Belousov <***@mellanox.com>
> >
> > diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
> > index b5000c32eaf..ff5ef446e34 100644
> > --- a/sys/dev/mlx5/mlx5_en/en.h
> > +++ b/sys/dev/mlx5/mlx5_en/en.h
> > @@ -80,8 +80,19 @@
> > #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa
> > #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xe
> >
> > -/* freeBSD HW LRO is limited by 16KB - the size of max mbuf */
> Though it is tempting to fix spelling errors and stylish things
> while working on code it usually helps to sepeate those into
> either a pre or post commit to reduce the lines of code change
> that need to be looked at when reviewing a diff. This also helps
> diff(1) to chunk things a bit better.
>
> > +#define MLX5E_MAX_RX_SEGS 7
> > +
> > +#ifndef MLX5E_MAX_RX_BYTES
> > +#define MLX5E_MAX_RX_BYTES MCLBYTES
> > +#endif
> > +
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > +/* FreeBSD HW LRO is limited by 16KB - the size of max mbuf */
> > #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ MJUM16BYTES
> > +#else
> > +#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \
> > + MIN(65535, MLX5E_MAX_RX_SEGS * MLX5E_MAX_RX_BYTES)
> > +#endif
> > #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10
> > #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3
> > #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS 0x20
> > @@ -530,6 +541,7 @@ struct mlx5e_rq {
> > struct mtx mtx;
> > bus_dma_tag_t dma_tag;
> > u32 wqe_sz;
> > + u32 nsegs;
>
> Alphabetic order, and should just use modify the current u32 line:
> u32 nsegs, wqe_sz;
>
> > struct mlx5e_rq_mbuf *mbuf;
> > struct ifnet *ifp;
> > struct mlx5e_rq_stats stats;
> > @@ -795,9 +807,12 @@ struct mlx5e_tx_wqe {
> >
> > struct mlx5e_rx_wqe {
> > struct mlx5_wqe_srq_next_seg next;
> > - struct mlx5_wqe_data_seg data;
> > + struct mlx5_wqe_data_seg data[];
>
> None standard way to declare a pointer?
This is not a pointer declaration. It is a flexible array member.

For the style issues, I will take look later.
>
> > };
> >
> > +/* the size of the structure above must be power of two */
> the should be The.
>
> > +CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe)));
> > +
> > struct mlx5e_eeprom {
> > int lock_bit;
> > int i2c_addr;
> > diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> > index c2c4b0d7744..25544e561e3 100644
> > --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> > +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> > @@ -33,9 +33,12 @@
> > #ifndef ETH_DRIVER_VERSION
> > #define ETH_DRIVER_VERSION "3.4.1"
> > #endif
> > +
> Adding white space is a style change, and actually unwanted right here
> as the next line is tightly coupled to the previous line.
>
> > char mlx5e_version[] = "Mellanox Ethernet driver"
> > " (" ETH_DRIVER_VERSION ")";
> >
> > +static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
> > +
>
> Could you move the function before use and eliminate the need
> for the forward declare?
>
> > struct mlx5e_channel_param {
> > struct mlx5e_rq_param rq;
> > struct mlx5e_sq_param sq;
> > @@ -810,6 +813,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> > int wq_sz;
> > int err;
> > int i;
> > + u32 nsegs, wqe_sz;
> > +
> > + err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
> > + if (err != 0)
> > + goto done;
> >
> > /* Create DMA descriptor TAG */
> > if ((err = -bus_dma_tag_create(
> > @@ -819,9 +827,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> > BUS_SPACE_MAXADDR, /* lowaddr */
> > BUS_SPACE_MAXADDR, /* highaddr */
> > NULL, NULL, /* filter, filterarg */
> > - MJUM16BYTES, /* maxsize */
> > - 1, /* nsegments */
> > - MJUM16BYTES, /* maxsegsize */
> > + nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */
> > + nsegs, /* nsegments */
> > + nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */
> > 0, /* flags */
> > NULL, NULL, /* lockfunc, lockfuncarg */
> > &rq->dma_tag)))
> > @@ -834,23 +842,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> >
> > rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
> >
> > - if (priv->params.hw_lro_en) {
> > - rq->wqe_sz = priv->params.lro_wqe_sz;
> > - } else {
> > - rq->wqe_sz = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> > - }
> > - if (rq->wqe_sz > MJUM16BYTES) {
> > - err = -ENOMEM;
> > + err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
> > + if (err != 0)
> > goto err_rq_wq_destroy;
> > - } else if (rq->wqe_sz > MJUM9BYTES) {
> > - rq->wqe_sz = MJUM16BYTES;
> > - } else if (rq->wqe_sz > MJUMPAGESIZE) {
> > - rq->wqe_sz = MJUM9BYTES;
> > - } else if (rq->wqe_sz > MCLBYTES) {
> > - rq->wqe_sz = MJUMPAGESIZE;
> > - } else {
> > - rq->wqe_sz = MCLBYTES;
> > - }
> >
> > wq_sz = mlx5_wq_ll_get_size(&rq->wq);
> >
> > @@ -861,7 +855,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> > rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
> > for (i = 0; i != wq_sz; i++) {
> > struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > uint32_t byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
> > +#else
> > + int j;
> > +#endif
> >
> > err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
> > if (err != 0) {
> > @@ -869,8 +867,15 @@ mlx5e_create_rq(struct mlx5e_channel *c,
> > bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
> > goto err_rq_mbuf_free;
> > }
> > - wqe->data.lkey = c->mkey_be;
> > - wqe->data.byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
> > +
> > + /* set value for constant fields */
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > + wqe->data[0].lkey = c->mkey_be;
> > + wqe->data[0].byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
> > +#else
> > + for (j = 0; j < rq->nsegs; j++)
> > + wqe->data[j].lkey = c->mkey_be;
> > +#endif
> > }
> >
> > rq->ifp = c->ifp;
> > @@ -1809,16 +1814,51 @@ mlx5e_close_channel_wait(struct mlx5e_channel *volatile *pp)
> > free(c, M_MLX5EN);
> > }
> >
> > +static int
> > +mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
> > +{
> > + u32 r, n;
> Alpah order on variables:
> u32 n, r;
>
> > +
> > + r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz :
> > + MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> > + if (r > MJUM16BYTES)
> > + return (-ENOMEM);
> > +
> > + if (r > MJUM9BYTES)
> > + r = MJUM16BYTES;
> > + else if (r > MJUMPAGESIZE)
> > + r = MJUM9BYTES;
> > + else if (r > MCLBYTES)
> > + r = MJUMPAGESIZE;
> > + else
> > + r = MCLBYTES;
> > +
> > + /*
> > + * n + 1 must be a power of two, because stride size must be.
> > + * Stride size is 16 * (n + 1), as the first segment is
> > + * control.
> > + */
> > + for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++)
> > + ;
> > +
> > + *wqe_sz = r;
> > + *nsegs = n;
> > + return (0);
> > +}
> > +
> > static void
> > mlx5e_build_rq_param(struct mlx5e_priv *priv,
> > struct mlx5e_rq_param *param)
> > {
> > void *rqc = param->rqc;
> > void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
> > + u32 wqe_sz, nsegs;
> Variable order: nsegs, wqe_sz;
>
> >
> > + mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
> > MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
> > MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
> > - MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe)));
> > + MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
> > + nsegs * sizeof(struct mlx5_wqe_data_seg)));
> > MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
> > MLX5_SET(wq, wq, pd, priv->pdn);
> >
> > diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> > index fb14be43b32..5868b0a2f55 100644
> > --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> > +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> > @@ -32,21 +32,47 @@ static inline int
> > mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
> > struct mlx5e_rx_wqe *wqe, u16 ix)
> > {
> > - bus_dma_segment_t segs[1];
> > + bus_dma_segment_t segs[rq->nsegs];
> This code line compiles? Or another odd pointer declare?
>
> > struct mbuf *mb;
> > int nsegs;
> > int err;
> > -
> > +#if (MLX5E_MAX_RX_SEGS != 1)
> > + struct mbuf *mb_head;
> > + int i;
> > +#endif
> > if (rq->mbuf[ix].mbuf != NULL)
> > return (0);
> >
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
> > if (unlikely(!mb))
> > return (-ENOMEM);
> >
> > - /* set initial mbuf length */
> > mb->m_pkthdr.len = mb->m_len = rq->wqe_sz;
> > +#else
> > + mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
> > + MLX5E_MAX_RX_BYTES);
> > + if (unlikely(mb == NULL))
> > + return (-ENOMEM);
> > +
> > + mb->m_len = MLX5E_MAX_RX_BYTES;
> > + mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES;
> >
> > + for (i = 1; i < rq->nsegs; i++) {
> > + if (mb_head->m_pkthdr.len >= rq->wqe_sz)
> > + break;
> > + mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0,
> > + MLX5E_MAX_RX_BYTES);
> > + if (unlikely(mb == NULL)) {
> > + m_freem(mb_head);
> > + return (-ENOMEM);
> > + }
> > + mb->m_len = MLX5E_MAX_RX_BYTES;
> > + mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES;
> > + }
> > + /* rewind to first mbuf in chain */
> > + mb = mb_head;
> > +#endif
> > /* get IP header aligned */
> > m_adj(mb, MLX5E_NET_IP_ALIGN);
> >
> > @@ -54,12 +80,26 @@ mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
> > mb, segs, &nsegs, BUS_DMA_NOWAIT);
> > if (err != 0)
> > goto err_free_mbuf;
> > - if (unlikely(nsegs != 1)) {
> > + if (unlikely(nsegs == 0)) {
> > bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map);
> > err = -ENOMEM;
> > goto err_free_mbuf;
> > }
> > - wqe->data.addr = cpu_to_be64(segs[0].ds_addr);
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > + wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> > +#else
> > + wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> > + wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len |
> > + MLX5_HW_START_PADDING);
> > + for (i = 1; i != nsegs; i++) {
> > + wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr);
> > + wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len);
> > + }
> > + for (; i < rq->nsegs; i++) {
> > + wqe->data[i].addr = 0;
> > + wqe->data[i].byte_count = 0;
> > + }
> > +#endif
> >
> > rq->mbuf[ix].mbuf = mb;
> > rq->mbuf[ix].data = mb->m_data;
> > @@ -214,6 +254,9 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
> > {
> > struct ifnet *ifp = rq->ifp;
> > struct mlx5e_channel *c;
> > +#if (MLX5E_MAX_RX_SEGS != 1)
> > + struct mbuf *mb_head;
> > +#endif
> > int lro_num_seg; /* HW LRO session aggregated packets counter */
> > uint64_t tstmp;
> >
> > @@ -224,7 +267,26 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
> > rq->stats.lro_bytes += cqe_bcnt;
> > }
> >
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > mb->m_pkthdr.len = mb->m_len = cqe_bcnt;
> > +#else
> > + mb->m_pkthdr.len = cqe_bcnt;
> > + for (mb_head = mb; mb != NULL; mb = mb->m_next) {
> > + if (mb->m_len > cqe_bcnt)
> > + mb->m_len = cqe_bcnt;
> > + cqe_bcnt -= mb->m_len;
> > + if (likely(cqe_bcnt == 0)) {
> > + if (likely(mb->m_next != NULL)) {
> > + /* trim off empty mbufs */
> Trim
>
> > + m_freem(mb->m_next);
> > + mb->m_next = NULL;
> > + }
> > + break;
> > + }
> > + }
> > + /* rewind to first mbuf in chain */
> Rewind
>
> > + mb = mb_head;
> > +#endif
> > /* check if a Toeplitz hash was computed */
> > if (cqe->rss_hash_type != 0) {
> > mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result);
> > @@ -447,7 +509,10 @@ mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq)
> > int i = 0;
> >
> > #ifdef HAVE_PER_CQ_EVENT_PACKET
> > - struct mbuf *mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
> > +#if (MHLEN < 15)
> > +#error "MHLEN is too small"
> > +#endif
> > + struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA);
> >
> > if (mb != NULL) {
> > /* this code is used for debugging purpose only */
> > _______________________________________________
> > freebsd-***@freebsd.org mailing list
> > https://lists.freebsd.org/mailman/listinfo/freebsd-net
> > To unsubscribe, send any mail to "freebsd-net-***@freebsd.org"
> >
>
> --
> Rod Grimes ***@freebsd.org
Ryan Stone
2018-04-25 20:04:13 UTC
Permalink
On Tue, Apr 24, 2018 at 4:55 AM, Konstantin Belousov
<***@gmail.com> wrote:
> +#ifndef MLX5E_MAX_RX_BYTES
> +#define MLX5E_MAX_RX_BYTES MCLBYTES
> +#endif

Why do you use a 2KB buffer rather than a PAGE_SIZE'd buffer?
MJUMPAGESIZE should offer significantly better performance for jumbo
frames without increasing the risk of memory fragmentation.
Rick Macklem
2018-04-25 22:21:43 UTC
Permalink
Ryan Stone wrote:
>On Tue, Apr 24, 2018 at 4:55 AM, Konstantin Belousov >><***@gmail.com>wrote:
>> +#ifndef MLX5E_MAX_RX_BYTES
>> +#define MLX5E_MAX_RX_BYTES MCLBYTES
>> +#endif
>
>Why do you use a 2KB buffer rather than a PAGE_SIZE'd buffer?
>MJUMPAGESIZE should offer significantly better performance for jumbo
>frames without increasing the risk of memory fragmentation.
Actually, when I was playing with using jumbo mbuf clusters for NFS, I was able
to get it to fragment to the point where allocations failed when mixing 2K and
4K mbuf clusters.
Admittedly I was using a 256Mbyte i386 and it wasn't easily reproduced, but
it was possible.
--> Using a mix of 2K and 4K mbuf clusters can result in fragmentation, although
I suspect that it isn't nearly as serious as what can happen when using 9K
mbuf clusters.

rick
Ben RUBSON
2018-04-26 06:43:01 UTC
Permalink
On 26 Apr 2018, Rick Macklem wrote:

> Ryan Stone wrote:
>> On Tue, Apr 24, 2018 at 4:55 AM, Konstantin Belousov
>> >><***@gmail.com>wrote:
>>> +#ifndef MLX5E_MAX_RX_BYTES
>>> +#define MLX5E_MAX_RX_BYTES MCLBYTES
>>> +#endif
>>
>> Why do you use a 2KB buffer rather than a PAGE_SIZE'd buffer?
>> MJUMPAGESIZE should offer significantly better performance for jumbo
>> frames without increasing the risk of memory fragmentation.
> Actually, when I was playing with using jumbo mbuf clusters for NFS, I
> was able
> to get it to fragment to the point where allocations failed when mixing
> 2K and
> 4K mbuf clusters.
> Admittedly I was using a 256Mbyte i386 and it wasn't easily reproduced, but
> it was possible.
> --> Using a mix of 2K and 4K mbuf clusters can result in fragmentation,
> although
> I suspect that it isn't nearly as serious as what can happen when using 9K
> mbuf clusters.

I used to face the fragmentation issue easily with MTU set to 9000
(x86_64 / 64GB / Connect-X/3).
I then decreased the MTU until 9K mbufs are not more used, to 4072 bytes
then.
The other used interface of this 2-ports card is set with a 1500 MTU.
Until now (about a year later), no more issue.

# vmstat -z | grep mbuf
ITEM SIZE LIMIT USED FREE REQ FAIL SLEEP
mbuf_packet: 256, 26080155, 16400, 9652, 999757417, 0, 0
mbuf: 256, 26080155, 16419, 11349, 85322301444, 0, 0
mbuf_cluster: 2048, 4075022, 26052, 550, 1059817, 0, 0
mbuf_jumbo_page: 4096, 2037511, 16400, 9522, 45863599682, 0, 0
mbuf_jumbo_9k: 9216, 603707, 0, 0, 0, 0, 0
mbuf_jumbo_16k: 16384, 339585, 0, 0, 0, 0, 0

Here's my experience :)

Ben
Konstantin Belousov
2018-04-26 08:02:58 UTC
Permalink
On Wed, Apr 25, 2018 at 04:04:13PM -0400, Ryan Stone wrote:
> On Tue, Apr 24, 2018 at 4:55 AM, Konstantin Belousov
> <***@gmail.com> wrote:
> > +#ifndef MLX5E_MAX_RX_BYTES
> > +#define MLX5E_MAX_RX_BYTES MCLBYTES
> > +#endif
>
> Why do you use a 2KB buffer rather than a PAGE_SIZE'd buffer?
> MJUMPAGESIZE should offer significantly better performance for jumbo
> frames without increasing the risk of memory fragmentation.
Part of the answer is that the patch was not written in one go (even not
by one person), but evolved, and this is how it shaped.

Another part is that indeed, as Rick stated, I am not sure about mixing
the different sizes for mbuf allocator. This might be more FUD than
factual-based considerations, but still.

I believe that the patch as is provides the important improvements.
If developing mlx4(4) change of the same nature, I will probably take
this into the exp stage from the beginning. For mlx5(4), I think
that the patch should be applied as is, then I might experiment
with PAGE_SIZE as the later step.
Continue reading on narkive:
Search results for 'mlx5(4) jumbo receive' (Questions and Answers)
7
replies
whats the best Electric guitar Ever???
started 2007-11-09 23:21:08 UTC
music
Loading...