From 39fa294f580a3be47fc2b502c143dec76737c8e6 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 18 Jun 2024 13:34:40 +0200
Subject: [PATCH 1/7] mlxsw: pci: Split NAPI setup/teardown into two steps

mlxsw_pci_cq_napi_setup() includes both NAPI initialization and
enablement, similar to teardown function. Next patches will add support
for page pool in mlxsw driver, then we use NAPI instance for page pool.

Page pool initialization should be done before NAPI enablement, same for
page pool destruction which should be done after NAPI disablement.

As preparation, split NAPI setup/teardown into two steps, then page pool
setup will be done between the phases.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/8dbf37e859f07247498fca17109b8858ff2b0498.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index bf66d996e32e7..fe6df5a92616b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -832,13 +832,10 @@ static void mlxsw_pci_cq_napi_setup(struct mlxsw_pci_queue *q,
 			       mlxsw_pci_napi_poll_cq_rx);
 		break;
 	}
-
-	napi_enable(&q->u.cq.napi);
 }
 
 static void mlxsw_pci_cq_napi_teardown(struct mlxsw_pci_queue *q)
 {
-	napi_disable(&q->u.cq.napi);
 	netif_napi_del(&q->u.cq.napi);
 }
 
@@ -875,6 +872,7 @@ static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	if (err)
 		return err;
 	mlxsw_pci_cq_napi_setup(q, mlxsw_pci_cq_type(mlxsw_pci, q));
+	napi_enable(&q->u.cq.napi);
 	mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
 	mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q);
 	return 0;
@@ -883,6 +881,7 @@ static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 static void mlxsw_pci_cq_fini(struct mlxsw_pci *mlxsw_pci,
 			      struct mlxsw_pci_queue *q)
 {
+	napi_disable(&q->u.cq.napi);
 	mlxsw_pci_cq_napi_teardown(q);
 	mlxsw_cmd_hw2sw_cq(mlxsw_pci->core, q->num);
 }

From 7555b7f3385fb0cf5416cf38b4f7bd2bc1312f23 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 18 Jun 2024 13:34:41 +0200
Subject: [PATCH 2/7] mlxsw: pci: Store CQ pointer as part of RDQ structure

Next patches will add support for page pool in mlxsw driver. Page pool will
be used to allocate buffers for RDQ and will use NAPI instance of the
appropriate CQ (RDQ is mapped 1:1 to CQ).

To allow pool initialization as part of CQ init, when NAPI is initialized,
page_pool structure will be as part of CQ structure. Later, the allocations
for RDQ will be done from the pool in the appropriate CQ. To allow access
to the appropriate pool, set CQ pointer as part of RDQ initialization.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/d60918ca1e142a554af1df9c1152cdac83854a3b.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index fe6df5a92616b..8751da09a6c1b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -92,6 +92,9 @@ struct mlxsw_pci_queue {
 		struct {
 			struct tasklet_struct tasklet;
 		} eq;
+		struct {
+			struct mlxsw_pci_queue *cq;
+		} rdq;
 	} u;
 };
 
@@ -434,6 +437,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 
 	cq = mlxsw_pci_cq_get(mlxsw_pci, cq_num);
 	cq->u.cq.dq = q;
+	q->u.rdq.cq = cq;
 
 	mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
 
@@ -455,6 +459,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
 		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
 	}
+	q->u.rdq.cq = NULL;
 	cq->u.cq.dq = NULL;
 	mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);
 

From 5642c6a086939bd1c0bba692f3a4afe4df668a50 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 18 Jun 2024 13:34:42 +0200
Subject: [PATCH 3/7] mlxsw: pci: Initialize page pool per CQ

Next patch will use page pool to allocate buffers for RDQ. Initialize
page pool for each CQ, which is mapped 1:1 to RDQ. Page pool for each Rx
queue enhances Rx side performance by reclaiming buffers back to each queue
specific pool.

When only one NAPI instance is the consumer of pages from page pool, it is
recommended to pass it as part of 'page_pool_params', then page pool APIs
will be done without special locks. mlxsw driver holds NAPI instance per
CQ, so add page pool per CQ and use the existing NAPI instance.

For now, pages are not allocated from the pool, next patch will use it.

Some notes regarding 'page_pool_params':
* Use PP_FLAG_DMA_MAP to allow page pool handles DMA mapping, for now
  do not use sync flag, as only the device writes to this memory and we
  read it only when it finishes writing there. This will probably be
  changed when we will support XDP.
* Define 'order' according to maximum MTU and take into account software
  overhead. Some round up are done, which means that we allocate more pages
  than we really need. This can be improved later by using fragmented
  buffers.
* Use pool_size = MLXSW_PCI_WQE_COUNT. This will be the size of 'ptr_ring',
  and should be the maximum amount of packets that page pool will allocate
  memory for. In our case, this is the queue size, defined as
  MLXSW_PCI_WQE_COUNT.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/02e5856ae7c572d4293ce6bb92c286ee6cfec800.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/Kconfig |  1 +
 drivers/net/ethernet/mellanox/mlxsw/pci.c   | 60 ++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/Kconfig b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
index a510bf2cff2f6..74f7e27b490f9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
@@ -33,6 +33,7 @@ config MLXSW_CORE_THERMAL
 config MLXSW_PCI
 	tristate "PCI bus implementation for Mellanox Technologies Switch ASICs"
 	depends on PCI && HAS_IOMEM && MLXSW_CORE
+	select PAGE_POOL
 	default m
 	help
 	  This is PCI bus implementation for Mellanox Technologies Switch ASICs.
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 8751da09a6c1b..2486f0fde5d9d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -13,6 +13,7 @@
 #include <linux/if_vlan.h>
 #include <linux/log2.h>
 #include <linux/string.h>
+#include <net/page_pool/helpers.h>
 
 #include "pci_hw.h"
 #include "pci.h"
@@ -88,6 +89,7 @@ struct mlxsw_pci_queue {
 			enum mlxsw_pci_cqe_v v;
 			struct mlxsw_pci_queue *dq;
 			struct napi_struct napi;
+			struct page_pool *page_pool;
 		} cq;
 		struct {
 			struct tasklet_struct tasklet;
@@ -338,6 +340,12 @@ static void mlxsw_pci_sdq_fini(struct mlxsw_pci *mlxsw_pci,
 	mlxsw_cmd_hw2sw_sdq(mlxsw_pci->core, q->num);
 }
 
+#define MLXSW_PCI_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)
+
+#define MLXSW_PCI_RX_BUF_SW_OVERHEAD		\
+		(MLXSW_PCI_SKB_HEADROOM +	\
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
 static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe,
 				  int index, char *frag_data, size_t frag_len,
 				  int direction)
@@ -844,9 +852,47 @@ static void mlxsw_pci_cq_napi_teardown(struct mlxsw_pci_queue *q)
 	netif_napi_del(&q->u.cq.napi);
 }
 
+static int mlxsw_pci_cq_page_pool_init(struct mlxsw_pci_queue *q,
+				       enum mlxsw_pci_cq_type cq_type)
+{
+	struct page_pool_params pp_params = {};
+	struct mlxsw_pci *mlxsw_pci = q->pci;
+	struct page_pool *page_pool;
+	u32 max_pkt_size;
+
+	if (cq_type != MLXSW_PCI_CQ_RDQ)
+		return 0;
+
+	max_pkt_size = MLXSW_PORT_MAX_MTU + MLXSW_PCI_RX_BUF_SW_OVERHEAD;
+	pp_params.order = get_order(max_pkt_size);
+	pp_params.flags = PP_FLAG_DMA_MAP;
+	pp_params.pool_size = MLXSW_PCI_WQE_COUNT;
+	pp_params.nid = dev_to_node(&mlxsw_pci->pdev->dev);
+	pp_params.dev = &mlxsw_pci->pdev->dev;
+	pp_params.napi = &q->u.cq.napi;
+	pp_params.dma_dir = DMA_FROM_DEVICE;
+
+	page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(page_pool))
+		return PTR_ERR(page_pool);
+
+	q->u.cq.page_pool = page_pool;
+	return 0;
+}
+
+static void mlxsw_pci_cq_page_pool_fini(struct mlxsw_pci_queue *q,
+					enum mlxsw_pci_cq_type cq_type)
+{
+	if (cq_type != MLXSW_PCI_CQ_RDQ)
+		return;
+
+	page_pool_destroy(q->u.cq.page_pool);
+}
+
 static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 			     struct mlxsw_pci_queue *q)
 {
+	enum mlxsw_pci_cq_type cq_type = mlxsw_pci_cq_type(mlxsw_pci, q);
 	int i;
 	int err;
 
@@ -876,17 +922,29 @@ static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	err = mlxsw_cmd_sw2hw_cq(mlxsw_pci->core, mbox, q->num);
 	if (err)
 		return err;
-	mlxsw_pci_cq_napi_setup(q, mlxsw_pci_cq_type(mlxsw_pci, q));
+	mlxsw_pci_cq_napi_setup(q, cq_type);
+
+	err = mlxsw_pci_cq_page_pool_init(q, cq_type);
+	if (err)
+		goto err_page_pool_init;
+
 	napi_enable(&q->u.cq.napi);
 	mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
 	mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q);
 	return 0;
+
+err_page_pool_init:
+	mlxsw_pci_cq_napi_teardown(q);
+	return err;
 }
 
 static void mlxsw_pci_cq_fini(struct mlxsw_pci *mlxsw_pci,
 			      struct mlxsw_pci_queue *q)
 {
+	enum mlxsw_pci_cq_type cq_type = mlxsw_pci_cq_type(mlxsw_pci, q);
+
 	napi_disable(&q->u.cq.napi);
+	mlxsw_pci_cq_page_pool_fini(q, cq_type);
 	mlxsw_pci_cq_napi_teardown(q);
 	mlxsw_cmd_hw2sw_cq(mlxsw_pci->core, q->num);
 }

From b5b60bb491b26b0e5961e021b2988562eaa6a8c7 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 18 Jun 2024 13:34:43 +0200
Subject: [PATCH 4/7] mlxsw: pci: Use page pool for Rx buffers allocation

As part of driver init, all Rx queues are filled with buffers for
hardware usage. Later, when a packet is received, a new buffer should be
allocated to be used by hardware instead of the received buffer.
Packet's processing time includes allocation time, which can be improved
using page pool.

Using page pool, DMA mapping is done only for first allocation of buffers.
As subsequent buffers allocation avoid DMA mapping, it results in
performance improvement. The purpose of page pool is to allocate pages fast
from cache without locking. This lockless guarantee naturally comes from
running under a NAPI.

Use page pool to allocate the data buffer only, so hardware will use it to
fill the packet. At completion time, attach the data buffer (now filled
with packet payload) to new SKB which is allocated around the received
buffer. SKB building at completion time prevents cache miss for each
packet, as now the SKB is allocated right before packets will be handled by
networking stack.

Page pool for each Rx queue enhances Rx side performance by reclaiming
buffers back to each queue specific pool. This change significantly
improves driver performance, CPU can handle about 345% of the packets per
second it previously handled.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/1cf788a8f43c70aae6d526018ef77becb27ad6d3.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 103 ++++++++++++++--------
 1 file changed, 64 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 2486f0fde5d9d..4b34db2ae8f01 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -62,6 +62,7 @@ struct mlxsw_pci_mem_item {
 };
 
 struct mlxsw_pci_queue_elem_info {
+	struct page *page;
 	char *elem; /* pointer to actual dma mapped element mem chunk */
 	union {
 		struct {
@@ -346,6 +347,19 @@ static void mlxsw_pci_sdq_fini(struct mlxsw_pci *mlxsw_pci,
 		(MLXSW_PCI_SKB_HEADROOM +	\
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 
+static void
+mlxsw_pci_wqe_rx_frag_set(struct mlxsw_pci *mlxsw_pci, struct page *page,
+			  char *wqe, int index, size_t frag_len)
+{
+	dma_addr_t mapaddr;
+
+	mapaddr = page_pool_get_dma_addr(page);
+	mapaddr += MLXSW_PCI_SKB_HEADROOM;
+
+	mlxsw_pci_wqe_address_set(wqe, index, mapaddr);
+	mlxsw_pci_wqe_byte_count_set(wqe, index, frag_len);
+}
+
 static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe,
 				  int index, char *frag_data, size_t frag_len,
 				  int direction)
@@ -375,43 +389,46 @@ static void mlxsw_pci_wqe_frag_unmap(struct mlxsw_pci *mlxsw_pci, char *wqe,
 	dma_unmap_single(&pdev->dev, mapaddr, frag_len, direction);
 }
 
-static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci,
-				   struct mlxsw_pci_queue_elem_info *elem_info,
-				   gfp_t gfp)
+static struct sk_buff *mlxsw_pci_rdq_build_skb(struct page *page,
+					       u16 byte_count)
+{
+	void *data = page_address(page);
+	unsigned int allocated_size;
+	struct sk_buff *skb;
+
+	allocated_size = page_size(page);
+	skb = napi_build_skb(data, allocated_size);
+	if (unlikely(!skb))
+		return ERR_PTR(-ENOMEM);
+
+	skb_reserve(skb, MLXSW_PCI_SKB_HEADROOM);
+	skb_put(skb, byte_count);
+	return skb;
+}
+
+static int mlxsw_pci_rdq_page_alloc(struct mlxsw_pci_queue *q,
+				    struct mlxsw_pci_queue_elem_info *elem_info)
 {
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
 	size_t buf_len = MLXSW_PORT_MAX_MTU;
 	char *wqe = elem_info->elem;
-	struct sk_buff *skb;
-	int err;
+	struct page *page;
 
-	skb = __netdev_alloc_skb_ip_align(NULL, buf_len, gfp);
-	if (!skb)
+	page = page_pool_dev_alloc_pages(cq->u.cq.page_pool);
+	if (unlikely(!page))
 		return -ENOMEM;
 
-	err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
-				     buf_len, DMA_FROM_DEVICE);
-	if (err)
-		goto err_frag_map;
-
-	elem_info->u.rdq.skb = skb;
+	mlxsw_pci_wqe_rx_frag_set(q->pci, page, wqe, 0, buf_len);
+	elem_info->page = page;
 	return 0;
-
-err_frag_map:
-	dev_kfree_skb_any(skb);
-	return err;
 }
 
-static void mlxsw_pci_rdq_skb_free(struct mlxsw_pci *mlxsw_pci,
-				   struct mlxsw_pci_queue_elem_info *elem_info)
+static void mlxsw_pci_rdq_page_free(struct mlxsw_pci_queue *q,
+				    struct mlxsw_pci_queue_elem_info *elem_info)
 {
-	struct sk_buff *skb;
-	char *wqe;
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
 
-	skb = elem_info->u.rdq.skb;
-	wqe = elem_info->elem;
-
-	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
-	dev_kfree_skb_any(skb);
+	page_pool_put_page(cq->u.cq.page_pool, elem_info->page, -1, false);
 }
 
 static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
@@ -452,7 +469,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	for (i = 0; i < q->count; i++) {
 		elem_info = mlxsw_pci_queue_elem_info_producer_get(q);
 		BUG_ON(!elem_info);
-		err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_KERNEL);
+		err = mlxsw_pci_rdq_page_alloc(q, elem_info);
 		if (err)
 			goto rollback;
 		/* Everything is set up, ring doorbell to pass elem to HW */
@@ -465,7 +482,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 rollback:
 	for (i--; i >= 0; i--) {
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
-		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+		mlxsw_pci_rdq_page_free(q, elem_info);
 	}
 	q->u.rdq.cq = NULL;
 	cq->u.cq.dq = NULL;
@@ -483,7 +500,7 @@ static void mlxsw_pci_rdq_fini(struct mlxsw_pci *mlxsw_pci,
 	mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);
 	for (i = 0; i < q->count; i++) {
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
-		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+		mlxsw_pci_rdq_page_free(q, elem_info);
 	}
 }
 
@@ -618,26 +635,38 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 {
 	struct pci_dev *pdev = mlxsw_pci->pdev;
 	struct mlxsw_pci_queue_elem_info *elem_info;
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
 	struct mlxsw_rx_info rx_info = {};
-	char wqe[MLXSW_PCI_WQE_SIZE];
 	struct sk_buff *skb;
+	struct page *page;
 	u16 byte_count;
 	int err;
 
 	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
-	skb = elem_info->u.rdq.skb;
-	memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE);
 
 	if (q->consumer_counter++ != consumer_counter_limit)
 		dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");
 
-	err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_ATOMIC);
+	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
+	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
+		byte_count -= ETH_FCS_LEN;
+
+	page = elem_info->page;
+
+	err = mlxsw_pci_rdq_page_alloc(q, elem_info);
 	if (err) {
-		dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+		dev_err_ratelimited(&pdev->dev, "Failed to alloc page\n");
+		goto out;
+	}
+
+	skb = mlxsw_pci_rdq_build_skb(page, byte_count);
+	if (IS_ERR(skb)) {
+		dev_err_ratelimited(&pdev->dev, "Failed to build skb for RDQ\n");
+		page_pool_recycle_direct(cq->u.cq.page_pool, page);
 		goto out;
 	}
 
-	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+	skb_mark_for_recycle(skb);
 
 	if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
 		rx_info.is_lag = true;
@@ -670,10 +699,6 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 
 	mlxsw_pci_skb_cb_ts_set(mlxsw_pci, skb, cqe_v, cqe);
 
-	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
-	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
-		byte_count -= ETH_FCS_LEN;
-	skb_put(skb, byte_count);
 	mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);
 
 out:

From 0f3cd437a1d88b2a77dfb146c88a297bea974ceb Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 18 Jun 2024 13:34:44 +0200
Subject: [PATCH 5/7] mlxsw: pci: Optimize data buffer access

Before accessing data buffer, call net_prefetch() to load it into the
cache. This change improves driver performance, CPU can handle about
7.1% more packets per second.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/1fa07c510890866a6f201163ab7e78890ba28b3b.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 4b34db2ae8f01..6f41747e8a763 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -396,6 +396,7 @@ static struct sk_buff *mlxsw_pci_rdq_build_skb(struct page *page,
 	unsigned int allocated_size;
 	struct sk_buff *skb;
 
+	net_prefetch(data);
 	allocated_size = page_size(page);
 	skb = napi_build_skb(data, allocated_size);
 	if (unlikely(!skb))

From e8441b1f6b64e7157b9e7777a2f7b79c2215cba1 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 18 Jun 2024 13:34:45 +0200
Subject: [PATCH 6/7] mlxsw: pci: Do not store SKB for RDQ elements

The previous patch used page pool to allocate buffers for RDQ. With this
change, 'elem_info->u.rdq.skb' is not used anymore, as we do not allocate
SKB before getting the packet, we hold page pointer and build the SKB
around it once packet is received.

Remove the union and store SKB pointer for SDQ only.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/23a531008936dc9a1a298643fb1e4f9a7b8e6eb3.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 6f41747e8a763..b530b6a80ab74 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -64,14 +64,9 @@ struct mlxsw_pci_mem_item {
 struct mlxsw_pci_queue_elem_info {
 	struct page *page;
 	char *elem; /* pointer to actual dma mapped element mem chunk */
-	union {
-		struct {
-			struct sk_buff *skb;
-		} sdq;
-		struct {
-			struct sk_buff *skb;
-		} rdq;
-	} u;
+	struct {
+		struct sk_buff *skb;
+	} sdq;
 };
 
 struct mlxsw_pci_queue {
@@ -557,8 +552,8 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
 
 	spin_lock(&q->lock);
 	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
-	tx_info = mlxsw_skb_cb(elem_info->u.sdq.skb)->tx_info;
-	skb = elem_info->u.sdq.skb;
+	tx_info = mlxsw_skb_cb(elem_info->sdq.skb)->tx_info;
+	skb = elem_info->sdq.skb;
 	wqe = elem_info->elem;
 	for (i = 0; i < MLXSW_PCI_WQE_SG_ENTRIES; i++)
 		mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, i, DMA_TO_DEVICE);
@@ -573,7 +568,7 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
 
 	if (skb)
 		dev_kfree_skb_any(skb);
-	elem_info->u.sdq.skb = NULL;
+	elem_info->sdq.skb = NULL;
 
 	if (q->consumer_counter++ != consumer_counter_limit)
 		dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in SDQ\n");
@@ -2007,7 +2002,7 @@ static int mlxsw_pci_skb_transmit(void *bus_priv, struct sk_buff *skb,
 		goto unlock;
 	}
 	mlxsw_skb_cb(skb)->tx_info = *tx_info;
-	elem_info->u.sdq.skb = skb;
+	elem_info->sdq.skb = skb;
 
 	wqe = elem_info->elem;
 	mlxsw_pci_wqe_c_set(wqe, 1); /* always report completion */

From d94ae6415becdf437d58e60d1f153af39dddb247 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 18 Jun 2024 13:34:46 +0200
Subject: [PATCH 7/7] mlxsw: pci: Use napi_consume_skb() to free SKB as part of
 Tx completion

Currently, as part of Tx completion, the driver calls dev_kfree_skb_any()
to free the SKB. For this flow, the correct function is napi_consume_skb().
This function and dev_consume_skb_any() were added to be used for consumed
SKBs, which were not dropped, so the skb:kfree_skb tracepoint is not
triggered, and we can get better diagnostics about dropped packets.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/a9f9f3dc884c0d1be4bd4c9d72030c88c7ac004f.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index b530b6a80ab74..cb043379c01c7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -541,7 +541,7 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
 				     struct mlxsw_pci_queue *q,
 				     u16 consumer_counter_limit,
 				     enum mlxsw_pci_cqe_v cqe_v,
-				     char *cqe)
+				     char *cqe, int budget)
 {
 	struct pci_dev *pdev = mlxsw_pci->pdev;
 	struct mlxsw_pci_queue_elem_info *elem_info;
@@ -567,7 +567,7 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
 	}
 
 	if (skb)
-		dev_kfree_skb_any(skb);
+		napi_consume_skb(skb, budget);
 	elem_info->sdq.skb = NULL;
 
 	if (q->consumer_counter++ != consumer_counter_limit)
@@ -819,7 +819,7 @@ static int mlxsw_pci_napi_poll_cq_tx(struct napi_struct *napi, int budget)
 		mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
 
 		mlxsw_pci_cqe_sdq_handle(mlxsw_pci, sdq,
-					 wqe_counter, q->u.cq.v, ncqe);
+					 wqe_counter, q->u.cq.v, ncqe, budget);
 
 		work_done++;
 	}