2021-09-22 09:46:38

by Yunsheng Lin

Subject: [PATCH net-next 0/7] some optimization for page pool

Patch 1: disable dma mapping support for 32-bit arch with 64-bit
DMA.
Patch 2: support non-split page when PP_FLAG_PAGE_FRAG is set.
Patch 3: avoid calling compound_head() for skb frag page
Patch 4-7: use pp_magic to identify pp page uniquely.

V3:
1. add patch 1/4/6/7.
2. use pp_magic to identify pp page uniquely too.
3. avoid unnecessary compound_head() calling.

V2: add patch 2, adjust the commit log according to the discussion
in V1, and fix a compiler error reported by the kernel test robot.

Yunsheng Lin (7):
page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA
page_pool: support non-split page with PP_FLAG_PAGE_FRAG
page_pool: avoid calling compound_head() for skb frag page
page_pool: change BIAS_MAX to support incrementing
skbuff: keep track of pp page when __skb_frag_ref() is called
skbuff: only use pp_magic identifier for a skb's head page
skbuff: remove unused skb->pp_recycle

.../net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ---
drivers/net/ethernet/marvell/mvneta.c | 2 -
.../net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 +-
drivers/net/ethernet/marvell/sky2.c | 2 +-
drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
drivers/net/ethernet/ti/cpsw.c | 2 -
drivers/net/ethernet/ti/cpsw_new.c | 2 -
include/linux/mm_types.h | 13 +-----
include/linux/skbuff.h | 39 ++++++++----------
include/net/page_pool.h | 31 ++++++++------
net/core/page_pool.c | 40 +++++++------------
net/core/skbuff.c | 36 ++++++-----------
net/tls/tls_device.c | 2 +-
13 files changed, 67 insertions(+), 114 deletions(-)

--
2.33.0


2021-09-22 09:46:43

by Yunsheng Lin

Subject: [PATCH net-next 5/7] skbuff: keep track of pp page when __skb_frag_ref() is called

skb->pp_recycle and page->pp_magic may not be enough to track
whether a frag page is from the page pool after __skb_frag_ref()
is called, mostly because of a data race; see:
commit 2cc3aeb5eccc ("skbuff: Fix a potential race while
recycling page_pool packets").

There may also be clone-and-expand-head cases that lose track of
whether a frag page is from the page pool.

Not being able to keep track of pp pages may also cause a problem
for the skb_split() case in tso_fragment():
Suppose a skb has 3 frag pages, all coming from a page pool, and is
split into skb1 and skb2:
skb1: first frag page + first half of second frag page
skb2: second half of second frag page + third frag page

How do we set skb->pp_recycle for skb1 and skb2?
1. If we set both of them to 1, then we may have a race similar to
the one in the above commit for the second frag page.
2. If we set only one of them to 1, then we may leak resources, as
both the first frag page and the third frag page are indeed from a
page pool.

So increment the frag count when __skb_frag_ref() is called, and
use only page->pp_magic to indicate whether a frag page is from a
page pool, to avoid the above data race.
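
To make the direction concrete, here is a rough sketch of the resulting
rule (illustration only, not the call chain added by this patch, which
goes through __skb_frag_unref() and page_pool_return_skb_page()): every
user of a pp frag page holds one pp_frag_count reference, so in the
skb_split() example above the shared second frag page ends up with a
count of 2 (skb_split() takes the extra reference via __skb_frag_ref()),
and only the last user returns the page to its pool:

/* Illustration only: whoever drops the last pp_frag_count reference
 * returns the page to its page pool, independent of any per-skb flag.
 */
static void pp_frag_release_sketch(struct page *page)
{
	if (atomic_long_dec_return(&page->pp_frag_count) > 0)
		return;		/* not the last user yet */

	page_pool_put_full_page(page->pp, page, false);
}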

Signed-off-by: Yunsheng Lin <[email protected]>
---
drivers/net/ethernet/marvell/sky2.c | 2 +-
drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
include/linux/skbuff.h | 30 +++++++++++++++++-----
include/net/page_pool.h | 19 +++++++++++++-
net/core/page_pool.c | 14 +---------
net/core/skbuff.c | 4 +--
net/tls/tls_device.c | 2 +-
7 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index e9fc74e54b22..91b62f468a26 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -2503,7 +2503,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,

if (length == 0) {
/* don't need this page */
- __skb_frag_unref(frag, false);
+ __skb_frag_unref(frag);
--skb_shinfo(skb)->nr_frags;
} else {
size = min(length, (unsigned) PAGE_SIZE);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 7f6d3b82c29b..ce31b1fd554f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -526,7 +526,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
fail:
while (nr > 0) {
nr--;
- __skb_frag_unref(skb_shinfo(skb)->frags + nr, false);
+ __skb_frag_unref(skb_shinfo(skb)->frags + nr);
}
return 0;
}
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 35eebc2310a5..a2d3b6fe0c32 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3073,7 +3073,16 @@ static inline struct page *skb_frag_page(const skb_frag_t *frag)
*/
static inline void __skb_frag_ref(skb_frag_t *frag)
{
- get_page(skb_frag_page(frag));
+ struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+ if (page_pool_is_pp_page(page)) {
+ page_pool_atomic_inc_frag_count(page);
+ return;
+ }
+#endif
+
+ get_page(page);
}

/**
@@ -3091,18 +3100,19 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
/**
* __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment
- * @recycle: recycle the page if allocated via page_pool
*
* Releases a reference on the paged fragment @frag
* or recycles the page via the page_pool API.
*/
-static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
+static inline void __skb_frag_unref(skb_frag_t *frag)
{
struct page *page = skb_frag_page(frag);

#ifdef CONFIG_PAGE_POOL
- if (recycle && page_pool_return_skb_page(page))
+ if (page_pool_is_pp_page(page)) {
+ page_pool_return_skb_page(page);
return;
+ }
#endif
put_page(page);
}
@@ -3116,7 +3126,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
*/
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
- __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+ __skb_frag_unref(&skb_shinfo(skb)->frags[f]);
}

/**
@@ -4720,9 +4730,17 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)

static inline bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
+ struct page *page;
+
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return page_pool_return_skb_page(virt_to_head_page(data));
+
+ page = virt_to_head_page(data);
+ if (!page_pool_is_pp_page(page))
+ return false;
+
+ page_pool_return_skb_page(page);
+ return true;
}

#endif /* __KERNEL__ */
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 3855f069627f..f9738da37d9f 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -164,7 +164,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
return pool->p.dma_dir;
}

-bool page_pool_return_skb_page(struct page *page);
+void page_pool_return_skb_page(struct page *page);

struct page_pool *page_pool_create(const struct page_pool_params *params);

@@ -231,6 +231,23 @@ static inline void page_pool_set_frag_count(struct page *page, long nr)
atomic_long_set(&page->pp_frag_count, nr);
}

+static inline void page_pool_atomic_inc_frag_count(struct page *page)
+{
+ atomic_long_inc(&page->pp_frag_count);
+}
+
+static inline bool page_pool_is_pp_page(struct page *page)
+{
+ /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+ * in order to preserve any existing bits, such as bit 0 for the
+ * head page of compound page and bit 1 for pfmemalloc page, so
+ * mask those bits for freeing side when doing below checking,
+ * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+ * to avoid recycling the pfmemalloc page.
+ */
+ return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
+}
+
static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
long nr)
{
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e9516477f9d2..96a28accce0e 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -738,20 +738,10 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
EXPORT_SYMBOL(page_pool_update_nid);

-bool page_pool_return_skb_page(struct page *page)
+void page_pool_return_skb_page(struct page *page)
{
struct page_pool *pp;

- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
- return false;
-
pp = page->pp;

/* Driver set this to memory recycling info. Reset it on recycle.
@@ -760,7 +750,5 @@ bool page_pool_return_skb_page(struct page *page)
* 'flipped' fragment being in use or not.
*/
page_pool_put_full_page(pp, page, false);
-
- return true;
}
EXPORT_SYMBOL(page_pool_return_skb_page);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2170bea2c7de..db8af3eff255 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -668,7 +668,7 @@ static void skb_release_data(struct sk_buff *skb)
skb_zcopy_clear(skb, true);

for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+ __skb_frag_unref(&shinfo->frags[i]);

if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
@@ -3563,7 +3563,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragto = &skb_shinfo(tgt)->frags[merge];

skb_frag_size_add(fragto, skb_frag_size(fragfrom));
- __skb_frag_unref(fragfrom, skb->pp_recycle);
+ __skb_frag_unref(fragfrom);
}

/* Reposition in the original skb */
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b932469ee69c..bd9f1567aa39 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record)
int i;

for (i = 0; i < record->num_frags; i++)
- __skb_frag_unref(&record->frags[i], false);
+ __skb_frag_unref(&record->frags[i]);
kfree(record);
}

--
2.33.0

2021-09-22 09:46:46

by Yunsheng Lin

Subject: [PATCH net-next 7/7] skbuff: remove unused skb->pp_recycle

As pp_magic is now used to identify pp pages for both the head
page and the frag pages of a skb, skb->pp_recycle is no longer
used, so remove it.
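
In other words (a minimal illustration using the helpers added earlier
in this series, not code from this patch), whether a frag page gets
recycled is now decided purely from the page itself, with no skb-level
state:

/* Illustration only: the freeing side looks at the page, not the skb. */
static void frag_page_release(struct page *page)
{
	if (page_pool_is_pp_page(page))
		page_pool_return_skb_page(page);	/* back to its page pool */
	else
		put_page(page);				/* normal page freeing */
}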

Signed-off-by: Yunsheng Lin <[email protected]>
---
.../net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ------
drivers/net/ethernet/marvell/mvneta.c | 2 --
.../net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 +---
drivers/net/ethernet/ti/cpsw.c | 2 --
drivers/net/ethernet/ti/cpsw_new.c | 2 --
include/linux/skbuff.h | 12 +----------
net/core/skbuff.c | 21 +------------------
7 files changed, 3 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 22af3d6ce178..5331e0f2cee4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3864,9 +3864,6 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
return 0;
}

- if (ring->page_pool)
- skb_mark_for_recycle(skb);
-
u64_stats_update_begin(&ring->syncp);
ring->stats.seg_pkt_cnt++;
u64_stats_update_end(&ring->syncp);
@@ -3906,9 +3903,6 @@ static int hns3_add_frag(struct hns3_enet_ring *ring)
return -ENXIO;
}

- if (ring->page_pool)
- skb_mark_for_recycle(new_skb);
-
ring->frag_num = 0;

if (ring->tail_skb) {
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 9d460a270601..c852e0dd6d38 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2327,8 +2327,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
if (!skb)
return ERR_PTR(-ENOMEM);

- skb_mark_for_recycle(skb);
-
skb_reserve(skb, xdp->data - xdp->data_hard_start);
skb_put(skb, xdp->data_end - xdp->data);
skb->ip_summed = mvneta_rx_csum(pp, desc_status);
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index d5c92e43f89e..bacae115c6c6 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3994,9 +3994,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
goto err_drop_frame;
}

- if (pp)
- skb_mark_for_recycle(skb);
- else
+ if (!pp)
dma_unmap_single_attrs(dev->dev.parent, dma_addr,
bm_pool->buf_size, DMA_FROM_DEVICE,
DMA_ATTR_SKIP_CPU_SYNC);
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 66f7ddd9b1f9..2fb5a4545b8b 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -430,8 +430,6 @@ static void cpsw_rx_handler(void *token, int len, int status)
cpts_rx_timestamp(cpsw->cpts, skb);
skb->protocol = eth_type_trans(skb, ndev);

- /* mark skb for recycling */
- skb_mark_for_recycle(skb);
netif_receive_skb(skb);

ndev->stats.rx_bytes += len;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 7968f24d99c8..1e74d484852d 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -374,8 +374,6 @@ static void cpsw_rx_handler(void *token, int len, int status)
cpts_rx_timestamp(cpsw->cpts, skb);
skb->protocol = eth_type_trans(skb, ndev);

- /* mark skb for recycling */
- skb_mark_for_recycle(skb);
netif_receive_skb(skb);

ndev->stats.rx_bytes += len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b77ee060b64d..d4bb0e160fef 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -668,8 +668,6 @@ typedef unsigned char *sk_buff_data_t;
* @head_frag: skb was allocated from page fragments,
* not allocated by kmalloc() or vmalloc().
* @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
- * @pp_recycle: mark the packet for recycling instead of freeing (implies
- * page_pool support on driver)
* @active_extensions: active extensions (skb_ext_id types)
* @ndisc_nodetype: router type (from link layer)
* @ooo_okay: allow the mapping of a socket to a queue to be changed
@@ -795,8 +793,7 @@ struct sk_buff {
fclone:2,
peeked:1,
head_frag:1,
- pfmemalloc:1,
- pp_recycle:1; /* page_pool recycle indicator */
+ pfmemalloc:1;
#ifdef CONFIG_SKB_EXTENSIONS
__u8 active_extensions;
#endif
@@ -4721,12 +4718,5 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
#endif
}

-#ifdef CONFIG_PAGE_POOL
-static inline void skb_mark_for_recycle(struct sk_buff *skb)
-{
- skb->pp_recycle = 1;
-}
-#endif
-
#endif /* __KERNEL__ */
#endif /* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3718898da499..85ae59f4349a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -670,7 +670,7 @@ static void skb_release_data(struct sk_buff *skb)
if (skb->cloned &&
atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&shinfo->dataref))
- goto exit;
+ return;

skb_zcopy_clear(skb, true);

@@ -681,17 +681,6 @@ static void skb_release_data(struct sk_buff *skb)
kfree_skb_list(shinfo->frag_list);

skb_free_head(skb);
-exit:
- /* When we clone an SKB we copy the reycling bit. The pp_recycle
- * bit is only set on the head though, so in order to avoid races
- * while trying to recycle fragments on __skb_frag_unref() we need
- * to make one SKB responsible for triggering the recycle path.
- * So disable the recycling bit if an SKB is cloned and we have
- * additional references to to the fragmented part of the SKB.
- * Eventually the last SKB will have the recycling bit set and it's
- * dataref set to 0, which will trigger the recycling
- */
- skb->pp_recycle = 0;
}

/*
@@ -1073,7 +1062,6 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->nohdr = 0;
n->peeked = 0;
C(pfmemalloc);
- C(pp_recycle);
n->destructor = NULL;
C(tail);
C(end);
@@ -5368,13 +5356,6 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_cloned(to))
return false;

- /* The page pool signature of struct page will eventually figure out
- * which pages can be recycled or not but for now let's prohibit slab
- * allocated and page_pool allocated SKBs from being coalesced.
- */
- if (to->pp_recycle != from->pp_recycle)
- return false;
-
if (len <= skb_tailroom(to)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
--
2.33.0

2021-09-22 09:49:14

by Yunsheng Lin

Subject: [PATCH net-next 2/7] page_pool: support non-split page with PP_FLAG_PAGE_FRAG

Currently when PP_FLAG_PAGE_FRAG is set, the caller is not
expected to call page_pool_alloc_pages() directly because of
the PP_FLAG_PAGE_FRAG check in __page_pool_put_page().

This patch removes that check to enable non-split page support
when PP_FLAG_PAGE_FRAG is set.
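
As a hypothetical usage sketch (driver-side code, not part of this
patch), a PP_FLAG_PAGE_FRAG pool could then serve both frag and
whole-page allocations, with both kinds of pages going back through
the same put path:

/* Hypothetical sketch: small buffers share a page via the frag API,
 * large buffers take a whole, non-split page from the same pool.
 */
static struct page *rx_alloc_buffer(struct page_pool *pool,
				    unsigned int size, unsigned int *offset)
{
	if (size <= PAGE_SIZE / 2)
		return page_pool_dev_alloc_frag(pool, offset, size);

	*offset = 0;
	return page_pool_dev_alloc_pages(pool);
}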

Reviewed-by: Alexander Duyck <[email protected]>
Signed-off-by: Yunsheng Lin <[email protected]>
---
net/core/page_pool.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index a65bd7972e37..f7e71dcb6a2e 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -315,11 +315,14 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

/* Fast-path: Get a page from cache */
page = __page_pool_get_cached(pool);
- if (page)
- return page;

/* Slow-path: cache empty, do real allocation */
- page = __page_pool_alloc_pages_slow(pool, gfp);
+ if (!page)
+ page = __page_pool_alloc_pages_slow(pool, gfp);
+
+ if (likely(page))
+ page_pool_set_frag_count(page, 1);
+
return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
@@ -428,8 +431,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
unsigned int dma_sync_size, bool allow_direct)
{
/* It is not the last user for the page frag case */
- if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
- page_pool_atomic_sub_frag_count_return(page, 1))
+ if (page_pool_atomic_sub_frag_count_return(page, 1))
return NULL;

/* This allocator is optimized for the XDP mode that uses
--
2.33.0

2021-09-23 07:11:27

by Ilias Apalodimas

Subject: Re: [PATCH net-next 0/7] some optimization for page pool

Hi Yunsheng,

On Wed, Sep 22, 2021 at 05:41:24PM +0800, Yunsheng Lin wrote:
> Patch 1: disable dma mapping support for 32-bit arch with 64-bit
> DMA.
> Patch 2: support non-split page when PP_FLAG_PAGE_FRAG is set.
> patch 3: avoid calling compound_head() for skb frag page
> Patch 4-7: use pp_magic to identify pp page uniquely.

There are some subtle changes in this patchset that might affect XDP.

What I forgot when I proposed removing the recycling bit is that it also
serves as an 'opt-in' mechanism for drivers that want to use page_pool but
do the recycling internally. With that removed we need to make sure
nothing bad happens to them. In theory the page refcnt for mlx5
specifically will be elevated, so we'll just end up unmapping the buffer.
Arguably we could add a similar mechanism internally into page pool,
which would allow us to enable and disable recycling, but that's
an extra if per packet allocation and I don't know if we want that on the XDP
case.
A few numbers pre/post patch for XDP would help, but iirc hns3 doesn't have
XDP support yet?
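
(Purely a hypothetical illustration of the internal opt-in/opt-out
mentioned above, to make the extra per-packet cost concrete;
'recycle_enabled' is a made-up field, not an existing page_pool member:)

/* Hypothetical sketch: a per-pool flag checked on every returned
 * buffer, i.e. one extra branch per packet, letting drivers that
 * recycle internally opt out of page pool recycling.
 */
static void pp_return_skb_page_sketch(struct page *page)
{
	struct page_pool *pp = page->pp;

	if (!READ_ONCE(pp->recycle_enabled)) {	/* made-up field */
		put_page(page);
		return;
	}

	page_pool_put_full_page(pp, page, false);
}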

It's plumbers week so I'll do some testing starting Monday.

Thanks
/Ilias

>
> V3:
> 1. add patch 1/4/6/7.
> 2. use pp_magic to identify pp page uniquely too.
> 3. avoid unnecessary compound_head() calling.
>
> V2: add patch 2, adjust the commit log according to the discussion
> in V1, and fix a compiler error reported by the kernel test robot.
>
> Yunsheng Lin (7):
> page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA
> page_pool: support non-split page with PP_FLAG_PAGE_FRAG
> page_pool: avoid calling compound_head() for skb frag page
> page_pool: change BIAS_MAX to support incrementing
> skbuff: keep track of pp page when __skb_frag_ref() is called
> skbuff: only use pp_magic identifier for a skb's head page
> skbuff: remove unused skb->pp_recycle
>
> .../net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ---
> drivers/net/ethernet/marvell/mvneta.c | 2 -
> .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 +-
> drivers/net/ethernet/marvell/sky2.c | 2 +-
> drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
> drivers/net/ethernet/ti/cpsw.c | 2 -
> drivers/net/ethernet/ti/cpsw_new.c | 2 -
> include/linux/mm_types.h | 13 +-----
> include/linux/skbuff.h | 39 ++++++++----------
> include/net/page_pool.h | 31 ++++++++------
> net/core/page_pool.c | 40 +++++++------------
> net/core/skbuff.c | 36 ++++++-----------
> net/tls/tls_device.c | 2 +-
> 13 files changed, 67 insertions(+), 114 deletions(-)
>
> --
> 2.33.0
>

2021-09-23 11:14:47

by Yunsheng Lin

Subject: Re: [PATCH net-next 0/7] some optimization for page pool

On 2021/9/23 15:07, Ilias Apalodimas wrote:
> Hi Yunsheng,
>
> On Wed, Sep 22, 2021 at 05:41:24PM +0800, Yunsheng Lin wrote:
>> Patch 1: disable dma mapping support for 32-bit arch with 64-bit
>> DMA.
>> Patch 2: support non-split page when PP_FLAG_PAGE_FRAG is set.
>> patch 3: avoid calling compound_head() for skb frag page
>> Patch 4-7: use pp_magic to identify pp page uniquely.
>
> There are some subtle changes in this patchset that might affect XDP.
>
> What I forgot when I proposed removing the recycling bit, is that it also
> serves as an 'opt-in' mechanism for drivers that want to use page_pool but
> do the recycling internally. With that removed we need to make sure
> nothing bad happens to them. In theory the page refcnt for mlx5

It seems odd that mlx5 is adding its own page cache on top of page pool;
is it about supporting both "struct sk_buff" and "struct xdp_buff" for
the same queue?

> specifically will be elevated, so we'll just end up unmapping the buffer.
> Arguably we could add a similar mechanism internally into page pool,
> which would allow us to enable and disable recycling, but that's
> an extra if per packet allocation and I don't know if we want that on the XDP
> case.

Or we could change mlx5e_rx_cache_get() to check for "page->pp_frag_count
== 1" too, and adjust mlx5e_page_release() accordingly?

> A few numbers pre/post patch for XDP would help, but iirc hns3 doesn't have
> XDP support yet?

You are right, hns3 doesn't have XDP support yet.

>
> It's plumbers week so I'll do some testing starting Monday.
>
> Thanks
> /Ilias
>
>>
>> V3:
>> 1. add patch 1/4/6/7.
>> 2. use pp_magic to identify pp page uniquely too.
>> 3. avoid unnecessary compound_head() calling.
>>
>> V2: add patch 2, adjust the commit log according to the discussion
>> in V1, and fix a compiler error reported by the kernel test robot.
>>
>> Yunsheng Lin (7):
>> page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA
>> page_pool: support non-split page with PP_FLAG_PAGE_FRAG
>> page_pool: avoid calling compound_head() for skb frag page
>> page_pool: change BIAS_MAX to support incrementing
>> skbuff: keep track of pp page when __skb_frag_ref() is called
>> skbuff: only use pp_magic identifier for a skb's head page
>> skbuff: remove unused skb->pp_recycle
>>
>> .../net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ---
>> drivers/net/ethernet/marvell/mvneta.c | 2 -
>> .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 +-
>> drivers/net/ethernet/marvell/sky2.c | 2 +-
>> drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
>> drivers/net/ethernet/ti/cpsw.c | 2 -
>> drivers/net/ethernet/ti/cpsw_new.c | 2 -
>> include/linux/mm_types.h | 13 +-----
>> include/linux/skbuff.h | 39 ++++++++----------
>> include/net/page_pool.h | 31 ++++++++------
>> net/core/page_pool.c | 40 +++++++------------
>> net/core/skbuff.c | 36 ++++++-----------
>> net/tls/tls_device.c | 2 +-
>> 13 files changed, 67 insertions(+), 114 deletions(-)
>>
>> --
>> 2.33.0
>>
> .
>

2021-09-23 12:10:07

by Jesper Dangaard Brouer

Subject: Re: [PATCH net-next 2/7] page_pool: support non-split page with PP_FLAG_PAGE_FRAG



On 22/09/2021 11.41, Yunsheng Lin wrote:
> Currently when PP_FLAG_PAGE_FRAG is set, the caller is not
> expected to call page_pool_alloc_pages() directly because of
> the PP_FLAG_PAGE_FRAG checking in __page_pool_put_page().
>
> The patch removes the above checking to enable non-split page
> support when PP_FLAG_PAGE_FRAG is set.
>
> Reviewed-by: Alexander Duyck <[email protected]>
> Signed-off-by: Yunsheng Lin <[email protected]>
> ---
> net/core/page_pool.c | 12 +++++++-----
> 1 file changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index a65bd7972e37..f7e71dcb6a2e 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -315,11 +315,14 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
>
> /* Fast-path: Get a page from cache */
> page = __page_pool_get_cached(pool);
> - if (page)
> - return page;
>
> /* Slow-path: cache empty, do real allocation */
> - page = __page_pool_alloc_pages_slow(pool, gfp);
> + if (!page)
> + page = __page_pool_alloc_pages_slow(pool, gfp);
> +
> + if (likely(page))
> + page_pool_set_frag_count(page, 1);
> +

I really don't like that you add one atomic_long_set operation per page
alloc call.
This is a fast-path for XDP use-cases, which you are ignoring as your
drivers don't implement XDP.

As I cannot ask you to run XDP benchmarks, I fortunately have some
page_pool specific microbenchmarks you can run instead.

I will ask you to provide before and after results from running these
benchmarks [1] and [2].

[1]
https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c

[2]
https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_cross_cpu.c

How to use these modules is documented here[3]:
[3]
https://prototype-kernel.readthedocs.io/en/latest/prototype-kernel/build-process.html

> return page;
> }
> EXPORT_SYMBOL(page_pool_alloc_pages);
> @@ -428,8 +431,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
> unsigned int dma_sync_size, bool allow_direct)
> {
> /* It is not the last user for the page frag case */
> - if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
> - page_pool_atomic_sub_frag_count_return(page, 1))
> + if (page_pool_atomic_sub_frag_count_return(page, 1))
> return NULL;

This adds an atomic_long_read, even when PP_FLAG_PAGE_FRAG is not set.

>
> /* This allocator is optimized for the XDP mode that uses
>

2021-09-24 07:24:38

by Yunsheng Lin

Subject: Re: [PATCH net-next 2/7] page_pool: support non-split page with PP_FLAG_PAGE_FRAG

On 2021/9/23 20:08, Jesper Dangaard Brouer wrote:
>
>
> On 22/09/2021 11.41, Yunsheng Lin wrote:
>> Currently when PP_FLAG_PAGE_FRAG is set, the caller is not
>> expected to call page_pool_alloc_pages() directly because of
>> the PP_FLAG_PAGE_FRAG checking in __page_pool_put_page().
>>
>> The patch removes the above checking to enable non-split page
>> support when PP_FLAG_PAGE_FRAG is set.
>>
>> Reviewed-by: Alexander Duyck <[email protected]>
>> Signed-off-by: Yunsheng Lin <[email protected]>
>> ---
>> net/core/page_pool.c | 12 +++++++-----
>> 1 file changed, 7 insertions(+), 5 deletions(-)
>>
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index a65bd7972e37..f7e71dcb6a2e 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
>> @@ -315,11 +315,14 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
>> /* Fast-path: Get a page from cache */
>> page = __page_pool_get_cached(pool);
>> - if (page)
>> - return page;
>> /* Slow-path: cache empty, do real allocation */
>> - page = __page_pool_alloc_pages_slow(pool, gfp);
>> + if (!page)
>> + page = __page_pool_alloc_pages_slow(pool, gfp);
>> +
>> + if (likely(page))
>> + page_pool_set_frag_count(page, 1);
>> +
>
> I really don't like that you add one atomic_long_set operation per page alloc call.
> This is a fast-path for XDP use-cases, which you are ignoring as your drivers don't implement XDP.
>
> As I cannot ask you to run XDP benchmarks, I fortunately have some page_pool specific microbenchmarks you can run instead.
>
> I will ask you to provide before and after results from running these benchmarks [1] and [2].
>
> [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c
>
> [2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_cross_cpu.c
>
> How to use these modules is documented here[3]:
> [3] https://prototype-kernel.readthedocs.io/en/latest/prototype-kernel/build-process.html

Will run these benchmarks to see if there is any noticeable performance
overhead here, thanks for the benchmarks.

>
>> return page;
>> }
>> EXPORT_SYMBOL(page_pool_alloc_pages);
>> @@ -428,8 +431,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>> unsigned int dma_sync_size, bool allow_direct)
>> {
>> /* It is not the last user for the page frag case */
>> - if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
>> - page_pool_atomic_sub_frag_count_return(page, 1))
>> + if (page_pool_atomic_sub_frag_count_return(page, 1))
>> return NULL;
>
> This adds an atomic_long_read, even when PP_FLAG_PAGE_FRAG is not set.

The point here is to have consistent handling for both the PP_FLAG_PAGE_FRAG
and non-PP_FLAG_PAGE_FRAG cases in the following patch.

page->_refcount is accessed in the "page_ref_count(page) == 1" check
in __page_pool_put_page(), and page->pp_frag_count is most likely in the
same cache line as page->_refcount, so I am not expecting a noticeable
overhead here.

Anyway, I will use the above benchmarks to verify it.


>
>> /* This allocator is optimized for the XDP mode that uses
>>
>
> .
>

2021-09-30 07:33:20

by Yunsheng Lin

Subject: Re: [Linuxarm] Re: [PATCH net-next 2/7] page_pool: support non-split page with PP_FLAG_PAGE_FRAG

On 2021/9/24 15:23, Yunsheng Lin wrote:
> On 2021/9/23 20:08, Jesper Dangaard Brouer wrote:
>>
>>
>> On 22/09/2021 11.41, Yunsheng Lin wrote:
>>> Currently when PP_FLAG_PAGE_FRAG is set, the caller is not
>>> expected to call page_pool_alloc_pages() directly because of
>>> the PP_FLAG_PAGE_FRAG checking in __page_pool_put_page().
>>>
>>> The patch removes the above checking to enable non-split page
>>> support when PP_FLAG_PAGE_FRAG is set.
>>>
>>> Reviewed-by: Alexander Duyck <[email protected]>
>>> Signed-off-by: Yunsheng Lin <[email protected]>
>>> ---
>>> net/core/page_pool.c | 12 +++++++-----
>>> 1 file changed, 7 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>>> index a65bd7972e37..f7e71dcb6a2e 100644
>>> --- a/net/core/page_pool.c
>>> +++ b/net/core/page_pool.c
>>> @@ -315,11 +315,14 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
>>> /* Fast-path: Get a page from cache */
>>> page = __page_pool_get_cached(pool);
>>> - if (page)
>>> - return page;
>>> /* Slow-path: cache empty, do real allocation */
>>> - page = __page_pool_alloc_pages_slow(pool, gfp);
>>> + if (!page)
>>> + page = __page_pool_alloc_pages_slow(pool, gfp);
>>> +
>>> + if (likely(page))
>>> + page_pool_set_frag_count(page, 1);
>>> +
>>
>> I really don't like that you add one atomic_long_set operation per page alloc call.
>> This is a fast-path for XDP use-cases, which you are ignoring as your drivers don't implement XDP.
>>
>> As I cannot ask you to run XDP benchmarks, I fortunately have some page_pool specific microbenchmarks you can run instead.
>>
>> I will ask you to provide before and after results from running these benchmarks [1] and [2].
>>
>> [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c
>>
>> [2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_cross_cpu.c
>>
>> How to use these modules is documented here[3]:
>> [3] https://prototype-kernel.readthedocs.io/en/latest/prototype-kernel/build-process.html
>
> Will run these benchmarks to see if there is any noticeable performance overhead here,
> thanks for the benchmarks.

You are right, there is noticeable overhead for the bench_page_pool_cross_cpu
test case above, possibly due to the cache bouncing caused by
page_pool_set_frag_count().

As mentioned by Ilias, mlx5 uses page pool and also does the recycling
internally, so handling page frag tracking consistently for both the
PP_FLAG_PAGE_FRAG and non-PP_FLAG_PAGE_FRAG cases will break the mlx5 driver.

So I will drop this patch for now.

>
>>
>>> return page;
>>> }
>>> EXPORT_SYMBOL(page_pool_alloc_pages);
>>> @@ -428,8 +431,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>>> unsigned int dma_sync_size, bool allow_direct)
>>> {
>>> /* It is not the last user for the page frag case */
>>> - if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
>>> - page_pool_atomic_sub_frag_count_return(page, 1))
>>> + if (page_pool_atomic_sub_frag_count_return(page, 1))
>>> return NULL;
>>
>> This adds an atomic_long_read, even when PP_FLAG_PAGE_FRAG is not set.
>
> The point here is to have consistent handling for both PP_FLAG_PAGE_FRAG
> and non-PP_FLAG_PAGE_FRAG case in the following patch.
>
> As the page->_refcount is accessed in "page_ref_count(page) == 1" checking
> in __page_pool_put_page(), and page->pp_frag_count is most likely in the
> same cache line as the page->_refcount, so I am not expecting a noticeable
> overhead here.
>
> Anyway, will use the above benchmarks as an example to verify it.
>
>
>>
>>> /* This allocator is optimized for the XDP mode that uses
>>>
>>
>> .
>>