This patchset adds frag page support in page pool and
enables skb's page frag recycling based on page pool in
the hns3 driver.
RFC v4:
1. Use the dma_addr[1] to store bias.
2. Default to a pagecnt_bias of PAGE_SIZE - 1.
3. Other minor comments suggested by Alexander.
RFC v3:
1. Implement the semantics of "page recycling only waits for the
page pool user instead of all users of a page"
2. Support the frag allocation of different sizes
3. Merge patch 4 & 5 to one patch as it does not make sense to
use page_pool_dev_alloc_pages() API directly with elevated
refcnt.
4. Other minor comments suggested by Alexander.
RFC v2:
1. Split patch 1 to more reviewable one.
2. Repurpose the lower 12 bits of the dma address to store the
pagecnt_bias as suggested by Alexander.
3. support recycling to pool->alloc for elevated refcnt case
too.
Yunsheng Lin (4):
page_pool: keep pp info as long as page pool owns the page
page_pool: add interface to manipulate bias in page pool
page_pool: add frag page recycling support in page pool
net: hns3: support skb's frag page recycling based on page pool
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 79 +++++++++++++-
drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 3 +
drivers/net/ethernet/marvell/mvneta.c | 6 +-
drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +-
drivers/net/ethernet/ti/cpsw.c | 2 +-
drivers/net/ethernet/ti/cpsw_new.c | 2 +-
include/linux/skbuff.h | 4 +-
include/net/page_pool.h | 99 +++++++++++++++--
net/core/page_pool.c | 135 +++++++++++++++++++++---
9 files changed, 288 insertions(+), 44 deletions(-)
--
2.7.4
As suggested by Alexander, "A DMA mapping should be page
aligned anyway so the lower 12 bits would be reserved 0",
so it might make more sense to repurpose the lower 12 bits
of the dma address to store the bias for frag page support
in page pool for 32 bit systems with 64 bit dma, which
should be rare these days.
For normal systems, the dma_addr[1] in 'struct page' is not
used, so we can reuse the dma_addr[1] for storing bias.
The PAGE_POOP_USE_DMA_ADDR_1 macro is used to decide where
to store the bias, as the "sizeof(dma_addr_t) > sizeof(
unsigned long)" is false for normal systems, so hopefully the
compiler will optimize out the unused code for those systems.
The newly added page_pool_set_bias() should be called before
the page is passed to any user. Otherwise, call the newly
added page_pool_atomic_sub_bias_return().
Signed-off-by: Yunsheng Lin <[email protected]>
---
include/net/page_pool.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++---
net/core/page_pool.c | 10 +++++++
2 files changed, 77 insertions(+), 3 deletions(-)
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 8d7744d..315b9f2 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -198,21 +198,85 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
page_pool_put_full_page(pool, page, true);
}
+#define PAGE_POOP_USE_DMA_ADDR_1 (sizeof(dma_addr_t) > sizeof(unsigned long))
+
static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
{
- dma_addr_t ret = page->dma_addr[0];
- if (sizeof(dma_addr_t) > sizeof(unsigned long))
+ dma_addr_t ret;
+
+ if (PAGE_POOP_USE_DMA_ADDR_1) {
+ ret = READ_ONCE(page->dma_addr[0]) & PAGE_MASK;
ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
+ } else {
+ ret = page->dma_addr[0];
+ }
+
return ret;
}
static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
{
page->dma_addr[0] = addr;
- if (sizeof(dma_addr_t) > sizeof(unsigned long))
+ if (PAGE_POOP_USE_DMA_ADDR_1)
page->dma_addr[1] = upper_32_bits(addr);
}
+static inline int page_pool_atomic_sub_bias_return(struct page *page, int nr)
+{
+ int bias;
+
+ if (PAGE_POOP_USE_DMA_ADDR_1) {
+ unsigned long *bias_ptr = &page->dma_addr[0];
+ unsigned long old_bias = READ_ONCE(*bias_ptr);
+ unsigned long new_bias;
+
+ do {
+ bias = (int)(old_bias & ~PAGE_MASK);
+
+ /* Warn when page_pool_dev_alloc_pages() is called
+ * with PP_FLAG_PAGE_FRAG flag in driver.
+ */
+ WARN_ON(!bias);
+
+ /* already the last user */
+ if (!(bias - nr))
+ return 0;
+
+ new_bias = old_bias - nr;
+ } while (!try_cmpxchg(bias_ptr, &old_bias, new_bias));
+
+ WARN_ON((new_bias & PAGE_MASK) != (old_bias & PAGE_MASK));
+
+ bias = new_bias & ~PAGE_MASK;
+ } else {
+ atomic_t *v = (atomic_t *)&page->dma_addr[1];
+
+ if (atomic_read(v) == nr)
+ return 0;
+
+ bias = atomic_sub_return(nr, v);
+ WARN_ON(bias < 0);
+ }
+
+ return bias;
+}
+
+static inline void page_pool_set_bias(struct page *page, int bias)
+{
+ if (PAGE_POOP_USE_DMA_ADDR_1) {
+ unsigned long dma_addr_0 = READ_ONCE(page->dma_addr[0]);
+
+ dma_addr_0 &= PAGE_MASK;
+ dma_addr_0 |= bias;
+
+ WRITE_ONCE(page->dma_addr[0], dma_addr_0);
+ } else {
+ atomic_t *v = (atomic_t *)&page->dma_addr[1];
+
+ atomic_set(v, bias);
+ }
+}
+
static inline bool is_page_pool_compiled_in(void)
{
#ifdef CONFIG_PAGE_POOL
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 78838c6..6ac5b00 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -198,6 +198,16 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
if (dma_mapping_error(pool->p.dev, dma))
return false;
+ if (PAGE_POOP_USE_DMA_ADDR_1 &&
+ WARN_ON(pool->p.flags & PP_FLAG_PAGE_FRAG &&
+ dma & ~PAGE_MASK)) {
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order,
+ pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ return false;
+ }
+
page_pool_set_dma_addr(page, dma);
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
--
2.7.4
Currently page pool only supports page recycling when
there is only one user of the page, and the split page
reusing implemented in most drivers can not use the
page pool, as the ping-pong way of reusing requires
multi-user support in page pool.
This reusing and recycling have the limitations below:
1. A page from page pool can only be used by one user in order
for the page recycling to happen.
2. The ping-pong way of reusing in most drivers does not support
multiple descs using different parts of the same page in order
to save memory.
So add multi-user support and frag page recycling in page pool
to overcome the above limitations.
Signed-off-by: Yunsheng Lin <[email protected]>
---
include/net/page_pool.h | 22 +++++++++-
net/core/page_pool.c | 104 ++++++++++++++++++++++++++++++++++++++++++------
2 files changed, 112 insertions(+), 14 deletions(-)
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 315b9f2..dd4bb90 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -45,7 +45,10 @@
* Please note DMA-sync-for-CPU is still
* device driver responsibility
*/
-#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
+#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */
+#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\
+ PP_FLAG_DMA_SYNC_DEV |\
+ PP_FLAG_PAGE_FRAG)
/*
* Fast allocation side cache array/stack
@@ -88,6 +91,9 @@ struct page_pool {
unsigned long defer_warn;
u32 pages_state_hold_cnt;
+ unsigned int frag_offset;
+ int frag_bias;
+ struct page *frag_page;
/*
* Data structure for allocation side
@@ -137,6 +143,20 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
return page_pool_alloc_pages(pool, gfp);
}
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+ unsigned int *offset,
+ unsigned int size,
+ gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
+ unsigned int *offset,
+ unsigned int size)
+{
+ gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+ return page_pool_alloc_frag(pool, offset, size, gfp);
+}
+
/* get the stored dma direction. A driver might decide to treat this locally and
* avoid the extra cache line from page_pool to determine the direction
*/
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 6ac5b00..d172777 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -24,6 +24,8 @@
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
+#define BIAS_MAX (PAGE_SIZE - 1)
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -67,6 +69,14 @@ static int page_pool_init(struct page_pool *pool,
*/
}
+ /* Make sure there is at least one bias left as we depend on that
+ * to ensure the frag page is reserved to serve more users.
+ */
+ if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
+ (PAGE_SIZE << pool->p.order >
+ dma_get_cache_alignment() * (BIAS_MAX - 1)))
+ return -EINVAL;
+
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
@@ -429,6 +439,11 @@ static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
unsigned int dma_sync_size, bool allow_direct)
{
+ /* It is not the last user for the page frag case */
+ if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
+ page_pool_atomic_sub_bias_return(page, 1))
+ return NULL;
+
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
* regular page allocator APIs.
@@ -452,19 +467,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
/* Page found as candidate for recycling */
return page;
}
- /* Fallback/non-XDP mode: API user have elevated refcnt.
- *
- * Many drivers split up the page into fragments, and some
- * want to keep doing this to save memory and do refcnt based
- * recycling. Support this use case too, to ease drivers
- * switching between XDP/non-XDP.
- *
- * In-case page_pool maintains the DMA mapping, API user must
- * call page_pool_put_page once. In this elevated refcnt
- * case, the DMA is unmapped/released, as driver is likely
- * doing refcnt based recycle tricks, meaning another process
- * will be invoking put_page.
- */
+
/* Do not replace this with page_pool_return_page() */
page_pool_release_page(pool, page);
put_page(page);
@@ -521,6 +524,79 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
+static struct page *page_pool_drain_frag(struct page_pool *pool,
+ struct page *page)
+{
+ /* page pool is not the last user */
+ if (page_pool_atomic_sub_bias_return(page,
+ BIAS_MAX - pool->frag_bias))
+ return NULL;
+
+ if (likely(page_ref_count(page) == 1 &&
+ !page_is_pfmemalloc(page)))
+ return page;
+
+ page_pool_return_page(pool, page);
+ return NULL;
+}
+
+static void page_pool_free_frag(struct page_pool *pool)
+{
+ struct page *page = pool->frag_page;
+
+ if (!page ||
+ page_pool_atomic_sub_bias_return(page,
+ BIAS_MAX - pool->frag_bias))
+ return;
+
+ page_pool_return_page(pool, page);
+ pool->frag_page = NULL;
+}
+
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+ unsigned int *offset,
+ unsigned int size,
+ gfp_t gfp)
+{
+ unsigned int max_size = PAGE_SIZE << pool->p.order;
+ unsigned int frag_offset = pool->frag_offset;
+ struct page *frag_page = pool->frag_page;
+
+ if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+ size > max_size))
+ return NULL;
+
+ size = ALIGN(size, dma_get_cache_alignment());
+
+ if (frag_page && frag_offset + size > max_size) {
+ frag_page = page_pool_drain_frag(pool, frag_page);
+ if (frag_page)
+ goto frag_reset;
+ }
+
+ if (!frag_page) {
+ frag_page = page_pool_alloc_pages(pool, gfp);
+ if (unlikely(!frag_page)) {
+ pool->frag_page = NULL;
+ return NULL;
+ }
+
+ pool->frag_page = frag_page;
+
+frag_reset:
+ pool->frag_bias = 0;
+ frag_offset = 0;
+ page_pool_set_bias(frag_page, BIAS_MAX);
+ }
+
+ pool->frag_bias++;
+ *offset = frag_offset;
+ pool->frag_offset = frag_offset + size;
+
+ return frag_page;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
static void page_pool_empty_ring(struct page_pool *pool)
{
struct page *page;
@@ -626,6 +702,8 @@ void page_pool_destroy(struct page_pool *pool)
if (!page_pool_put(pool))
return;
+ page_pool_free_frag(pool);
+
if (!page_pool_release(pool))
return;
--
2.7.4
Currently, page->pp is cleared and set every time the page
is recycled, which is unnecessary.
So only set the page->pp when the page is added to the page
pool and only clear it when the page is released from the
page pool.
This is also a preparation to support allocating frag page
in page pool.
Signed-off-by: Yunsheng Lin <[email protected]>
---
drivers/net/ethernet/marvell/mvneta.c | 6 +-----
drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +-
drivers/net/ethernet/ti/cpsw.c | 2 +-
drivers/net/ethernet/ti/cpsw_new.c | 2 +-
include/linux/skbuff.h | 4 +---
include/net/page_pool.h | 7 -------
net/core/page_pool.c | 21 +++++++++++++++++----
7 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 361bc4f..89bf31fd 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
if (!skb)
return ERR_PTR(-ENOMEM);
- skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);
+ skb_mark_for_recycle(skb);
skb_reserve(skb, xdp->data - xdp->data_hard_start);
skb_put(skb, xdp->data_end - xdp->data);
@@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
skb_frag_page(frag), skb_frag_off(frag),
skb_frag_size(frag), PAGE_SIZE);
- /* We don't need to reset pp_recycle here. It's already set, so
- * just mark fragments for recycling.
- */
- page_pool_store_mem_info(skb_frag_page(frag), pool);
}
return skb;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 3229baf..320eddb 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3995,7 +3995,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
}
if (pp)
- skb_mark_for_recycle(skb, page, pp);
+ skb_mark_for_recycle(skb);
else
dma_unmap_single_attrs(dev->dev.parent, dma_addr,
bm_pool->buf_size, DMA_FROM_DEVICE,
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index cbbd0f6..9d59143 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -431,7 +431,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
skb->protocol = eth_type_trans(skb, ndev);
/* mark skb for recycling */
- skb_mark_for_recycle(skb, page, pool);
+ skb_mark_for_recycle(skb);
netif_receive_skb(skb);
ndev->stats.rx_bytes += len;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 57d279f..a4234a3 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -374,7 +374,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
skb->protocol = eth_type_trans(skb, ndev);
/* mark skb for recycling */
- skb_mark_for_recycle(skb, page, pool);
+ skb_mark_for_recycle(skb);
netif_receive_skb(skb);
ndev->stats.rx_bytes += len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b2db9cd..7795979 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
}
#ifdef CONFIG_PAGE_POOL
-static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,
- struct page_pool *pp)
+static inline void skb_mark_for_recycle(struct sk_buff *skb)
{
skb->pp_recycle = 1;
- page_pool_store_mem_info(page, pp);
}
#endif
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 3dd62dd..8d7744d 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -253,11 +253,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)
spin_unlock_bh(&pool->ring.producer_lock);
}
-/* Store mem_info on struct page and use it while recycling skb frags */
-static inline
-void page_pool_store_mem_info(struct page *page, struct page_pool *pp)
-{
- page->pp = pp;
-}
-
#endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5e4eb45..78838c6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -206,6 +206,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
return true;
}
+static void page_pool_set_pp_info(struct page_pool *pool,
+ struct page *page)
+{
+ page->pp = pool;
+ page->pp_magic |= PP_SIGNATURE;
+}
+
+static void page_pool_clear_pp_info(struct page *page)
+{
+ page->pp_magic = 0;
+ page->pp = NULL;
+}
+
static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
gfp_t gfp)
{
@@ -222,7 +235,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
return NULL;
}
- page->pp_magic |= PP_SIGNATURE;
+ page_pool_set_pp_info(pool, page);
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -266,7 +279,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
put_page(page);
continue;
}
- page->pp_magic |= PP_SIGNATURE;
+
+ page_pool_set_pp_info(pool, page);
pool->alloc.cache[pool->alloc.count++] = page;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -345,7 +359,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
DMA_ATTR_SKIP_CPU_SYNC);
page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
- page->pp_magic = 0;
+ page_pool_clear_pp_info(page);
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
@@ -644,7 +658,6 @@ bool page_pool_return_skb_page(struct page *page)
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
- page->pp = NULL;
page_pool_put_full_page(pp, page, false);
return true;
--
2.7.4
This patch adds skb's frag page recycling support based on
the elevated refcnt support in page pool.
The performance improves by about 10~20% with IOMMU disabled.
The performance improves by about 200% when IOMMU is enabled
and iperf server shares the same cpu with irq/NAPI.
Signed-off-by: Yunsheng Lin <[email protected]>
---
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 79 +++++++++++++++++++++++--
drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 3 +
2 files changed, 77 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cdb5f14..c799129 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3205,6 +3205,21 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
unsigned int order = hns3_page_order(ring);
struct page *p;
+ if (ring->page_pool) {
+ p = page_pool_dev_alloc_frag(ring->page_pool,
+ &cb->page_offset,
+ hns3_buf_size(ring));
+ if (unlikely(!p))
+ return -ENOMEM;
+
+ cb->priv = p;
+ cb->buf = page_address(p);
+ cb->dma = page_pool_get_dma_addr(p);
+ cb->type = DESC_TYPE_FRAG;
+ cb->reuse_flag = 0;
+ return 0;
+ }
+
p = dev_alloc_pages(order);
if (!p)
return -ENOMEM;
@@ -3227,8 +3242,12 @@ static void hns3_free_buffer(struct hns3_enet_ring *ring,
if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD |
DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB))
napi_consume_skb(cb->priv, budget);
- else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias)
- __page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
+ else if (!HNAE3_IS_TX_RING(ring)) {
+ if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias)
+ __page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
+ else if (cb->type & DESC_TYPE_FRAG)
+ page_pool_put_full_page(ring->page_pool, cb->priv, false);
+ }
memset(cb, 0, sizeof(*cb));
}
@@ -3315,7 +3334,7 @@ static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring,
int ret;
ret = hns3_alloc_buffer(ring, cb);
- if (ret)
+ if (ret || ring->page_pool)
goto out;
ret = hns3_map_buffer(ring, cb);
@@ -3337,7 +3356,8 @@ static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i)
if (ret)
return ret;
- ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma);
+ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma +
+ ring->desc_cb[i].page_offset);
return 0;
}
@@ -3367,7 +3387,8 @@ static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i,
{
hns3_unmap_buffer(ring, &ring->desc_cb[i]);
ring->desc_cb[i] = *res_cb;
- ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma);
+ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma +
+ ring->desc_cb[i].page_offset);
ring->desc[i].rx.bd_base_info = 0;
}
@@ -3539,6 +3560,12 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
u32 frag_size = size - pull_len;
bool reused;
+ if (ring->page_pool) {
+ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset,
+ frag_size, truesize);
+ return;
+ }
+
/* Avoid re-using remote or pfmem page */
if (unlikely(!dev_page_is_reusable(desc_cb->priv)))
goto out;
@@ -3856,6 +3883,9 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
/* We can reuse buffer as-is, just make sure it is reusable */
if (dev_page_is_reusable(desc_cb->priv))
desc_cb->reuse_flag = 1;
+ else if (desc_cb->type & DESC_TYPE_FRAG)
+ page_pool_put_full_page(ring->page_pool, desc_cb->priv,
+ false);
else /* This page cannot be reused so discard it */
__page_frag_cache_drain(desc_cb->priv,
desc_cb->pagecnt_bias);
@@ -3863,6 +3893,10 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
hns3_rx_ring_move_fw(ring);
return 0;
}
+
+ if (ring->page_pool)
+ skb_mark_for_recycle(skb);
+
u64_stats_update_begin(&ring->syncp);
ring->stats.seg_pkt_cnt++;
u64_stats_update_end(&ring->syncp);
@@ -3901,6 +3935,10 @@ static int hns3_add_frag(struct hns3_enet_ring *ring)
"alloc rx fraglist skb fail\n");
return -ENXIO;
}
+
+ if (ring->page_pool)
+ skb_mark_for_recycle(new_skb);
+
ring->frag_num = 0;
if (ring->tail_skb) {
@@ -4705,6 +4743,29 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv)
priv->ring = NULL;
}
+static void hns3_alloc_page_pool(struct hns3_enet_ring *ring)
+{
+ struct page_pool_params pp_params = {
+ .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG,
+ .order = hns3_page_order(ring),
+ .pool_size = ring->desc_num * hns3_buf_size(ring) / PAGE_SIZE,
+ .nid = dev_to_node(ring_to_dev(ring)),
+ .dev = ring_to_dev(ring),
+ .dma_dir = DMA_FROM_DEVICE,
+ .offset = 0,
+ .max_len = 0,
+ };
+
+ ring->page_pool = page_pool_create(&pp_params);
+ if (IS_ERR(ring->page_pool)) {
+ dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n",
+ PTR_ERR(ring->page_pool));
+ ring->page_pool = NULL;
+ } else {
+ dev_info(ring_to_dev(ring), "page pool creation succeeded\n");
+ }
+}
+
static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring)
{
int ret;
@@ -4724,6 +4785,8 @@ static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring)
goto out_with_desc_cb;
if (!HNAE3_IS_TX_RING(ring)) {
+ hns3_alloc_page_pool(ring);
+
ret = hns3_alloc_ring_buffers(ring);
if (ret)
goto out_with_desc;
@@ -4764,6 +4827,12 @@ void hns3_fini_ring(struct hns3_enet_ring *ring)
devm_kfree(ring_to_dev(ring), tx_spare);
ring->tx_spare = NULL;
}
+
+ if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) {
+ page_pool_destroy(ring->page_pool);
+ ring->page_pool = NULL;
+ dev_info(ring_to_dev(ring), "page pool destroyed\n");
+ }
}
static int hns3_buf_size2type(u32 buf_size)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 15af3d9..115c0ce 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -6,6 +6,7 @@
#include <linux/dim.h>
#include <linux/if_vlan.h>
+#include <net/page_pool.h>
#include "hnae3.h"
@@ -307,6 +308,7 @@ enum hns3_desc_type {
DESC_TYPE_BOUNCE_ALL = 1 << 3,
DESC_TYPE_BOUNCE_HEAD = 1 << 4,
DESC_TYPE_SGL_SKB = 1 << 5,
+ DESC_TYPE_FRAG = 1 << 6,
};
struct hns3_desc_cb {
@@ -451,6 +453,7 @@ struct hns3_enet_ring {
struct hnae3_queue *tqp;
int queue_index;
struct device *dev; /* will be used for DMA mapping of descriptors */
+ struct page_pool *page_pool;
/* statistic */
struct ring_stats stats;
--
2.7.4
On 7/13/21, Yunsheng Lin <[email protected]> wrote:
> This patch adds skb's frag page recycling support based on
> the elevated refcnt support in page pool.
>
> The performance improves above 10~20% with IOMMU disabled.
> The performance improves about 200% when IOMMU is enabled
> and iperf server shares the same cpu with irq/NAPI.
Could you share workload details?
>
> Signed-off-by: Yunsheng Lin <[email protected]>
> ---
> drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 79
> +++++++++++++++++++++++--
> drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 3 +
> 2 files changed, 77 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> index cdb5f14..c799129 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> @@ -3205,6 +3205,21 @@ static int hns3_alloc_buffer(struct hns3_enet_ring
> *ring,
> unsigned int order = hns3_page_order(ring);
> struct page *p;
>
> + if (ring->page_pool) {
> + p = page_pool_dev_alloc_frag(ring->page_pool,
> + &cb->page_offset,
> + hns3_buf_size(ring));
> + if (unlikely(!p))
> + return -ENOMEM;
> +
> + cb->priv = p;
> + cb->buf = page_address(p);
> + cb->dma = page_pool_get_dma_addr(p);
> + cb->type = DESC_TYPE_FRAG;
> + cb->reuse_flag = 0;
> + return 0;
> + }
> +
> p = dev_alloc_pages(order);
> if (!p)
> return -ENOMEM;
> @@ -3227,8 +3242,12 @@ static void hns3_free_buffer(struct hns3_enet_ring
> *ring,
> if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD |
> DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB))
> napi_consume_skb(cb->priv, budget);
> - else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias)
> - __page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
> + else if (!HNAE3_IS_TX_RING(ring)) {
> + if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias)
> + __page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
> + else if (cb->type & DESC_TYPE_FRAG)
> + page_pool_put_full_page(ring->page_pool, cb->priv, false);
> + }
> memset(cb, 0, sizeof(*cb));
> }
>
> @@ -3315,7 +3334,7 @@ static int hns3_alloc_and_map_buffer(struct
> hns3_enet_ring *ring,
> int ret;
>
> ret = hns3_alloc_buffer(ring, cb);
> - if (ret)
> + if (ret || ring->page_pool)
> goto out;
>
> ret = hns3_map_buffer(ring, cb);
> @@ -3337,7 +3356,8 @@ static int hns3_alloc_and_attach_buffer(struct
> hns3_enet_ring *ring, int i)
> if (ret)
> return ret;
>
> - ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma);
> + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma +
> + ring->desc_cb[i].page_offset);
>
> return 0;
> }
> @@ -3367,7 +3387,8 @@ static void hns3_replace_buffer(struct hns3_enet_ring
> *ring, int i,
> {
> hns3_unmap_buffer(ring, &ring->desc_cb[i]);
> ring->desc_cb[i] = *res_cb;
> - ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma);
> + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma +
> + ring->desc_cb[i].page_offset);
> ring->desc[i].rx.bd_base_info = 0;
> }
>
> @@ -3539,6 +3560,12 @@ static void hns3_nic_reuse_page(struct sk_buff *skb,
> int i,
> u32 frag_size = size - pull_len;
> bool reused;
>
> + if (ring->page_pool) {
> + skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset,
> + frag_size, truesize);
> + return;
> + }
> +
> /* Avoid re-using remote or pfmem page */
> if (unlikely(!dev_page_is_reusable(desc_cb->priv)))
> goto out;
> @@ -3856,6 +3883,9 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring,
> unsigned int length,
> /* We can reuse buffer as-is, just make sure it is reusable */
> if (dev_page_is_reusable(desc_cb->priv))
> desc_cb->reuse_flag = 1;
> + else if (desc_cb->type & DESC_TYPE_FRAG)
> + page_pool_put_full_page(ring->page_pool, desc_cb->priv,
> + false);
> else /* This page cannot be reused so discard it */
> __page_frag_cache_drain(desc_cb->priv,
> desc_cb->pagecnt_bias);
> @@ -3863,6 +3893,10 @@ static int hns3_alloc_skb(struct hns3_enet_ring
> *ring, unsigned int length,
> hns3_rx_ring_move_fw(ring);
> return 0;
> }
> +
> + if (ring->page_pool)
> + skb_mark_for_recycle(skb);
> +
> u64_stats_update_begin(&ring->syncp);
> ring->stats.seg_pkt_cnt++;
> u64_stats_update_end(&ring->syncp);
> @@ -3901,6 +3935,10 @@ static int hns3_add_frag(struct hns3_enet_ring
> *ring)
> "alloc rx fraglist skb fail\n");
> return -ENXIO;
> }
> +
> + if (ring->page_pool)
> + skb_mark_for_recycle(new_skb);
> +
> ring->frag_num = 0;
>
> if (ring->tail_skb) {
> @@ -4705,6 +4743,29 @@ static void hns3_put_ring_config(struct hns3_nic_priv
> *priv)
> priv->ring = NULL;
> }
>
> +static void hns3_alloc_page_pool(struct hns3_enet_ring *ring)
> +{
> + struct page_pool_params pp_params = {
> + .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG,
> + .order = hns3_page_order(ring),
> + .pool_size = ring->desc_num * hns3_buf_size(ring) / PAGE_SIZE,
> + .nid = dev_to_node(ring_to_dev(ring)),
> + .dev = ring_to_dev(ring),
> + .dma_dir = DMA_FROM_DEVICE,
> + .offset = 0,
> + .max_len = 0,
> + };
> +
> + ring->page_pool = page_pool_create(&pp_params);
> + if (IS_ERR(ring->page_pool)) {
> + dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n",
> + PTR_ERR(ring->page_pool));
> + ring->page_pool = NULL;
> + } else {
> + dev_info(ring_to_dev(ring), "page pool creation succeeded\n");
> + }
> +}
> +
> static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring)
> {
> int ret;
> @@ -4724,6 +4785,8 @@ static int hns3_alloc_ring_memory(struct
> hns3_enet_ring *ring)
> goto out_with_desc_cb;
>
> if (!HNAE3_IS_TX_RING(ring)) {
> + hns3_alloc_page_pool(ring);
> +
> ret = hns3_alloc_ring_buffers(ring);
> if (ret)
> goto out_with_desc;
> @@ -4764,6 +4827,12 @@ void hns3_fini_ring(struct hns3_enet_ring *ring)
> devm_kfree(ring_to_dev(ring), tx_spare);
> ring->tx_spare = NULL;
> }
> +
> + if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) {
> + page_pool_destroy(ring->page_pool);
> + ring->page_pool = NULL;
> + dev_info(ring_to_dev(ring), "page pool destroyed\n");
> + }
> }
>
> static int hns3_buf_size2type(u32 buf_size)
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
> b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
> index 15af3d9..115c0ce 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
> @@ -6,6 +6,7 @@
>
> #include <linux/dim.h>
> #include <linux/if_vlan.h>
> +#include <net/page_pool.h>
>
> #include "hnae3.h"
>
> @@ -307,6 +308,7 @@ enum hns3_desc_type {
> DESC_TYPE_BOUNCE_ALL = 1 << 3,
> DESC_TYPE_BOUNCE_HEAD = 1 << 4,
> DESC_TYPE_SGL_SKB = 1 << 5,
> + DESC_TYPE_FRAG = 1 << 6,
> };
>
> struct hns3_desc_cb {
> @@ -451,6 +453,7 @@ struct hns3_enet_ring {
> struct hnae3_queue *tqp;
> int queue_index;
> struct device *dev; /* will be used for DMA mapping of descriptors */
> + struct page_pool *page_pool;
>
> /* statistic */
> struct ring_stats stats;
> --
> 2.7.4
>
>
On Tue, Jul 13, 2021 at 05:24:29PM +0800, Yunsheng Lin wrote:
> Currently, page->pp is cleared and set everytime the page
> is recycled, which is unnecessary.
>
> So only set the page->pp when the page is added to the page
> pool and only clear it when the page is released from the
> page pool.
>
> This is also a preparation to support allocating frag page
> in page pool.
>
> Signed-off-by: Yunsheng Lin <[email protected]>
> ---
> drivers/net/ethernet/marvell/mvneta.c | 6 +-----
> drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +-
> drivers/net/ethernet/ti/cpsw.c | 2 +-
> drivers/net/ethernet/ti/cpsw_new.c | 2 +-
> include/linux/skbuff.h | 4 +---
> include/net/page_pool.h | 7 -------
> net/core/page_pool.c | 21 +++++++++++++++++----
> 7 files changed, 22 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
> index 361bc4f..89bf31fd 100644
> --- a/drivers/net/ethernet/marvell/mvneta.c
> +++ b/drivers/net/ethernet/marvell/mvneta.c
> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
> if (!skb)
> return ERR_PTR(-ENOMEM);
>
> - skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);
> + skb_mark_for_recycle(skb);
>
> skb_reserve(skb, xdp->data - xdp->data_hard_start);
> skb_put(skb, xdp->data_end - xdp->data);
> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
> skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
> skb_frag_page(frag), skb_frag_off(frag),
> skb_frag_size(frag), PAGE_SIZE);
> - /* We don't need to reset pp_recycle here. It's already set, so
> - * just mark fragments for recycling.
> - */
> - page_pool_store_mem_info(skb_frag_page(frag), pool);
> }
>
> return skb;
> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> index 3229baf..320eddb 100644
> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> @@ -3995,7 +3995,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
> }
>
> if (pp)
> - skb_mark_for_recycle(skb, page, pp);
> + skb_mark_for_recycle(skb);
> else
> dma_unmap_single_attrs(dev->dev.parent, dma_addr,
> bm_pool->buf_size, DMA_FROM_DEVICE,
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index cbbd0f6..9d59143 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -431,7 +431,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
> skb->protocol = eth_type_trans(skb, ndev);
>
> /* mark skb for recycling */
> - skb_mark_for_recycle(skb, page, pool);
> + skb_mark_for_recycle(skb);
> netif_receive_skb(skb);
>
> ndev->stats.rx_bytes += len;
> diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
> index 57d279f..a4234a3 100644
> --- a/drivers/net/ethernet/ti/cpsw_new.c
> +++ b/drivers/net/ethernet/ti/cpsw_new.c
> @@ -374,7 +374,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
> skb->protocol = eth_type_trans(skb, ndev);
>
> /* mark skb for recycling */
> - skb_mark_for_recycle(skb, page, pool);
> + skb_mark_for_recycle(skb);
> netif_receive_skb(skb);
>
> ndev->stats.rx_bytes += len;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index b2db9cd..7795979 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
> }
>
> #ifdef CONFIG_PAGE_POOL
> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,
> - struct page_pool *pp)
> +static inline void skb_mark_for_recycle(struct sk_buff *skb)
> {
> skb->pp_recycle = 1;
> - page_pool_store_mem_info(page, pp);
> }
> #endif
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 3dd62dd..8d7744d 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -253,11 +253,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)
> spin_unlock_bh(&pool->ring.producer_lock);
> }
>
> -/* Store mem_info on struct page and use it while recycling skb frags */
> -static inline
> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)
> -{
> - page->pp = pp;
> -}
> -
> #endif /* _NET_PAGE_POOL_H */
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 5e4eb45..78838c6 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -206,6 +206,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
> return true;
> }
>
> +static void page_pool_set_pp_info(struct page_pool *pool,
> + struct page *page)
> +{
> + page->pp = pool;
> + page->pp_magic |= PP_SIGNATURE;
> +}
> +
> +static void page_pool_clear_pp_info(struct page *page)
> +{
> + page->pp_magic = 0;
> + page->pp = NULL;
> +}
> +
> static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
> gfp_t gfp)
> {
> @@ -222,7 +235,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
> return NULL;
> }
>
> - page->pp_magic |= PP_SIGNATURE;
> + page_pool_set_pp_info(pool, page);
>
> /* Track how many pages are held 'in-flight' */
> pool->pages_state_hold_cnt++;
> @@ -266,7 +279,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> put_page(page);
> continue;
> }
> - page->pp_magic |= PP_SIGNATURE;
> +
> + page_pool_set_pp_info(pool, page);
> pool->alloc.cache[pool->alloc.count++] = page;
> /* Track how many pages are held 'in-flight' */
> pool->pages_state_hold_cnt++;
> @@ -345,7 +359,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
> DMA_ATTR_SKIP_CPU_SYNC);
> page_pool_set_dma_addr(page, 0);
> skip_dma_unmap:
> - page->pp_magic = 0;
> + page_pool_clear_pp_info(page);
>
> /* This may be the last page returned, releasing the pool, so
> * it is not safe to reference pool afterwards.
> @@ -644,7 +658,6 @@ bool page_pool_return_skb_page(struct page *page)
> * The page will be returned to the pool here regardless of the
> * 'flipped' fragment being in use or not.
> */
> - page->pp = NULL;
> page_pool_put_full_page(pp, page, false);
>
> return true;
> --
> 2.7.4
>
That's useful overall regardless of the frag allocation patchset.
The reason I avoided doing this in the original patchset was cases where an
XDP buffer gets converted to an SKB (e.g. XDP_PASS or REDIRECT). That
being said, I can't think of any case where marking the pages that page_pool
allocates with that special signature by default will cause failures.
Even if we convert it to an SKB, the packet will eventually be recycled
once the processing is over (assuming someone marks the skb for it).
If anyone can think of any case I missed please shout.
Reviewed-by: Ilias Apalodimas <[email protected]>
On Tue, Jul 13, 2021 at 2:25 AM Yunsheng Lin <[email protected]> wrote:
>
> As suggested by Alexander, "A DMA mapping should be page
> aligned anyway so the lower 12 bits would be reserved 0",
> so it might make more sense to repurpose the lower 12 bits
> of the dma address to store the bias for frag page support
> in page pool for 32 bit systems with 64 bit dma, which
> should be rare those days.
>
> For normal system, the dma_addr[1] in 'struct page' is not
> used, so we can reuse the dma_addr[1] for storing bias.
>
> The PAGE_POOP_USE_DMA_ADDR_1 macro is used to decide where
> to store the bias, as the "sizeof(dma_addr_t) > sizeof(
> unsigned long)" is false for normal system, so hopefully the
> compiler will optimize out the unused code for those system.
I assume the name is a typo and you meant PAGE_POOL_USE_DMA_ADDR_1?
> The newly added page_pool_set_bias() should be called before
> the page is passed to any user. Otherwise, call the newly
> added page_pool_atomic_sub_bias_return().
>
> Signed-off-by: Yunsheng Lin <[email protected]>
> ---
> include/net/page_pool.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++---
> net/core/page_pool.c | 10 +++++++
> 2 files changed, 77 insertions(+), 3 deletions(-)
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 8d7744d..315b9f2 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -198,21 +198,85 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
> page_pool_put_full_page(pool, page, true);
> }
>
> +#define PAGE_POOP_USE_DMA_ADDR_1 (sizeof(dma_addr_t) > sizeof(unsigned long))
> +
> static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
> {
> - dma_addr_t ret = page->dma_addr[0];
> - if (sizeof(dma_addr_t) > sizeof(unsigned long))
> + dma_addr_t ret;
> +
> + if (PAGE_POOP_USE_DMA_ADDR_1) {
> + ret = READ_ONCE(page->dma_addr[0]) & PAGE_MASK;
> ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
Alternatively, we could rename things a bit: keep the MSB of the DMA
address in a dma_addr field where dma_addr[1] is now, and rename
dma_addr[0] to pp_frag_count, which would also contain the lower
bits of the address. We could then handle it like so:
ret = page->dma_addr;
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
ret <<= 32;
ret |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK;
}
> + } else {
> + ret = page->dma_addr[0];
> + }
> +
> return ret;
> }
>
> static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
> {
> page->dma_addr[0] = addr;
> - if (sizeof(dma_addr_t) > sizeof(unsigned long))
> + if (PAGE_POOP_USE_DMA_ADDR_1)
> page->dma_addr[1] = upper_32_bits(addr);
So assuming similar logic to above we could do something like:
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
atomic_long_set(&page->pp_frag_count, addr & PAGE_MASK);
addr >>= 32;
}
pp->dma_addr = addr;
> }
>
> +static inline int page_pool_atomic_sub_bias_return(struct page *page, int nr)
> +{
> + int bias;
> +
> + if (PAGE_POOP_USE_DMA_ADDR_1) {
> + unsigned long *bias_ptr = &page->dma_addr[0];
> + unsigned long old_bias = READ_ONCE(*bias_ptr);
> + unsigned long new_bias;
> +
> + do {
> + bias = (int)(old_bias & ~PAGE_MASK);
> +
> + /* Warn when page_pool_dev_alloc_pages() is called
> + * with PP_FLAG_PAGE_FRAG flag in driver.
> + */
> + WARN_ON(!bias);
> +
> + /* already the last user */
> + if (!(bias - nr))
> + return 0;
> +
> + new_bias = old_bias - nr;
> + } while (!try_cmpxchg(bias_ptr, &old_bias, new_bias));
> +
> + WARN_ON((new_bias & PAGE_MASK) != (old_bias & PAGE_MASK));
> +
> + bias = new_bias & ~PAGE_MASK;
> + } else {
> + atomic_t *v = (atomic_t *)&page->dma_addr[1];
The problem with casting like this is that it makes assumptions about
byte ordering in the case that atomic_t is a 32b value and dma_addr is
a long value.
> +
> + if (atomic_read(v) == nr)
> + return 0;
> +
> + bias = atomic_sub_return(nr, v);
> + WARN_ON(bias < 0);
> + }
Rather than have 2 versions of this function it might work better to
just use the atomic_long version of these functions instead. Then you
shouldn't need to have two versions of the code.
You could just modify the block at the end to check for new_frag_count
vs old_frag_count if PAGE_POOL_DMA_USE_PP_FRAG_COUNT is true, or
new_frag_count < 0 if false.
> +
> + return bias;
> +}
> +
> +static inline void page_pool_set_bias(struct page *page, int bias)
> +{
> + if (PAGE_POOP_USE_DMA_ADDR_1) {
> + unsigned long dma_addr_0 = READ_ONCE(page->dma_addr[0]);
> +
> + dma_addr_0 &= PAGE_MASK;
> + dma_addr_0 |= bias;
> +
> + WRITE_ONCE(page->dma_addr[0], dma_addr_0);
> + } else {
> + atomic_t *v = (atomic_t *)&page->dma_addr[1];
> +
> + atomic_set(v, bias);
> + }
Similarly here you could just update bias to include the dma_addr in
the if case, and then use atomic_long_set for both cases.
> +}
> +
> static inline bool is_page_pool_compiled_in(void)
> {
> #ifdef CONFIG_PAGE_POOL
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 78838c6..6ac5b00 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -198,6 +198,16 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
> if (dma_mapping_error(pool->p.dev, dma))
> return false;
>
> + if (PAGE_POOP_USE_DMA_ADDR_1 &&
> + WARN_ON(pool->p.flags & PP_FLAG_PAGE_FRAG &&
> + dma & ~PAGE_MASK)) {
> + dma_unmap_page_attrs(pool->p.dev, dma,
> + PAGE_SIZE << pool->p.order,
> + pool->p.dma_dir,
> + DMA_ATTR_SKIP_CPU_SYNC);
> + return false;
> + }
> +
> page_pool_set_dma_addr(page, dma);
>
> if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
> --
> 2.7.4
>
On 2021/7/13 19:42, Denis Kirjanov wrote:
> On 7/13/21, Yunsheng Lin <[email protected]> wrote:
>> This patch adds skb's frag page recycling support based on
>> the elevated refcnt support in page pool.
>>
>> The performance improves above 10~20% with IOMMU disabled.
>> The performance improves about 200% when IOMMU is enabled
>> and iperf server shares the same cpu with irq/NAPI.
>
> Could you share workload details?
The test case is simple: iperf TCP with only one thread.
The iperf server and the NAPI softirq are pinned to the
same CPU. The performance improves from 14Gbit to 33Gbit when
SMMU is in strict mode, so the above statement is not accurate; it
should be "improves to about 200% when IOMMU is in strict mode".
>
>>
On 2021/7/14 2:41, Alexander Duyck wrote:
> On Tue, Jul 13, 2021 at 2:25 AM Yunsheng Lin <[email protected]> wrote:
>>
>> As suggested by Alexander, "A DMA mapping should be page
>> aligned anyway so the lower 12 bits would be reserved 0",
>> so it might make more sense to repurpose the lower 12 bits
>> of the dma address to store the bias for frag page support
>> in page pool for 32 bit systems with 64 bit dma, which
>> should be rare those days.
>>
>> For normal system, the dma_addr[1] in 'struct page' is not
>> used, so we can reuse the dma_addr[1] for storing bias.
>>
>> The PAGE_POOP_USE_DMA_ADDR_1 macro is used to decide where
>> to store the bias, as the "sizeof(dma_addr_t) > sizeof(
>> unsigned long)" is false for normal system, so hopefully the
>> compiler will optimize out the unused code for those system.
>
> I assume the name is a typo and you meant PAGE_POOL_USE_DMA_ADDR_1?
Yes, will use the PAGE_POOL_DMA_USE_PP_FRAG_COUNT you suggested below.
>
>> The newly added page_pool_set_bias() should be called before
>> the page is passed to any user. Otherwise, call the newly
>> added page_pool_atomic_sub_bias_return().
>>
>> Signed-off-by: Yunsheng Lin <[email protected]>
>> ---
>> include/net/page_pool.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++---
>> net/core/page_pool.c | 10 +++++++
>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
>> index 8d7744d..315b9f2 100644
>> --- a/include/net/page_pool.h
>> +++ b/include/net/page_pool.h
>> @@ -198,21 +198,85 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
>> page_pool_put_full_page(pool, page, true);
>> }
>>
>> +#define PAGE_POOP_USE_DMA_ADDR_1 (sizeof(dma_addr_t) > sizeof(unsigned long))
>> +
>> static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
>> {
>> - dma_addr_t ret = page->dma_addr[0];
>> - if (sizeof(dma_addr_t) > sizeof(unsigned long))
>> + dma_addr_t ret;
>> +
>> + if (PAGE_POOP_USE_DMA_ADDR_1) {
>> + ret = READ_ONCE(page->dma_addr[0]) & PAGE_MASK;
>> ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
>
> Alternatively we could change things a bit and rename things so we
> have the MSB of dma_addr where dma_addr[1] is and we rename
> dma_addr[0] to pp_frag_count we could have it also contain the lower
> bits and handle it like so:
> ret = page->dma_addr;
> if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
> ret <<= 32;
> ret |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK;
> }
Ok, it seems better.
>
>> + } else {
>> + ret = page->dma_addr[0];
>> + }
>> +
>> return ret;
>> }
>>
>> static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
>> {
>> page->dma_addr[0] = addr;
>> - if (sizeof(dma_addr_t) > sizeof(unsigned long))
>> + if (PAGE_POOP_USE_DMA_ADDR_1)
>> page->dma_addr[1] = upper_32_bits(addr);
>
> So assuming similar logic to above we could do something like:
> if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
> atomic_long_set(&page->pp_frag_count, addr & PAGE_MASK);
> addr >>= 32;
> }
> pp->dma_addr = addr;
ok.
>
>> }
>>
>> +static inline int page_pool_atomic_sub_bias_return(struct page *page, int nr)
>> +{
>> + int bias;
>> +
>> + if (PAGE_POOP_USE_DMA_ADDR_1) {
>> + unsigned long *bias_ptr = &page->dma_addr[0];
>> + unsigned long old_bias = READ_ONCE(*bias_ptr);
>> + unsigned long new_bias;
>> +
>> + do {
>> + bias = (int)(old_bias & ~PAGE_MASK);
>> +
>> + /* Warn when page_pool_dev_alloc_pages() is called
>> + * with PP_FLAG_PAGE_FRAG flag in driver.
>> + */
>> + WARN_ON(!bias);
>> +
>> + /* already the last user */
>> + if (!(bias - nr))
>> + return 0;
>> +
>> + new_bias = old_bias - nr;
>> + } while (!try_cmpxchg(bias_ptr, &old_bias, new_bias));
>> +
>> + WARN_ON((new_bias & PAGE_MASK) != (old_bias & PAGE_MASK));
>> +
>> + bias = new_bias & ~PAGE_MASK;
>> + } else {
>> + atomic_t *v = (atomic_t *)&page->dma_addr[1];
>
> The problem with casting like this is that it makes assumptions about
> byte ordering in the case that atomic_t is a 32b value and dma_addr is
> a long value.
Will define pp_frag_count with type atomic_long_t to replace
dma_addr[1].
>
>> +
>> + if (atomic_read(v) == nr)
>> + return 0;
>> +
>> + bias = atomic_sub_return(nr, v);
>> + WARN_ON(bias < 0);
>> + }
>
> Rather than have 2 versions of this function it might work better to
> just use the atomic_long version of these functions instead. Then you
> shouldn't need to have two versions of the code.
>
> You could just modify the block on the end to check for new_frag_count
> vs old_frag_count if PAGE_POOL_USE_PP_FRAG_COUNT is true, or
> new_frag_count < 0 if false.
When implementing the above, it seems it may still be better to keep two
big blocks, even with both using atomic_long_sub_return(); otherwise we
may end up with many small blocks.
>
>> +
>> + return bias;
>> +}
>> +
>> +static inline void page_pool_set_bias(struct page *page, int bias)
>> +{
>> + if (PAGE_POOP_USE_DMA_ADDR_1) {
>> + unsigned long dma_addr_0 = READ_ONCE(page->dma_addr[0]);
>> +
>> + dma_addr_0 &= PAGE_MASK;
>> + dma_addr_0 |= bias;
>> +
>> + WRITE_ONCE(page->dma_addr[0], dma_addr_0);
>> + } else {
>> + atomic_t *v = (atomic_t *)&page->dma_addr[1];
>> +
>> + atomic_set(v, bias);
>> + }
>
> Similarly here you could just update bias to include the dma_addr in
> the if case, and then use atomic_long_set for both cases.
ok.
>
>> +}
>> +
>> static inline bool is_page_pool_compiled_in(void)
>> {
>> #ifdef CONFIG_PAGE_POOL
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index 78838c6..6ac5b00 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
>> @@ -198,6 +198,16 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
>> if (dma_mapping_error(pool->p.dev, dma))
>> return false;
>>
>> + if (PAGE_POOP_USE_DMA_ADDR_1 &&
>> + WARN_ON(pool->p.flags & PP_FLAG_PAGE_FRAG &&
>> + dma & ~PAGE_MASK)) {
>> + dma_unmap_page_attrs(pool->p.dev, dma,
>> + PAGE_SIZE << pool->p.order,
>> + pool->p.dma_dir,
>> + DMA_ATTR_SKIP_CPU_SYNC);
>> + return false;
>> + }
>> +
>> page_pool_set_dma_addr(page, dma);
>>
>> if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
>> --
>> 2.7.4
>>
> .
>