2023-03-19 19:58:33

by Kal Cutter Conley

[permalink] [raw]
Subject: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

Add core AF_XDP support for chunk sizes larger than PAGE_SIZE. This
enables sending/receiving jumbo ethernet frames up to the theoretical
maxiumum of 64 KiB. For chunk sizes > PAGE_SIZE, the UMEM is required
to consist of HugeTLB VMAs (and be hugepage aligned). Initially, only
XDP_COPY mode is usuable pending future driver work.

For consistency, check for HugeTLB pages during UMEM registration. This
implies that hugepages are required for XDP_COPY mode despite DMA not
being used. This restriction is desirable since it ensures user software
can take advantage of future driver support.

Even in HugeTLB mode, continue to do page accounting using order-0
(4 KiB) pages. This minimizes the size of this change and reduces the
risk of impacting driver code. Taking full advantage of hugepages for
accounting should improve XDP performance in the general case.

No significant change in RX/TX performance was observed with this patch.
A few data points are reproduced below:

Machine : Dell PowerEdge R940
CPU : Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz
NIC : MT27700 Family [ConnectX-4]

+-----+------------+-------------+---------------+
| | frame size | packet size | rxdrop (Mpps) |
+-----+------------+-------------+---------------+
| old | 4000 | 320 | 15.7 |
| new | 4000 | 320 | 15.8 |
+-----+------------+-------------+---------------+
| old | 4096 | 320 | 16.4 |
| new | 4096 | 320 | 16.3 |
+-----+------------+-------------+---------------+
| new | 9000 | 320 | 6.3 |
| new | 10240 | 9000 | 0.4 |
+-----+------------+-------------+---------------+

Signed-off-by: Kal Conley <[email protected]>
---
include/net/xdp_sock.h | 1 +
include/net/xdp_sock_drv.h | 6 +++++
include/net/xsk_buff_pool.h | 4 +++-
net/xdp/xdp_umem.c | 46 +++++++++++++++++++++++++++++--------
net/xdp/xsk.c | 3 +++
net/xdp/xsk_buff_pool.c | 16 +++++++++----
6 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3057e1a4a11c..e562ac3f5295 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_umem {
struct user_struct *user;
refcount_t users;
u8 flags;
+ bool hugetlb;
bool zc;
struct page **pgs;
int id;
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 9c0d860609ba..eb630d17f994 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -12,6 +12,12 @@
#define XDP_UMEM_MIN_CHUNK_SHIFT 11
#define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)

+/* Allow chunk sizes up to the maximum size of an ethernet frame (64 KiB).
+ * Larger chunks are not guaranteed to fit in a single SKB.
+ */
+#define XDP_UMEM_MAX_CHUNK_SHIFT 16
+#define XDP_UMEM_MAX_CHUNK_SIZE (1 << XDP_UMEM_MAX_CHUNK_SHIFT)
+
#ifdef CONFIG_XDP_SOCKETS

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 3e952e569418..69e3970da092 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -78,6 +78,7 @@ struct xsk_buff_pool {
u8 cached_need_wakeup;
bool uses_need_wakeup;
bool dma_need_sync;
+ bool hugetlb;
bool unaligned;
void *addrs;
/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
@@ -175,7 +176,8 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
u64 addr, u32 len)
{
- bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
+ bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
+ (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;

if (likely(!cross_pg))
return false;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 02207e852d79..c96eefb9f5ae 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -10,6 +10,7 @@
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/hugetlb_inline.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
@@ -19,6 +20,9 @@
#include "xdp_umem.h"
#include "xsk_queue.h"

+_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
+_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
+
static DEFINE_IDA(umem_ida);

static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@@ -91,7 +95,26 @@ void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
}
}

-static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
+/* Returns true if the UMEM contains HugeTLB pages exclusively, false otherwise.
+ *
+ * The mmap_lock must be held by the caller.
+ */
+static bool xdp_umem_is_hugetlb(struct xdp_umem *umem, unsigned long address)
+{
+ unsigned long end = address + umem->npgs * PAGE_SIZE;
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+
+ vma_iter_init(&vmi, current->mm, address);
+ for_each_vma_range(vmi, vma, end) {
+ if (!is_vm_hugetlb_page(vma))
+ return false;
+ }
+
+ return true;
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address, bool hugetlb)
{
unsigned int gup_flags = FOLL_WRITE;
long npgs;
@@ -102,8 +125,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
return -ENOMEM;

mmap_read_lock(current->mm);
+
+ umem->hugetlb = IS_ALIGNED(address, HPAGE_SIZE) && xdp_umem_is_hugetlb(umem, address);
+ if (hugetlb && !umem->hugetlb) {
+ mmap_read_unlock(current->mm);
+ err = -EINVAL;
+ goto out_pgs;
+ }
+
npgs = pin_user_pages(address, umem->npgs,
gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
+
mmap_read_unlock(current->mm);

if (npgs != umem->npgs) {
@@ -152,20 +184,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+ bool hugetlb = chunk_size > PAGE_SIZE;
u64 addr = mr->addr, size = mr->len;
u32 chunks_rem, npgs_rem;
u64 chunks, npgs;
int err;

- if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
- /* Strictly speaking we could support this, if:
- * - huge pages, or*
- * - using an IOMMU, or
- * - making sure the memory area is consecutive
- * but for now, we simply say "computer says no".
- */
+ if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > XDP_UMEM_MAX_CHUNK_SIZE)
return -EINVAL;
- }

if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return -EINVAL;
@@ -215,7 +241,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
return err;

- err = xdp_umem_pin_pages(umem, (unsigned long)addr);
+ err = xdp_umem_pin_pages(umem, (unsigned long)addr, hugetlb);
if (err)
goto out_account;

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2ac58b282b5e..3899a2d235bb 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -421,6 +421,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}

+/* Chunks must fit in the SKB `frags` array. */
+_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
+
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
struct xdp_desc *desc)
{
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index b2df1e0f8153..777e8a38a232 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -80,6 +80,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
pool->headroom = umem->headroom;
pool->chunk_size = umem->chunk_size;
pool->chunk_shift = ffs(umem->chunk_size) - 1;
+ pool->hugetlb = umem->hugetlb;
pool->unaligned = unaligned;
pool->frame_len = umem->chunk_size - umem->headroom -
XDP_PACKET_HEADROOM;
@@ -369,16 +370,23 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
}
EXPORT_SYMBOL(xp_dma_unmap);

-static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
+/* HugeTLB pools consider contiguity at hugepage granularity only. Hence, all
+ * order-0 pages within a hugepage have the same contiguity value.
+ */
+static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map, bool hugetlb)
{
+ u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
+ u32 n = page_size >> PAGE_SHIFT;
u32 i;

- for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
- if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
+ for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
+ if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
else
dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
}
+ for (; i < dma_map->dma_pages_cnt; i++)
+ dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
}

static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
@@ -441,7 +449,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
}

if (pool->unaligned)
- xp_check_dma_contiguity(dma_map);
+ xp_check_dma_contiguity(dma_map, pool->hugetlb);

err = xp_init_dma_info(pool, dma_map);
if (err) {
--
2.39.2



2023-03-19 22:56:35

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

Hi Kal,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf/master]
[also build test WARNING on next-20230317]
[cannot apply to bpf-next/master linus/master v6.3-rc3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Kal-Conley/xsk-Support-UMEM-chunk_size-PAGE_SIZE/20230320-035849
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git master
patch link: https://lore.kernel.org/r/20230319195656.326701-2-kal.conley%40dectris.com
patch subject: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE
config: s390-randconfig-r016-20230319 (https://download.01.org/0day-ci/archive/20230320/[email protected]/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project 67409911353323ca5edf2049ef0df54132fa1ca7)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install s390 cross compiling tool for clang build
# apt-get install binutils-s390x-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/bbcc35c4ff807754bf61ef2c1f11195533e53de0
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Kal-Conley/xsk-Support-UMEM-chunk_size-PAGE_SIZE/20230320-035849
git checkout bbcc35c4ff807754bf61ef2c1f11195533e53de0
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All warnings (new ones prefixed by >>):

In file included from net/xdp/xsk.c:23:
In file included from include/linux/netdevice.h:38:
In file included from include/net/net_namespace.h:43:
In file included from include/linux/skbuff.h:28:
In file included from include/linux/dma-mapping.h:10:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:75:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __raw_readb(PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:37:59: note: expanded from macro '__le16_to_cpu'
#define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x))
^
include/uapi/linux/swab.h:102:54: note: expanded from macro '__swab16'
#define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
^
In file included from net/xdp/xsk.c:23:
In file included from include/linux/netdevice.h:38:
In file included from include/net/net_namespace.h:43:
In file included from include/linux/skbuff.h:28:
In file included from include/linux/dma-mapping.h:10:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:75:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:35:59: note: expanded from macro '__le32_to_cpu'
#define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
^
include/uapi/linux/swab.h:115:54: note: expanded from macro '__swab32'
#define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
^
In file included from net/xdp/xsk.c:23:
In file included from include/linux/netdevice.h:38:
In file included from include/net/net_namespace.h:43:
In file included from include/linux/skbuff.h:28:
In file included from include/linux/dma-mapping.h:10:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:75:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writeb(value, PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:692:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
readsb(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:700:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
readsw(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:708:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
readsl(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:717:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
writesb(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:726:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
writesw(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:735:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
writesl(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
>> net/xdp/xsk.c:425:68: warning: '_Static_assert' with no message is a C2x extension [-Wc2x-extensions]
_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
^
, ""
13 warnings generated.
--
In file included from net/xdp/xdp_umem.c:15:
In file included from include/linux/netdevice.h:38:
In file included from include/net/net_namespace.h:43:
In file included from include/linux/skbuff.h:28:
In file included from include/linux/dma-mapping.h:10:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:75:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __raw_readb(PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:37:59: note: expanded from macro '__le16_to_cpu'
#define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x))
^
include/uapi/linux/swab.h:102:54: note: expanded from macro '__swab16'
#define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
^
In file included from net/xdp/xdp_umem.c:15:
In file included from include/linux/netdevice.h:38:
In file included from include/net/net_namespace.h:43:
In file included from include/linux/skbuff.h:28:
In file included from include/linux/dma-mapping.h:10:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:75:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:35:59: note: expanded from macro '__le32_to_cpu'
#define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
^
include/uapi/linux/swab.h:115:54: note: expanded from macro '__swab32'
#define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
^
In file included from net/xdp/xdp_umem.c:15:
In file included from include/linux/netdevice.h:38:
In file included from include/net/net_namespace.h:43:
In file included from include/linux/skbuff.h:28:
In file included from include/linux/dma-mapping.h:10:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:75:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writeb(value, PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:692:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
readsb(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:700:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
readsw(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:708:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
readsl(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:717:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
writesb(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:726:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
writesw(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
include/asm-generic/io.h:735:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
writesl(PCI_IOBASE + addr, buffer, count);
~~~~~~~~~~ ^
>> net/xdp/xdp_umem.c:23:52: warning: '_Static_assert' with no message is a C2x extension [-Wc2x-extensions]
_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
^
, ""
net/xdp/xdp_umem.c:24:53: warning: '_Static_assert' with no message is a C2x extension [-Wc2x-extensions]
_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
^
, ""
14 warnings generated.


vim +/_Static_assert +425 net/xdp/xsk.c

423
424 /* Chunks must fit in the SKB `frags` array. */
> 425 _Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
426

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

2023-03-20 01:09:17

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

Hi Kal,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf/master]
[also build test ERROR on next-20230317]
[cannot apply to bpf-next/master linus/master v6.3-rc3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Kal-Conley/xsk-Support-UMEM-chunk_size-PAGE_SIZE/20230320-035849
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git master
patch link: https://lore.kernel.org/r/20230319195656.326701-2-kal.conley%40dectris.com
patch subject: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE
config: arm-mvebu_v5_defconfig (https://download.01.org/0day-ci/archive/20230320/[email protected]/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project 67409911353323ca5edf2049ef0df54132fa1ca7)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install arm cross compiling tool for clang build
# apt-get install binutils-arm-linux-gnueabi
# https://github.com/intel-lab-lkp/linux/commit/bbcc35c4ff807754bf61ef2c1f11195533e53de0
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Kal-Conley/xsk-Support-UMEM-chunk_size-PAGE_SIZE/20230320-035849
git checkout bbcc35c4ff807754bf61ef2c1f11195533e53de0
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash net/core/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

In file included from net/core/xdp.c:22:
In file included from include/net/xdp_sock_drv.h:10:
>> include/net/xsk_buff_pool.h:179:43: error: use of undeclared identifier 'HPAGE_SIZE'
bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
^
include/net/xsk_buff_pool.h:179:68: error: use of undeclared identifier 'HPAGE_SIZE'
bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
^
2 errors generated.


vim +/HPAGE_SIZE +179 include/net/xsk_buff_pool.h

175
176 static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
177 u64 addr, u32 len)
178 {
> 179 bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
180 (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
181
182 if (likely(!cross_pg))
183 return false;
184
185 if (pool->dma_pages_cnt) {
186 return !(pool->dma_pages[addr >> PAGE_SHIFT] &
187 XSK_NEXT_PG_CONTIG_MASK);
188 }
189
190 /* skb path */
191 return addr + len > pool->addrs_cnt;
192 }
193

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

2023-03-20 20:46:14

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

Hi Kal,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf/master]
[also build test ERROR on next-20230320]
[cannot apply to bpf-next/master linus/master v6.3-rc3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Kal-Conley/xsk-Support-UMEM-chunk_size-PAGE_SIZE/20230320-035849
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git master
patch link: https://lore.kernel.org/r/20230319195656.326701-2-kal.conley%40dectris.com
patch subject: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE
config: mips-randconfig-r011-20230320 (https://download.01.org/0day-ci/archive/20230321/[email protected]/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project 67409911353323ca5edf2049ef0df54132fa1ca7)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install mips cross compiling tool for clang build
# apt-get install binutils-mipsel-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/bbcc35c4ff807754bf61ef2c1f11195533e53de0
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Kal-Conley/xsk-Support-UMEM-chunk_size-PAGE_SIZE/20230320-035849
git checkout bbcc35c4ff807754bf61ef2c1f11195533e53de0
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash net/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

net/xdp/xsk.c:425:68: warning: '_Static_assert' with no message is a C2x extension [-Wc2x-extensions]
_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
^
, ""
In file included from net/xdp/xsk.c:26:
In file included from include/net/xdp_sock_drv.h:10:
>> include/net/xsk_buff_pool.h:179:43: error: call to '__compiletime_assert_494' declared with 'error' attribute: BUILD_BUG failed
bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
^
arch/mips/include/asm/page.h:68:22: note: expanded from macro 'HPAGE_SIZE'
#define HPAGE_SIZE ({BUILD_BUG(); 0; })
^
include/linux/build_bug.h:59:21: note: expanded from macro 'BUILD_BUG'
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
^
include/linux/build_bug.h:39:37: note: expanded from macro 'BUILD_BUG_ON_MSG'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
include/linux/compiler_types.h:385:2: note: expanded from macro '_compiletime_assert'
__compiletime_assert(condition, msg, prefix, suffix)
^
include/linux/compiler_types.h:378:4: note: expanded from macro '__compiletime_assert'
prefix ## suffix(); \
^
<scratch space>:191:1: note: expanded from here
__compiletime_assert_494
^
1 warning and 1 error generated.
--
net/xdp/xdp_umem.c:23:52: warning: '_Static_assert' with no message is a C2x extension [-Wc2x-extensions]
_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
^
, ""
>> net/xdp/xdp_umem.c:24:43: error: statement expression not allowed at file scope
_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
^
arch/mips/include/asm/page.h:68:20: note: expanded from macro 'HPAGE_SIZE'
#define HPAGE_SIZE ({BUILD_BUG(); 0; })
^
1 warning and 1 error generated.
--
>> net/xdp/xsk_buff_pool.c:378:28: error: call to '__compiletime_assert_507' declared with 'error' attribute: BUILD_BUG failed
u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
^
arch/mips/include/asm/page.h:68:22: note: expanded from macro 'HPAGE_SIZE'
#define HPAGE_SIZE ({BUILD_BUG(); 0; })
^
include/linux/build_bug.h:59:21: note: expanded from macro 'BUILD_BUG'
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
^
include/linux/build_bug.h:39:37: note: expanded from macro 'BUILD_BUG_ON_MSG'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
include/linux/compiler_types.h:385:2: note: expanded from macro '_compiletime_assert'
__compiletime_assert(condition, msg, prefix, suffix)
^
include/linux/compiler_types.h:378:4: note: expanded from macro '__compiletime_assert'
prefix ## suffix(); \
^
<scratch space>:82:1: note: expanded from here
__compiletime_assert_507
^
In file included from net/xdp/xsk_buff_pool.c:3:
include/net/xsk_buff_pool.h:179:43: error: call to '__compiletime_assert_350' declared with 'error' attribute: BUILD_BUG failed
bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
^
arch/mips/include/asm/page.h:68:22: note: expanded from macro 'HPAGE_SIZE'
#define HPAGE_SIZE ({BUILD_BUG(); 0; })
^
include/linux/build_bug.h:59:21: note: expanded from macro 'BUILD_BUG'
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
^
include/linux/build_bug.h:39:37: note: expanded from macro 'BUILD_BUG_ON_MSG'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
include/linux/compiler_types.h:385:2: note: expanded from macro '_compiletime_assert'
__compiletime_assert(condition, msg, prefix, suffix)
^
include/linux/compiler_types.h:378:4: note: expanded from macro '__compiletime_assert'
prefix ## suffix(); \
^
<scratch space>:21:1: note: expanded from here
__compiletime_assert_350
^
In file included from net/xdp/xsk_buff_pool.c:3:
include/net/xsk_buff_pool.h:179:43: error: call to '__compiletime_assert_350' declared with 'error' attribute: BUILD_BUG failed
arch/mips/include/asm/page.h:68:22: note: expanded from macro 'HPAGE_SIZE'
#define HPAGE_SIZE ({BUILD_BUG(); 0; })
^
include/linux/build_bug.h:59:21: note: expanded from macro 'BUILD_BUG'
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
^
include/linux/build_bug.h:39:37: note: expanded from macro 'BUILD_BUG_ON_MSG'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
include/linux/compiler_types.h:385:2: note: expanded from macro '_compiletime_assert'
__compiletime_assert(condition, msg, prefix, suffix)
^
include/linux/compiler_types.h:378:4: note: expanded from macro '__compiletime_assert'
prefix ## suffix(); \
^
<scratch space>:21:1: note: expanded from here
__compiletime_assert_350
^
3 errors generated.


vim +179 include/net/xsk_buff_pool.h

175
176 static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
177 u64 addr, u32 len)
178 {
> 179 bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
180 (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
181
182 if (likely(!cross_pg))
183 return false;
184
185 if (pool->dma_pages_cnt) {
186 return !(pool->dma_pages[addr >> PAGE_SHIFT] &
187 XSK_NEXT_PG_CONTIG_MASK);
188 }
189
190 /* skb path */
191 return addr + len > pool->addrs_cnt;
192 }
193

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

2023-03-21 08:43:16

by Magnus Karlsson

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

On Sun, 19 Mar 2023 at 21:03, Kal Conley <[email protected]> wrote:
>
> Add core AF_XDP support for chunk sizes larger than PAGE_SIZE. This
> enables sending/receiving jumbo ethernet frames up to the theoretical
> maxiumum of 64 KiB. For chunk sizes > PAGE_SIZE, the UMEM is required
> to consist of HugeTLB VMAs (and be hugepage aligned). Initially, only
> XDP_COPY mode is usuable pending future driver work.

nit: useable

> For consistency, check for HugeTLB pages during UMEM registration. This
> implies that hugepages are required for XDP_COPY mode despite DMA not
> being used. This restriction is desirable since it ensures user software
> can take advantage of future driver support.
>
> Even in HugeTLB mode, continue to do page accounting using order-0
> (4 KiB) pages. This minimizes the size of this change and reduces the
> risk of impacting driver code. Taking full advantage of hugepages for
> accounting should improve XDP performance in the general case.

Thank you Kal for working on this. Interesting stuff.

First some general comments and questions on the patch set:

* Please document this new feature in Documentation/networking/af_xdp.rst
* Have you verified the SKB path for Rx? Tx was exercised by running l2fwd.
* Have you checked that an XDP program can access the full >4K packet?
The xdp_buff has no problem with this as the buffer is consecutive,
but just wondering if there is some other check or limit in there?
Jesper and Toke will likely know, so roping them in.
* Would be interesting to know your thoughts about taking this to
zero-copy mode too. It would be good if you could support all modes
from the get go, instead of partial support for some unknown amount of
time and then zero-copy support. Partial support makes using the
feature more cumbersome when an app is deployed on various systems.
The continuity checking code you have at the end of the patch is a
step in the direction of zero-copy support, it seems.
* What happens if I try to run this in zero-copy mode with a chunk_size > 4K?
* There are some compilation errors to fix from the kernel test robot

> No significant change in RX/TX performance was observed with this patch.
> A few data points are reproduced below:

Looks good.

> Machine : Dell PowerEdge R940
> CPU : Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz
> NIC : MT27700 Family [ConnectX-4]
>
> +-----+------------+-------------+---------------+
> | | frame size | packet size | rxdrop (Mpps) |
> +-----+------------+-------------+---------------+
> | old | 4000 | 320 | 15.7 |
> | new | 4000 | 320 | 15.8 |
> +-----+------------+-------------+---------------+
> | old | 4096 | 320 | 16.4 |
> | new | 4096 | 320 | 16.3 |
> +-----+------------+-------------+---------------+
> | new | 9000 | 320 | 6.3 |
> | new | 10240 | 9000 | 0.4 |
> +-----+------------+-------------+---------------+
>
> Signed-off-by: Kal Conley <[email protected]>
> ---
> include/net/xdp_sock.h | 1 +
> include/net/xdp_sock_drv.h | 6 +++++
> include/net/xsk_buff_pool.h | 4 +++-
> net/xdp/xdp_umem.c | 46 +++++++++++++++++++++++++++++--------
> net/xdp/xsk.c | 3 +++
> net/xdp/xsk_buff_pool.c | 16 +++++++++----
> 6 files changed, 61 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 3057e1a4a11c..e562ac3f5295 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -28,6 +28,7 @@ struct xdp_umem {
> struct user_struct *user;
> refcount_t users;
> u8 flags;
> + bool hugetlb;
> bool zc;
> struct page **pgs;
> int id;
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index 9c0d860609ba..eb630d17f994 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -12,6 +12,12 @@
> #define XDP_UMEM_MIN_CHUNK_SHIFT 11
> #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)
>
> +/* Allow chunk sizes up to the maximum size of an ethernet frame (64 KiB).
> + * Larger chunks are not guaranteed to fit in a single SKB.
> + */
> +#define XDP_UMEM_MAX_CHUNK_SHIFT 16
> +#define XDP_UMEM_MAX_CHUNK_SIZE (1 << XDP_UMEM_MAX_CHUNK_SHIFT)
> +
> #ifdef CONFIG_XDP_SOCKETS
>
> void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index 3e952e569418..69e3970da092 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -78,6 +78,7 @@ struct xsk_buff_pool {
> u8 cached_need_wakeup;
> bool uses_need_wakeup;
> bool dma_need_sync;
> + bool hugetlb;
> bool unaligned;
> void *addrs;
> /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
> @@ -175,7 +176,8 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
> static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
> u64 addr, u32 len)
> {
> - bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
> + bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
> + (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
>
> if (likely(!cross_pg))
> return false;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 02207e852d79..c96eefb9f5ae 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -10,6 +10,7 @@
> #include <linux/uaccess.h>
> #include <linux/slab.h>
> #include <linux/bpf.h>
> +#include <linux/hugetlb_inline.h>
> #include <linux/mm.h>
> #include <linux/netdevice.h>
> #include <linux/rtnetlink.h>
> @@ -19,6 +20,9 @@
> #include "xdp_umem.h"
> #include "xsk_queue.h"
>
> +_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
> +_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
> +
> static DEFINE_IDA(umem_ida);
>
> static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> @@ -91,7 +95,26 @@ void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
> }
> }
>
> -static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> +/* Returns true if the UMEM contains HugeTLB pages exclusively, false otherwise.
> + *
> + * The mmap_lock must be held by the caller.
> + */
> +static bool xdp_umem_is_hugetlb(struct xdp_umem *umem, unsigned long address)
> +{
> + unsigned long end = address + umem->npgs * PAGE_SIZE;
> + struct vm_area_struct *vma;
> + struct vma_iterator vmi;
> +
> + vma_iter_init(&vmi, current->mm, address);
> + for_each_vma_range(vmi, vma, end) {
> + if (!is_vm_hugetlb_page(vma))
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address, bool hugetlb)
> {
> unsigned int gup_flags = FOLL_WRITE;
> long npgs;
> @@ -102,8 +125,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> return -ENOMEM;
>
> mmap_read_lock(current->mm);
> +
> + umem->hugetlb = IS_ALIGNED(address, HPAGE_SIZE) && xdp_umem_is_hugetlb(umem, address);
> + if (hugetlb && !umem->hugetlb) {
> + mmap_read_unlock(current->mm);
> + err = -EINVAL;
> + goto out_pgs;
> + }
> +
> npgs = pin_user_pages(address, umem->npgs,
> gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
> +
> mmap_read_unlock(current->mm);
>
> if (npgs != umem->npgs) {
> @@ -152,20 +184,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> {
> bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
> u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
> + bool hugetlb = chunk_size > PAGE_SIZE;

require_hugetlb would be a clearer name

> u64 addr = mr->addr, size = mr->len;
> u32 chunks_rem, npgs_rem;
> u64 chunks, npgs;
> int err;
>
> - if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
> - /* Strictly speaking we could support this, if:
> - * - huge pages, or*
> - * - using an IOMMU, or
> - * - making sure the memory area is consecutive
> - * but for now, we simply say "computer says no".
> - */
> + if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > XDP_UMEM_MAX_CHUNK_SIZE)
> return -EINVAL;
> - }
>
> if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> return -EINVAL;
> @@ -215,7 +241,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> if (err)
> return err;
>
> - err = xdp_umem_pin_pages(umem, (unsigned long)addr);
> + err = xdp_umem_pin_pages(umem, (unsigned long)addr, hugetlb);
> if (err)
> goto out_account;
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 2ac58b282b5e..3899a2d235bb 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -421,6 +421,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +/* Chunks must fit in the SKB `frags` array. */
> +_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
> +
> static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> struct xdp_desc *desc)
> {
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index b2df1e0f8153..777e8a38a232 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -80,6 +80,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> pool->headroom = umem->headroom;
> pool->chunk_size = umem->chunk_size;
> pool->chunk_shift = ffs(umem->chunk_size) - 1;
> + pool->hugetlb = umem->hugetlb;
> pool->unaligned = unaligned;
> pool->frame_len = umem->chunk_size - umem->headroom -
> XDP_PACKET_HEADROOM;
> @@ -369,16 +370,23 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
> }
> EXPORT_SYMBOL(xp_dma_unmap);
>
> -static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
> +/* HugeTLB pools consider contiguity at hugepage granularity only. Hence, all
> + * order-0 pages within a hugepage have the same contiguity value.
> + */
> +static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map, bool hugetlb)
> {
> + u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
> + u32 n = page_size >> PAGE_SHIFT;

next_mapping? n as a name is not very descriptive.

> u32 i;
>
> - for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
> - if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
> + for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
> + if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
> dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
> else
> dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
> }
> + for (; i < dma_map->dma_pages_cnt; i++)
> + dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;

Is this not too conservative? If your umem consists of two huge pages
mappings but they are not mapped consecutively in physical memory, you
are going to mark all the chunks as non-consecutive. Would it not be
better to just look chunk_size ahead of you instead of page_size
above? The only thing you care about is that the chunk you are in is
in consecutive physical memory, and that is strictly only true for
zero-copy mode. So this seems to be in preparation for zero-copy mode.


> }
>
> static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
> @@ -441,7 +449,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
> }
>
> if (pool->unaligned)
> - xp_check_dma_contiguity(dma_map);
> + xp_check_dma_contiguity(dma_map, pool->hugetlb);
>
> err = xp_init_dma_info(pool, dma_map);
> if (err) {
> --
> 2.39.2
>

2023-03-21 10:48:56

by Maciej Fijalkowski

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

On Sun, Mar 19, 2023 at 08:56:54PM +0100, Kal Conley wrote:
> Add core AF_XDP support for chunk sizes larger than PAGE_SIZE. This
> enables sending/receiving jumbo ethernet frames up to the theoretical
> maxiumum of 64 KiB. For chunk sizes > PAGE_SIZE, the UMEM is required
> to consist of HugeTLB VMAs (and be hugepage aligned). Initially, only
> XDP_COPY mode is usuable pending future driver work.
>
> For consistency, check for HugeTLB pages during UMEM registration. This
> implies that hugepages are required for XDP_COPY mode despite DMA not
> being used. This restriction is desirable since it ensures user software
> can take advantage of future driver support.
>
> Even in HugeTLB mode, continue to do page accounting using order-0
> (4 KiB) pages. This minimizes the size of this change and reduces the
> risk of impacting driver code. Taking full advantage of hugepages for
> accounting should improve XDP performance in the general case.
>
> No significant change in RX/TX performance was observed with this patch.
> A few data points are reproduced below:
>
> Machine : Dell PowerEdge R940
> CPU : Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz
> NIC : MT27700 Family [ConnectX-4]

can you tell us a bit more about your environment setup - from what i can
tell mlx4 does not support xdp multi-buffer, so you were testing this
feature with XDP program attached in generic mode?

if you were using xdpsock -S or xdpsock -c? or maybe even something
different?

what was your MTU size on NIC?

>
> +-----+------------+-------------+---------------+
> | | frame size | packet size | rxdrop (Mpps) |
> +-----+------------+-------------+---------------+
> | old | 4000 | 320 | 15.7 |
> | new | 4000 | 320 | 15.8 |
> +-----+------------+-------------+---------------+
> | old | 4096 | 320 | 16.4 |
> | new | 4096 | 320 | 16.3 |
> +-----+------------+-------------+---------------+
> | new | 9000 | 320 | 6.3 |
> | new | 10240 | 9000 | 0.4 |
> +-----+------------+-------------+---------------+
>
> Signed-off-by: Kal Conley <[email protected]>
> ---
> include/net/xdp_sock.h | 1 +
> include/net/xdp_sock_drv.h | 6 +++++
> include/net/xsk_buff_pool.h | 4 +++-
> net/xdp/xdp_umem.c | 46 +++++++++++++++++++++++++++++--------
> net/xdp/xsk.c | 3 +++
> net/xdp/xsk_buff_pool.c | 16 +++++++++----
> 6 files changed, 61 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 3057e1a4a11c..e562ac3f5295 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -28,6 +28,7 @@ struct xdp_umem {
> struct user_struct *user;
> refcount_t users;
> u8 flags;
> + bool hugetlb;
> bool zc;
> struct page **pgs;
> int id;
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index 9c0d860609ba..eb630d17f994 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -12,6 +12,12 @@
> #define XDP_UMEM_MIN_CHUNK_SHIFT 11
> #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)
>
> +/* Allow chunk sizes up to the maximum size of an ethernet frame (64 KiB).
> + * Larger chunks are not guaranteed to fit in a single SKB.
> + */
> +#define XDP_UMEM_MAX_CHUNK_SHIFT 16
> +#define XDP_UMEM_MAX_CHUNK_SIZE (1 << XDP_UMEM_MAX_CHUNK_SHIFT)
> +
> #ifdef CONFIG_XDP_SOCKETS
>
> void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index 3e952e569418..69e3970da092 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -78,6 +78,7 @@ struct xsk_buff_pool {
> u8 cached_need_wakeup;
> bool uses_need_wakeup;
> bool dma_need_sync;
> + bool hugetlb;
> bool unaligned;
> void *addrs;
> /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
> @@ -175,7 +176,8 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
> static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
> u64 addr, u32 len)
> {
> - bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
> + bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
> + (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
>
> if (likely(!cross_pg))
> return false;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 02207e852d79..c96eefb9f5ae 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -10,6 +10,7 @@
> #include <linux/uaccess.h>
> #include <linux/slab.h>
> #include <linux/bpf.h>
> +#include <linux/hugetlb_inline.h>
> #include <linux/mm.h>
> #include <linux/netdevice.h>
> #include <linux/rtnetlink.h>
> @@ -19,6 +20,9 @@
> #include "xdp_umem.h"
> #include "xsk_queue.h"
>
> +_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
> +_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
> +
> static DEFINE_IDA(umem_ida);
>
> static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> @@ -91,7 +95,26 @@ void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
> }
> }
>
> -static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> +/* Returns true if the UMEM contains HugeTLB pages exclusively, false otherwise.
> + *
> + * The mmap_lock must be held by the caller.
> + */
> +static bool xdp_umem_is_hugetlb(struct xdp_umem *umem, unsigned long address)
> +{
> + unsigned long end = address + umem->npgs * PAGE_SIZE;
> + struct vm_area_struct *vma;
> + struct vma_iterator vmi;
> +
> + vma_iter_init(&vmi, current->mm, address);
> + for_each_vma_range(vmi, vma, end) {
> + if (!is_vm_hugetlb_page(vma))
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address, bool hugetlb)
> {
> unsigned int gup_flags = FOLL_WRITE;
> long npgs;
> @@ -102,8 +125,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> return -ENOMEM;
>
> mmap_read_lock(current->mm);
> +
> + umem->hugetlb = IS_ALIGNED(address, HPAGE_SIZE) && xdp_umem_is_hugetlb(umem, address);
> + if (hugetlb && !umem->hugetlb) {
> + mmap_read_unlock(current->mm);
> + err = -EINVAL;
> + goto out_pgs;
> + }
> +
> npgs = pin_user_pages(address, umem->npgs,
> gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
> +
> mmap_read_unlock(current->mm);
>
> if (npgs != umem->npgs) {
> @@ -152,20 +184,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> {
> bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
> u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
> + bool hugetlb = chunk_size > PAGE_SIZE;
> u64 addr = mr->addr, size = mr->len;
> u32 chunks_rem, npgs_rem;
> u64 chunks, npgs;
> int err;
>
> - if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
> - /* Strictly speaking we could support this, if:
> - * - huge pages, or*
> - * - using an IOMMU, or
> - * - making sure the memory area is consecutive
> - * but for now, we simply say "computer says no".
> - */
> + if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > XDP_UMEM_MAX_CHUNK_SIZE)
> return -EINVAL;
> - }
>
> if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> return -EINVAL;
> @@ -215,7 +241,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> if (err)
> return err;
>
> - err = xdp_umem_pin_pages(umem, (unsigned long)addr);
> + err = xdp_umem_pin_pages(umem, (unsigned long)addr, hugetlb);
> if (err)
> goto out_account;
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 2ac58b282b5e..3899a2d235bb 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -421,6 +421,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +/* Chunks must fit in the SKB `frags` array. */
> +_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
> +
> static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> struct xdp_desc *desc)
> {
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index b2df1e0f8153..777e8a38a232 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -80,6 +80,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> pool->headroom = umem->headroom;
> pool->chunk_size = umem->chunk_size;
> pool->chunk_shift = ffs(umem->chunk_size) - 1;
> + pool->hugetlb = umem->hugetlb;
> pool->unaligned = unaligned;
> pool->frame_len = umem->chunk_size - umem->headroom -
> XDP_PACKET_HEADROOM;
> @@ -369,16 +370,23 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
> }
> EXPORT_SYMBOL(xp_dma_unmap);
>
> -static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
> +/* HugeTLB pools consider contiguity at hugepage granularity only. Hence, all
> + * order-0 pages within a hugepage have the same contiguity value.
> + */
> +static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map, bool hugetlb)
> {
> + u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
> + u32 n = page_size >> PAGE_SHIFT;
> u32 i;
>
> - for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
> - if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
> + for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
> + if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
> dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
> else
> dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
> }
> + for (; i < dma_map->dma_pages_cnt; i++)
> + dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
> }
>
> static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
> @@ -441,7 +449,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
> }
>
> if (pool->unaligned)
> - xp_check_dma_contiguity(dma_map);
> + xp_check_dma_contiguity(dma_map, pool->hugetlb);
>
> err = xp_init_dma_info(pool, dma_map);
> if (err) {
> --
> 2.39.2
>

2023-03-21 19:36:06

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

On Tue, 21 Mar 2023 11:48:27 +0100 Maciej Fijalkowski wrote:
> tell mlx4 does not support xdp multi-buffer, so you were testing this

FWIW

ConnectX-3 [PRO] == mlx4
ConnectX-4 == mlx5

But agreed that more info would help. You'd need to have some TLB
pressure to see benefit of huge pages.

2023-03-29 15:37:15

by Kal Cutter Conley

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

> can you tell us a bit more about your environment setup - from what i can
> tell mlx4 does not support xdp multi-buffer, so you were testing this
> feature with XDP program attached in generic mode?

This card uses the mlx5 driver. These numbers were taken from a
slightly modified xdpsock example. XDP multi-buffer was not used.

>
> if you were using xdpsock -S or xdpsock -c? or maybe even something
> different?
>
> what was your MTU size on NIC?

I will update the table in the v2 patchset with more information.

Kal

2023-03-29 16:02:59

by Kal Cutter Conley

[permalink] [raw]
Subject: Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

> > Add core AF_XDP support for chunk sizes larger than PAGE_SIZE. This
> > enables sending/receiving jumbo ethernet frames up to the theoretical
> > maxiumum of 64 KiB. For chunk sizes > PAGE_SIZE, the UMEM is required
> > to consist of HugeTLB VMAs (and be hugepage aligned). Initially, only
> > XDP_COPY mode is usuable pending future driver work.
>
> nit: useable

Fixed in v2.

>
> > For consistency, check for HugeTLB pages during UMEM registration. This
> > implies that hugepages are required for XDP_COPY mode despite DMA not
> > being used. This restriction is desirable since it ensures user software
> > can take advantage of future driver support.
> >
> > Even in HugeTLB mode, continue to do page accounting using order-0
> > (4 KiB) pages. This minimizes the size of this change and reduces the
> > risk of impacting driver code. Taking full advantage of hugepages for
> > accounting should improve XDP performance in the general case.
>
> Thank you Kal for working on this. Interesting stuff.
>
> First some general comments and questions on the patch set:
>
> * Please document this new feature in Documentation/networking/af_xdp.rst

Fixed in v2.

> * Have you verified the SKB path for Rx? Tx was exercised by running l2fwd.

This patchset allows sending/receiving 9000 MTU packets with xdpsock
(slightly modified). The benchmark numbers show the results for rxdrop
(-r).

> * Have you checked that an XDP program can access the full >4K packet?
> The xdp_buff has no problem with this as the buffer is consecutive,
> but just wondering if there is some other check or limit in there?
> Jesper and Toke will likely know, so roping them in.

Yes, the full packet can be accessed from a SEC("xdp") BPF program
(only tested in SKB mode).

> * Would be interesting to know your thoughts about taking this to
> zero-copy mode too. It would be good if you could support all modes
> from the get go, instead of partial support for some unknown amount of
> time and then zero-copy support. Partial support makes using the
> feature more cumbersome when an app is deployed on various systems.
> The continuity checking code you have at the end of the patch is a
> step in the direction of zero-copy support, it seems.

I think this patchset is enough to support zero-copy as long as the
driver allows it. Currently, no drivers will work out of the box AFAIK
since they all validate the chunk_size or the MTU size. I would
absolutely love for drivers to support this. Hopefully this patchset
is enough inspiration? :-) Do you think it's absolutely necessary to
have driver ZC support ready to land this?

> * What happens if I try to run this in zero-copy mode with a chunk_size > 4K?

AFAIK drivers check for this and throw an error. Maybe there are some
drivers that don't check this properly?

> * There are some compilation errors to fix from the kernel test robot

Fixed in v2.

>
> require_hugetlb would be a clearer name

Fixed in v2. Renamed to `need_hugetlb`.

>
> next_mapping? n as a name is not very descriptive.

Fixed in v2. Renamed to `stride`.

>
> > u32 i;
> >
> > - for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
> > - if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
> > + for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
> > + if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
> > dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
> > else
> > dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
> > }
> > + for (; i < dma_map->dma_pages_cnt; i++)
> > + dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
>
> Is this not too conservative? If your umem consists of two huge pages
> mappings but they are not mapped consecutively in physical memory, you
> are going to mark all the chunks as non-consecutive. Would it not be
> better to just look chunk_size ahead of you instead of page_size
> above? The only thing you care about is that the chunk you are in is
> in consecutive physical memory, and that is strictly only true for
> zero-copy mode. So this seems to be in preparation for zero-copy mode.
>

It is slightly too conservative. I have updated the logic a bit in v2.
If the packet doesn't cross a page boundary, then this array is not
read anyway.

Thanks!
Kal