2019-05-23 18:23:30

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH net-next 0/3] net: ethernet: ti: cpsw: Add XDP support

This patchset adds XDP support to the TI cpsw driver and bases it on
the page_pool allocator. It was verified with af_xdp socket drop,
af_xdp l2f, eBPF XDP_DROP, XDP_REDIRECT, XDP_PASS and XDP_TX.

It was verified with the following configs enabled:
CONFIG_JIT=y
CONFIG_BPFILTER=y
CONFIG_BPF_SYSCALL=y
CONFIG_XDP_SOCKETS=y
CONFIG_BPF_EVENTS=y
CONFIG_HAVE_EBPF_JIT=y
CONFIG_BPF_JIT=y
CONFIG_CGROUP_BPF=y

Link to the previous RFC:
https://lkml.org/lkml/2019/4/17/861

Regular tests with iperf2 were also done in order to verify the impact
on regular netstack performance, compared with the base commit:
https://pastebin.com/JSMT0iZ4

Based on net-next/master

Ivan Khoronzhuk (3):
net: ethernet: ti: davinci_cpdma: add dma mapped submit
net: ethernet: ti: davinci_cpdma: return handler status
net: ethernet: ti: cpsw: add XDP support

drivers/net/ethernet/ti/Kconfig | 1 +
drivers/net/ethernet/ti/cpsw.c | 570 +++++++++++++++++++++---
drivers/net/ethernet/ti/cpsw_ethtool.c | 55 ++-
drivers/net/ethernet/ti/cpsw_priv.h | 9 +-
drivers/net/ethernet/ti/davinci_cpdma.c | 122 +++--
drivers/net/ethernet/ti/davinci_cpdma.h | 6 +-
drivers/net/ethernet/ti/davinci_emac.c | 18 +-
7 files changed, 675 insertions(+), 106 deletions(-)

--
2.17.1


2019-05-23 18:23:38

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH net-next 2/3] net: ethernet: ti: davinci_cpdma: return handler status

This change is needed to return the flush status of the rx handler, so
that redirected xdp frames can be flushed after processing channel
packets. It is done as a separate patch for simplicity.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/cpsw.c | 23 +++++++++++------
drivers/net/ethernet/ti/cpsw_ethtool.c | 2 +-
drivers/net/ethernet/ti/cpsw_priv.h | 2 +-
drivers/net/ethernet/ti/davinci_cpdma.c | 34 +++++++++++++++----------
drivers/net/ethernet/ti/davinci_cpdma.h | 4 +--
drivers/net/ethernet/ti/davinci_emac.c | 18 ++++++++-----
6 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 634fc484a0b3..87a600aeee4a 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -337,7 +337,7 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
return;
}

-void cpsw_tx_handler(void *token, int len, int status)
+int cpsw_tx_handler(void *token, int len, int status)
{
struct netdev_queue *txq;
struct sk_buff *skb = token;
@@ -355,6 +355,7 @@ void cpsw_tx_handler(void *token, int len, int status)
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
dev_kfree_skb_any(skb);
+ return 0;
}

static void cpsw_rx_vlan_encap(struct sk_buff *skb)
@@ -400,7 +401,7 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
}
}

-static void cpsw_rx_handler(void *token, int len, int status)
+static int cpsw_rx_handler(void *token, int len, int status)
{
struct cpdma_chan *ch;
struct sk_buff *skb = token;
@@ -434,7 +435,7 @@ static void cpsw_rx_handler(void *token, int len, int status)

/* the interface is going down, skbs are purged */
dev_kfree_skb_any(skb);
- return;
+ return 0;
}

new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
@@ -459,7 +460,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
requeue:
if (netif_dormant(ndev)) {
dev_kfree_skb_any(new_skb);
- return;
+ return 0;
}

ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
@@ -467,6 +468,8 @@ static void cpsw_rx_handler(void *token, int len, int status)
skb_tailroom(new_skb), 0);
if (WARN_ON(ret < 0))
dev_kfree_skb_any(new_skb);
+
+ return 0;
}

void cpsw_split_res(struct cpsw_common *cpsw)
@@ -605,7 +608,8 @@ static int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget)
else
cur_budget = txv->budget;

- num_tx += cpdma_chan_process(txv->ch, cur_budget);
+ cpdma_chan_process(txv->ch, &cur_budget);
+ num_tx += cur_budget;
if (num_tx >= budget)
break;
}
@@ -623,7 +627,8 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
struct cpsw_common *cpsw = napi_to_cpsw(napi_tx);
int num_tx;

- num_tx = cpdma_chan_process(cpsw->txv[0].ch, budget);
+ num_tx = budget;
+ cpdma_chan_process(cpsw->txv[0].ch, &num_tx);
if (num_tx < budget) {
napi_complete(napi_tx);
writel(0xff, &cpsw->wr_regs->tx_en);
@@ -655,7 +660,8 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
else
cur_budget = rxv->budget;

- num_rx += cpdma_chan_process(rxv->ch, cur_budget);
+ cpdma_chan_process(rxv->ch, &cur_budget);
+ num_rx += cur_budget;
if (num_rx >= budget)
break;
}
@@ -673,7 +679,8 @@ static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
int num_rx;

- num_rx = cpdma_chan_process(cpsw->rxv[0].ch, budget);
+ num_rx = budget;
+ cpdma_chan_process(cpsw->rxv[0].ch, &num_rx);
if (num_rx < budget) {
napi_complete_done(napi_rx, num_rx);
writel(0xff, &cpsw->wr_regs->rx_en);
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index a4a7ec0d2531..0c08ec91635a 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -535,8 +535,8 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
cpdma_handler_fn rx_handler)
{
struct cpsw_common *cpsw = priv->cpsw;
- void (*handler)(void *, int, int);
struct netdev_queue *queue;
+ cpdma_handler_fn handler;
struct cpsw_vector *vec;
int ret, *ch, vch;

diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 04795b97ee71..2ecb3af59fe9 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -390,7 +390,7 @@ void cpsw_split_res(struct cpsw_common *cpsw);
int cpsw_fill_rx_channels(struct cpsw_priv *priv);
void cpsw_intr_enable(struct cpsw_common *cpsw);
void cpsw_intr_disable(struct cpsw_common *cpsw);
-void cpsw_tx_handler(void *token, int len, int status);
+int cpsw_tx_handler(void *token, int len, int status);

/* ethtool */
u32 cpsw_get_msglevel(struct net_device *ndev);
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 7f89b2299f05..b3d4dfd760d2 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -1137,15 +1137,16 @@ bool cpdma_check_free_tx_desc(struct cpdma_chan *chan)
return free_tx_desc;
}

-static void __cpdma_chan_free(struct cpdma_chan *chan,
- struct cpdma_desc __iomem *desc,
- int outlen, int status)
+static int __cpdma_chan_free(struct cpdma_chan *chan,
+ struct cpdma_desc __iomem *desc, int outlen,
+ int status)
{
struct cpdma_ctlr *ctlr = chan->ctlr;
struct cpdma_desc_pool *pool = ctlr->pool;
dma_addr_t buff_dma;
int origlen;
uintptr_t token;
+ int ret;

token = desc_read(desc, sw_token);
origlen = desc_read(desc, sw_len);
@@ -1160,14 +1161,16 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
}

cpdma_desc_free(pool, desc, 1);
- (*chan->handler)((void *)token, outlen, status);
+ ret = (*chan->handler)((void *)token, outlen, status);
+
+ return ret;
}

static int __cpdma_chan_process(struct cpdma_chan *chan)
{
+ int status, outlen, ret;
struct cpdma_ctlr *ctlr = chan->ctlr;
struct cpdma_desc __iomem *desc;
- int status, outlen;
int cb_status = 0;
struct cpdma_desc_pool *pool = ctlr->pool;
dma_addr_t desc_dma;
@@ -1178,7 +1181,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
desc = chan->head;
if (!desc) {
chan->stats.empty_dequeue++;
- status = -ENOENT;
+ ret = -ENOENT;
goto unlock_ret;
}
desc_dma = desc_phys(pool, desc);
@@ -1187,7 +1190,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
outlen = status & 0x7ff;
if (status & CPDMA_DESC_OWNER) {
chan->stats.busy_dequeue++;
- status = -EBUSY;
+ ret = -EBUSY;
goto unlock_ret;
}

@@ -1213,28 +1216,31 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
else
cb_status = status;

- __cpdma_chan_free(chan, desc, outlen, cb_status);
- return status;
+ ret = __cpdma_chan_free(chan, desc, outlen, cb_status);
+ return ret;

unlock_ret:
spin_unlock_irqrestore(&chan->lock, flags);
- return status;
+ return ret;
}

-int cpdma_chan_process(struct cpdma_chan *chan, int quota)
+int cpdma_chan_process(struct cpdma_chan *chan, int *quota)
{
- int used = 0, ret = 0;
+ int used = 0, ret = 0, res = 0;

if (chan->state != CPDMA_STATE_ACTIVE)
return -EINVAL;

- while (used < quota) {
+ while (used < *quota) {
ret = __cpdma_chan_process(chan);
if (ret < 0)
break;
+ res += ret;
used++;
}
- return used;
+
+ *quota = used;
+ return res;
}

int cpdma_chan_start(struct cpdma_chan *chan)
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 8f6f27185c63..56543d375923 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -61,7 +61,7 @@ struct cpdma_chan_stats {
struct cpdma_ctlr;
struct cpdma_chan;

-typedef void (*cpdma_handler_fn)(void *token, int len, int status);
+typedef int (*cpdma_handler_fn)(void *token, int len, int status);

struct cpdma_ctlr *cpdma_ctlr_create(struct cpdma_params *params);
int cpdma_ctlr_destroy(struct cpdma_ctlr *ctlr);
@@ -81,7 +81,7 @@ int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token,
dma_addr_t data, int len, int directed);
int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
-int cpdma_chan_process(struct cpdma_chan *chan, int quota);
+int cpdma_chan_process(struct cpdma_chan *chan, int *quota);

int cpdma_ctlr_int_ctrl(struct cpdma_ctlr *ctlr, bool enable);
void cpdma_ctlr_eoi(struct cpdma_ctlr *ctlr, u32 value);
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index 4bf65cab79e6..3592690b8dd8 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -860,7 +860,7 @@ static struct sk_buff *emac_rx_alloc(struct emac_priv *priv)
return skb;
}

-static void emac_rx_handler(void *token, int len, int status)
+static int emac_rx_handler(void *token, int len, int status)
{
struct sk_buff *skb = token;
struct net_device *ndev = skb->dev;
@@ -871,7 +871,7 @@ static void emac_rx_handler(void *token, int len, int status)
/* free and bail if we are shutting down */
if (unlikely(!netif_running(ndev))) {
dev_kfree_skb_any(skb);
- return;
+ return 0;
}

/* recycle on receive error */
@@ -892,7 +892,7 @@ static void emac_rx_handler(void *token, int len, int status)
if (!skb) {
if (netif_msg_rx_err(priv) && net_ratelimit())
dev_err(emac_dev, "failed rx buffer alloc\n");
- return;
+ return 0;
}

recycle:
@@ -902,9 +902,11 @@ static void emac_rx_handler(void *token, int len, int status)
WARN_ON(ret == -ENOMEM);
if (unlikely(ret < 0))
dev_kfree_skb_any(skb);
+
+ return 0;
}

-static void emac_tx_handler(void *token, int len, int status)
+static int emac_tx_handler(void *token, int len, int status)
{
struct sk_buff *skb = token;
struct net_device *ndev = skb->dev;
@@ -917,6 +919,7 @@ static void emac_tx_handler(void *token, int len, int status)
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
dev_kfree_skb_any(skb);
+ return 0;
}

/**
@@ -1237,8 +1240,8 @@ static int emac_poll(struct napi_struct *napi, int budget)
mask = EMAC_DM646X_MAC_IN_VECTOR_TX_INT_VEC;

if (status & mask) {
- num_tx_pkts = cpdma_chan_process(priv->txchan,
- EMAC_DEF_TX_MAX_SERVICE);
+ num_tx_pkts = EMAC_DEF_TX_MAX_SERVICE;
+ cpdma_chan_process(priv->txchan, &num_tx_pkts);
} /* TX processing */

mask = EMAC_DM644X_MAC_IN_VECTOR_RX_INT_VEC;
@@ -1247,7 +1250,8 @@ static int emac_poll(struct napi_struct *napi, int budget)
mask = EMAC_DM646X_MAC_IN_VECTOR_RX_INT_VEC;

if (status & mask) {
- num_rx_pkts = cpdma_chan_process(priv->rxchan, budget);
+ num_rx_pkts = budget;
+ cpdma_chan_process(priv->rxchan, &num_rx_pkts);
} /* RX processing */

mask = EMAC_DM644X_MAC_IN_VECTOR_HOST_INT;
--
2.17.1

2019-05-23 18:23:46

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

Add XDP support based on the rx page_pool allocator, one frame per page.
The page pool allocator is used with the assumption that only one
rx_handler is running at a time. DMA map/unmap is reused from the page
pool even though there is no need to map the whole page.

Due to the specifics of cpsw, the same TX/RX handler can be used by 2
network devices, so special fields are added to the buffer to identify
the interface the frame is destined for. Thus XDP works for both
interfaces, which makes it easy to test xdp redirect between the two
interfaces.

The XDP prog is common for all channels until appropriate changes are
added to the XDP infrastructure.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/Kconfig | 1 +
drivers/net/ethernet/ti/cpsw.c | 555 ++++++++++++++++++++++---
drivers/net/ethernet/ti/cpsw_ethtool.c | 53 +++
drivers/net/ethernet/ti/cpsw_priv.h | 7 +
4 files changed, 554 insertions(+), 62 deletions(-)

diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index bd05a977ee7e..3cb8c5214835 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -50,6 +50,7 @@ config TI_CPSW
depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
select TI_DAVINCI_MDIO
select MFD_SYSCON
+ select PAGE_POOL
select REGMAP
---help---
This driver supports TI's CPSW Ethernet Switch.
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 87a600aeee4a..274e6b64ea9e 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -31,6 +31,10 @@
#include <linux/if_vlan.h>
#include <linux/kmemleak.h>
#include <linux/sys_soc.h>
+#include <net/page_pool.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/filter.h>

#include <linux/pinctrl/consumer.h>
#include <net/pkt_cls.h>
@@ -60,6 +64,10 @@ static int descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
module_param(descs_pool_size, int, 0444);
MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");

+/* The buf includes headroom compatible with both skb and xdpf */
+#define CPSW_HEADROOM_NA (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN)
+#define CPSW_HEADROOM ALIGN(CPSW_HEADROOM_NA, sizeof(long))
+
#define for_each_slave(priv, func, arg...) \
do { \
struct cpsw_slave *slave; \
@@ -74,6 +82,8 @@ MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
(func)(slave++, ##arg); \
} while (0)

+#define CPSW_XMETA_OFFSET ALIGN(sizeof(struct xdp_frame), sizeof(long))
+
static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
__be16 proto, u16 vid);

@@ -337,24 +347,58 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
return;
}

+static int cpsw_is_xdpf_handle(void *handle)
+{
+ return (unsigned long)handle & BIT(0);
+}
+
+static void *cpsw_xdpf_to_handle(struct xdp_frame *xdpf)
+{
+ return (void *)((unsigned long)xdpf | BIT(0));
+}
+
+static struct xdp_frame *cpsw_handle_to_xdpf(void *handle)
+{
+ return (struct xdp_frame *)((unsigned long)handle & ~BIT(0));
+}
+
+struct __aligned(sizeof(long)) cpsw_meta_xdp {
+ struct net_device *ndev;
+ int ch;
+};
+
int cpsw_tx_handler(void *token, int len, int status)
{
+ struct cpsw_meta_xdp *xmeta;
+ struct xdp_frame *xdpf;
+ struct net_device *ndev;
struct netdev_queue *txq;
- struct sk_buff *skb = token;
- struct net_device *ndev = skb->dev;
- struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+ struct sk_buff *skb;
+ int ch;
+
+ if (cpsw_is_xdpf_handle(token)) {
+ xdpf = cpsw_handle_to_xdpf(token);
+ xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
+ ndev = xmeta->ndev;
+ ch = xmeta->ch;
+ xdp_return_frame_rx_napi(xdpf);
+ } else {
+ skb = token;
+ ndev = skb->dev;
+ ch = skb_get_queue_mapping(skb);
+ cpts_tx_timestamp(ndev_to_cpsw(ndev)->cpts, skb);
+ dev_kfree_skb_any(skb);
+ }

/* Check whether the queue is stopped due to stalled tx dma, if the
* queue is stopped then start the queue as we have free desc for tx
*/
- txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
+ txq = netdev_get_tx_queue(ndev, ch);
if (unlikely(netif_tx_queue_stopped(txq)))
netif_tx_wake_queue(txq);

- cpts_tx_timestamp(cpsw->cpts, skb);
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
- dev_kfree_skb_any(skb);
return 0;
}

@@ -401,22 +445,229 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
}
}

+static int cpsw_xdp_tx_frame_mapped(struct cpsw_priv *priv,
+ struct xdp_frame *xdpf, struct page *page)
+{
+ struct cpsw_common *cpsw = priv->cpsw;
+ struct cpsw_meta_xdp *xmeta;
+ struct netdev_queue *txq;
+ struct cpdma_chan *txch;
+ dma_addr_t dma;
+ int ret;
+
+ xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
+ xmeta->ch = 0;
+
+ txch = cpsw->txv[0].ch;
+ dma = (xdpf->data - (void *)xdpf) + page->dma_addr;
+ ret = cpdma_chan_submit_mapped(txch, cpsw_xdpf_to_handle(xdpf), dma,
+ xdpf->len,
+ priv->emac_port + cpsw->data.dual_emac);
+ if (ret) {
+ xdp_return_frame_rx_napi(xdpf);
+ goto stop;
+ }
+
+ /* no tx desc - stop sending us tx frames */
+ if (unlikely(!cpdma_check_free_tx_desc(txch)))
+ goto stop;
+
+ return ret;
+stop:
+ txq = netdev_get_tx_queue(priv->ndev, 0);
+ netif_tx_stop_queue(txq);
+
+ /* Barrier, so that stop_queue visible to other cpus */
+ smp_mb__after_atomic();
+
+ if (cpdma_check_free_tx_desc(txch))
+ netif_tx_wake_queue(txq);
+
+ return ret;
+}
+
+static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *xdpf)
+{
+ struct cpsw_common *cpsw = priv->cpsw;
+ struct cpsw_meta_xdp *xmeta;
+ struct netdev_queue *txq;
+ struct cpdma_chan *txch;
+ int ret;
+
+ xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
+ if (sizeof(*xmeta) > xdpf->headroom)
+ return -EINVAL;
+
+ xmeta->ndev = priv->ndev;
+ xmeta->ch = 0;
+
+ txch = cpsw->txv[0].ch;
+ ret = cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
+ xdpf->len,
+ priv->emac_port + cpsw->data.dual_emac);
+ if (ret) {
+ xdp_return_frame_rx_napi(xdpf);
+ goto stop;
+ }
+
+ /* no tx desc - stop sending us tx frames */
+ if (unlikely(!cpdma_check_free_tx_desc(txch)))
+ goto stop;
+
+ return ret;
+stop:
+ txq = netdev_get_tx_queue(priv->ndev, 0);
+ netif_tx_stop_queue(txq);
+
+ /* Barrier, so that stop_queue visible to other cpus */
+ smp_mb__after_atomic();
+
+ if (cpdma_check_free_tx_desc(txch))
+ netif_tx_wake_queue(txq);
+
+ return ret;
+}
+
+static int cpsw_run_xdp(struct cpsw_priv *priv, struct cpsw_vector *rxv,
+ struct xdp_buff *xdp, struct page *page)
+{
+ struct net_device *ndev = priv->ndev;
+ struct xdp_frame *xdpf;
+ struct bpf_prog *prog;
+ int ret = 1;
+ u32 act;
+
+ rcu_read_lock();
+
+ prog = READ_ONCE(priv->xdp_prog);
+ if (!prog) {
+ ret = 0;
+ goto out;
+ }
+
+ act = bpf_prog_run_xdp(prog, xdp);
+ switch (act) {
+ case XDP_PASS:
+ ret = 0;
+ break;
+ case XDP_TX:
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ xdp_return_buff(xdp);
+ else
+ cpsw_xdp_tx_frame_mapped(priv, xdpf, page);
+ break;
+ case XDP_REDIRECT:
+ if (xdp_do_redirect(ndev, xdp, prog))
+ xdp_return_buff(xdp);
+ else
+ ret = 2;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(ndev, prog, act);
+ /* fall through -- handle aborts by dropping packet */
+ case XDP_DROP:
+ xdp_return_buff(xdp);
+ break;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static unsigned int cpsw_rxbuf_total_len(unsigned int len)
+{
+ len += CPSW_HEADROOM;
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ return SKB_DATA_ALIGN(len);
+}
+
+struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw)
+{
+ struct page_pool_params pp_params = { 0 };
+ struct page_pool *pool;
+
+ pp_params.order = 0;
+ pp_params.flags = PP_FLAG_DMA_MAP;
+
+ /* set it to number of RX descriptors, but can be more */
+ pp_params.pool_size = cpdma_get_num_rx_descs(cpsw->dma);
+ pp_params.nid = NUMA_NO_NODE;
+ pp_params.dma_dir = DMA_BIDIRECTIONAL;
+ pp_params.dev = cpsw->dev;
+
+ pool = page_pool_create(&pp_params);
+ if (IS_ERR(pool))
+ dev_err(cpsw->dev, "cannot create rx page pool\n");
+
+ return pool;
+}
+
+static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
+{
+ struct page_pool *pool = cpsw->rx_page_pool;
+ struct page *page, *prev_page = NULL;
+ int try = pool->p.pool_size << 2;
+ int start_free = 0, ret;
+
+ do {
+ page = page_pool_dev_alloc_pages(pool);
+ if (!page)
+ return NULL;
+
+ /* if netstack has page_pool recycling remove the rest */
+ if (page_ref_count(page) == 1)
+ break;
+
+ /* start free pages in use, shouldn't happen */
+ if (prev_page == page || start_free) {
+ /* dma unmap/puts page if rfcnt != 1 */
+ page_pool_recycle_direct(pool, page);
+ start_free = 1;
+ continue;
+ }
+
+ /* if refcnt > 1, page is still held by netstack, it's a pity,
+ * so put it to the ring to be consumed later when fast cache is
+ * empty. If ring is full then free page by recycling as above.
+ */
+ ret = ptr_ring_produce(&pool->ring, page);
+ if (ret) {
+ page_pool_recycle_direct(pool, page);
+ continue;
+ }
+
+ if (!prev_page)
+ prev_page = page;
+ } while (try--);
+
+ return page;
+}
+
static int cpsw_rx_handler(void *token, int len, int status)
{
- struct cpdma_chan *ch;
- struct sk_buff *skb = token;
- struct sk_buff *new_skb;
- struct net_device *ndev = skb->dev;
- int ret = 0, port;
- struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+ struct page *new_page, *page = token;
+ void *pa = page_address(page);
+ struct cpsw_meta_xdp *xmeta = pa + CPSW_XMETA_OFFSET;
+ struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
+ int pkt_size = cpsw->rx_packet_max;
+ int ret = 0, port, ch = xmeta->ch;
+ struct page_pool *pool = cpsw->rx_page_pool;
+ int headroom = CPSW_HEADROOM;
+ struct net_device *ndev = xmeta->ndev;
+ int flush = 0;
struct cpsw_priv *priv;
+ struct sk_buff *skb;
+ struct xdp_buff xdp;
+ dma_addr_t dma;

if (cpsw->data.dual_emac) {
port = CPDMA_RX_SOURCE_PORT(status);
- if (port) {
+ if (port)
ndev = cpsw->slaves[--port].ndev;
- skb->dev = ndev;
- }
}

if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
@@ -429,47 +680,101 @@ static int cpsw_rx_handler(void *token, int len, int status)
* in reducing of the number of rx descriptor in
* DMA engine, requeue skb back to cpdma.
*/
- new_skb = skb;
+ new_page = page;
goto requeue;
}

/* the interface is going down, skbs are purged */
- dev_kfree_skb_any(skb);
+ page_pool_recycle_direct(pool, page);
return 0;
}

- new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
- if (new_skb) {
- skb_copy_queue_mapping(new_skb, skb);
- skb_put(skb, len);
- if (status & CPDMA_RX_VLAN_ENCAP)
- cpsw_rx_vlan_encap(skb);
- priv = netdev_priv(ndev);
- if (priv->rx_ts_enabled)
- cpts_rx_timestamp(cpsw->cpts, skb);
- skb->protocol = eth_type_trans(skb, ndev);
- netif_receive_skb(skb);
- ndev->stats.rx_bytes += len;
- ndev->stats.rx_packets++;
- kmemleak_not_leak(new_skb);
- } else {
+ new_page = cpsw_alloc_page(cpsw);
+ if (unlikely(!new_page)) {
+ new_page = page;
ndev->stats.rx_dropped++;
- new_skb = skb;
+ goto requeue;
}

+ priv = netdev_priv(ndev);
+ if (priv->xdp_prog) {
+ if (status & CPDMA_RX_VLAN_ENCAP) {
+ xdp.data = (void *)pa + CPSW_HEADROOM +
+ CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+ xdp.data_end = xdp.data + len -
+ CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+ } else {
+ xdp.data = (void *)pa + CPSW_HEADROOM;
+ xdp.data_end = xdp.data + len;
+ }
+
+ xdp_set_data_meta_invalid(&xdp);
+
+ xdp.data_hard_start = pa;
+ xdp.rxq = &priv->xdp_rxq[ch];
+
+ ret = cpsw_run_xdp(priv, &cpsw->rxv[ch], &xdp, page);
+ if (ret) {
+ if (ret == 2)
+ flush = 1;
+
+ goto requeue;
+ }
+
+ /* XDP prog might have changed packet data and boundaries */
+ len = xdp.data_end - xdp.data;
+ headroom = xdp.data - xdp.data_hard_start;
+ }
+
+ /* Build skb and pass it to netstack if XDP off or XDP prog
+ * returned XDP_PASS
+ */
+ skb = build_skb(pa, cpsw_rxbuf_total_len(pkt_size));
+ if (!skb) {
+ ndev->stats.rx_dropped++;
+ page_pool_recycle_direct(pool, page);
+ goto requeue;
+ }
+
+ skb_reserve(skb, headroom);
+ skb_put(skb, len);
+ skb->dev = ndev;
+ if (status & CPDMA_RX_VLAN_ENCAP)
+ cpsw_rx_vlan_encap(skb);
+ if (priv->rx_ts_enabled)
+ cpts_rx_timestamp(cpsw->cpts, skb);
+ skb->protocol = eth_type_trans(skb, ndev);
+
+ /* recycle page before increasing refcounter, it allows to hold page in
+ * page pool cache improving allocation time, see cpsw_alloc_page().
+ */
+ page_pool_recycle_direct(pool, page);
+
+ /* remove once ordinary netstack has page_pool recycling */
+ page_ref_inc(page);
+
+ netif_receive_skb(skb);
+
+ ndev->stats.rx_bytes += len;
+ ndev->stats.rx_packets++;
+
requeue:
if (netif_dormant(ndev)) {
- dev_kfree_skb_any(new_skb);
- return 0;
+ page_pool_recycle_direct(pool, new_page);
+ return flush;
}

- ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
- ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
- skb_tailroom(new_skb), 0);
+ xmeta = page_address(new_page) + CPSW_XMETA_OFFSET;
+ xmeta->ndev = ndev;
+ xmeta->ch = ch;
+
+ dma = new_page->dma_addr + CPSW_HEADROOM;
+ ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
+ pkt_size, 0);
if (WARN_ON(ret < 0))
- dev_kfree_skb_any(new_skb);
+ page_pool_recycle_direct(pool, new_page);

- return 0;
+ return flush;
}

void cpsw_split_res(struct cpsw_common *cpsw)
@@ -644,7 +949,7 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
{
u32 ch_map;
- int num_rx, cur_budget, ch;
+ int num_rx, cur_budget, ch, flush;
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
struct cpsw_vector *rxv;

@@ -660,8 +965,12 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
else
cur_budget = rxv->budget;

- cpdma_chan_process(rxv->ch, &cur_budget);
+ flush = cpdma_chan_process(rxv->ch, &cur_budget);
num_rx += cur_budget;
+
+ if (flush)
+ xdp_do_flush_map();
+
if (num_rx >= budget)
break;
}
@@ -677,10 +986,15 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
{
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
- int num_rx;
+ struct cpsw_vector *rxv;
+ int num_rx, flush;

num_rx = budget;
- cpdma_chan_process(cpsw->rxv[0].ch, &num_rx);
+ rxv = &cpsw->rxv[0];
+ flush = cpdma_chan_process(rxv->ch, &num_rx);
+ if (flush)
+ xdp_do_flush_map();
+
if (num_rx < budget) {
napi_complete_done(napi_rx, num_rx);
writel(0xff, &cpsw->wr_regs->rx_en);
@@ -1042,33 +1356,38 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
int cpsw_fill_rx_channels(struct cpsw_priv *priv)
{
struct cpsw_common *cpsw = priv->cpsw;
- struct sk_buff *skb;
+ struct cpsw_meta_xdp *xmeta;
+ struct page_pool *pool;
+ struct page *page;
int ch_buf_num;
int ch, i, ret;
+ dma_addr_t dma;

+ pool = cpsw->rx_page_pool;
for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
ch_buf_num = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
for (i = 0; i < ch_buf_num; i++) {
- skb = __netdev_alloc_skb_ip_align(priv->ndev,
- cpsw->rx_packet_max,
- GFP_KERNEL);
- if (!skb) {
- cpsw_err(priv, ifup, "cannot allocate skb\n");
+ page = cpsw_alloc_page(cpsw);
+ if (!page) {
+ cpsw_err(priv, ifup, "allocate rx page err\n");
return -ENOMEM;
}

- skb_set_queue_mapping(skb, ch);
- ret = cpdma_chan_submit(cpsw->rxv[ch].ch, skb,
- skb->data, skb_tailroom(skb),
- 0);
+ xmeta = page_address(page) + CPSW_XMETA_OFFSET;
+ xmeta->ndev = priv->ndev;
+ xmeta->ch = ch;
+
+ dma = page->dma_addr + CPSW_HEADROOM;
+ ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, page,
+ dma, cpsw->rx_packet_max,
+ 0);
if (ret < 0) {
cpsw_err(priv, ifup,
"cannot submit skb to channel %d rx, error %d\n",
ch, ret);
- kfree_skb(skb);
+ page_pool_recycle_direct(pool, page);
return ret;
}
- kmemleak_not_leak(skb);
}

cpsw_info(priv, ifup, "ch %d rx, submitted %d descriptors\n",
@@ -2011,6 +2330,64 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
}
}

+static int cpsw_xdp_prog_setup(struct cpsw_priv *priv, struct netdev_bpf *bpf)
+{
+ struct bpf_prog *prog = bpf->prog;
+
+ if (!priv->xdpi.prog && !prog)
+ return 0;
+
+ if (!xdp_attachment_flags_ok(&priv->xdpi, bpf))
+ return -EBUSY;
+
+ WRITE_ONCE(priv->xdp_prog, prog);
+
+ xdp_attachment_setup(&priv->xdpi, bpf);
+
+ return 0;
+}
+
+static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
+{
+ struct cpsw_priv *priv = netdev_priv(ndev);
+
+ switch (bpf->command) {
+ case XDP_SETUP_PROG:
+ return cpsw_xdp_prog_setup(priv, bpf);
+
+ case XDP_QUERY_PROG:
+ return xdp_attachment_query(&priv->xdpi, bpf);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
+ struct xdp_frame **frames, u32 flags)
+{
+ struct cpsw_priv *priv = netdev_priv(ndev);
+ struct xdp_frame *xdpf;
+ int i, drops = 0;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ xdpf = frames[i];
+ if (xdpf->len < CPSW_MIN_PACKET_SIZE) {
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ continue;
+ }
+
+ if (cpsw_xdp_tx_frame(priv, xdpf))
+ drops++;
+ }
+
+ return n - drops;
+}
+
#ifdef CONFIG_NET_POLL_CONTROLLER
static void cpsw_ndo_poll_controller(struct net_device *ndev)
{
@@ -2039,6 +2416,8 @@ static const struct net_device_ops cpsw_netdev_ops = {
.ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid,
.ndo_setup_tc = cpsw_ndo_setup_tc,
+ .ndo_bpf = cpsw_ndo_bpf,
+ .ndo_xdp_xmit = cpsw_ndo_xdp_xmit,
};

static void cpsw_get_drvinfo(struct net_device *ndev,
@@ -2335,11 +2714,24 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
ndev->netdev_ops = &cpsw_netdev_ops;
ndev->ethtool_ops = &cpsw_ethtool_ops;

+ ret = xdp_rxq_info_reg(priv_sl2->xdp_rxq, ndev, 0);
+ if (ret)
+ return ret;
+
+ ret = xdp_rxq_info_reg_mem_model(priv_sl2->xdp_rxq, MEM_TYPE_PAGE_POOL,
+ cpsw->rx_page_pool);
+ if (ret) {
+ xdp_rxq_info_unreg(priv_sl2->xdp_rxq);
+ return ret;
+ }
+
/* register the network device */
SET_NETDEV_DEV(ndev, cpsw->dev);
ret = register_netdev(ndev);
- if (ret)
+ if (ret) {
dev_err(cpsw->dev, "cpsw: error registering net device\n");
+ xdp_rxq_info_unreg(priv_sl2->xdp_rxq);
+ }

return ret;
}
@@ -2457,19 +2849,25 @@ static int cpsw_probe(struct platform_device *pdev)
if (ret)
goto clean_dt_ret;

+ cpsw->rx_page_pool = cpsw_create_rx_pool(cpsw);
+ if (IS_ERR(cpsw->rx_page_pool)) {
+ ret = PTR_ERR(cpsw->rx_page_pool);
+ goto clean_cpts;
+ }
+
ch = cpsw->quirk_irq ? 0 : 7;
cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
if (IS_ERR(cpsw->txv[0].ch)) {
dev_err(dev, "error initializing tx dma channel\n");
ret = PTR_ERR(cpsw->txv[0].ch);
- goto clean_cpts;
+ goto clean_pool;
}

cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
if (IS_ERR(cpsw->rxv[0].ch)) {
dev_err(dev, "error initializing rx dma channel\n");
ret = PTR_ERR(cpsw->rxv[0].ch);
- goto clean_cpts;
+ goto clean_pool;
}
cpsw_split_res(cpsw);

@@ -2478,7 +2876,7 @@ static int cpsw_probe(struct platform_device *pdev)
CPSW_MAX_QUEUES, CPSW_MAX_QUEUES);
if (!ndev) {
dev_err(dev, "error allocating net_device\n");
- goto clean_cpts;
+ goto clean_pool;
}

platform_set_drvdata(pdev, ndev);
@@ -2499,6 +2897,15 @@ static int cpsw_probe(struct platform_device *pdev)

memcpy(ndev->dev_addr, priv->mac_addr, ETH_ALEN);

+ ret = xdp_rxq_info_reg(priv->xdp_rxq, ndev, 0);
+ if (ret)
+ goto clean_pool;
+
+ ret = xdp_rxq_info_reg_mem_model(priv->xdp_rxq, MEM_TYPE_PAGE_POOL,
+ cpsw->rx_page_pool);
+ if (ret)
+ goto clean_rxq_info;
+
cpsw->slaves[0].ndev = ndev;

ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_RX;
@@ -2518,7 +2925,7 @@ static int cpsw_probe(struct platform_device *pdev)
if (ret) {
dev_err(dev, "error registering net device\n");
ret = -ENODEV;
- goto clean_cpts;
+ goto clean_rxq_info;
}

if (cpsw->data.dual_emac) {
@@ -2561,6 +2968,10 @@ static int cpsw_probe(struct platform_device *pdev)

clean_unregister_netdev_ret:
unregister_netdev(ndev);
+clean_rxq_info:
+ xdp_rxq_info_unreg(priv->xdp_rxq);
+clean_pool:
+ page_pool_destroy(cpsw->rx_page_pool);
clean_cpts:
cpts_release(cpsw->cpts);
cpdma_ctlr_destroy(cpsw->dma);
@@ -2572,11 +2983,26 @@ static int cpsw_probe(struct platform_device *pdev)
return ret;
}

+void cpsw_xdp_rxq_unreg(struct cpsw_common *cpsw, int ch)
+{
+ struct cpsw_slave *slave;
+ struct cpsw_priv *priv;
+ int i;
+
+ for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
+ if (!slave->ndev)
+ continue;
+
+ priv = netdev_priv(slave->ndev);
+ xdp_rxq_info_unreg(&priv->xdp_rxq[ch]);
+ }
+}
+
static int cpsw_remove(struct platform_device *pdev)
{
struct net_device *ndev = platform_get_drvdata(pdev);
struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
- int ret;
+ int i, ret;

ret = pm_runtime_get_sync(&pdev->dev);
if (ret < 0) {
@@ -2590,6 +3016,11 @@ static int cpsw_remove(struct platform_device *pdev)

cpts_release(cpsw->cpts);
cpdma_ctlr_destroy(cpsw->dma);
+
+ for (i = 0; i < cpsw->rx_ch_num; i++)
+ cpsw_xdp_rxq_unreg(cpsw, i);
+
+ page_pool_destroy(cpsw->rx_page_pool);
cpsw_remove_dt(pdev);
pm_runtime_put_sync(&pdev->dev);
pm_runtime_disable(&pdev->dev);
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index 0c08ec91635a..dbe4bc5513c6 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -14,6 +14,7 @@
#include <linux/phy.h>
#include <linux/pm_runtime.h>
#include <linux/skbuff.h>
+#include <net/page_pool.h>

#include "cpsw.h"
#include "cpts.h"
@@ -531,6 +532,42 @@ static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
return 0;
}

+static int cpsw_xdp_rxq_reg(struct cpsw_common *cpsw, int ch)
+{
+ struct cpsw_slave *slave;
+ struct cpsw_priv *priv;
+ int i, ret;
+
+ /* Register xdp_rxq info for rx channel ch on every slave netdev. The
+ * channels (and their queues) are shared by both ports, so xdp_rxq
+ * info is used by every packet on the channel. But each xdp_rxq holds
+ * a link to its netdev, which in theory can have a different memory
+ * model, so each netdev must hold its own set and both are prepared.
+ */
+ for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
+ if (!slave->ndev)
+ continue;
+
+ priv = netdev_priv(slave->ndev);
+
+ ret = xdp_rxq_info_reg(&priv->xdp_rxq[ch], priv->ndev, ch);
+ if (ret)
+ goto err;
+
+ ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[ch],
+ MEM_TYPE_PAGE_POOL,
+ cpsw->rx_page_pool);
+ if (ret)
+ goto err;
+ }
+
+ return 0; /* not ret: it is never set if every slave was skipped */
+
+err:
+ cpsw_xdp_rxq_unreg(cpsw, ch);
+ return ret;
+}
+
static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
cpdma_handler_fn rx_handler)
{
@@ -562,6 +599,11 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
if (!vec[*ch].ch)
return -EINVAL;

+ if (rx && cpsw_xdp_rxq_reg(cpsw, *ch)) {
+ cpdma_chan_destroy(vec[*ch].ch);
+ return -EINVAL;
+ }
+
cpsw_info(priv, ifup, "created new %d %s channel\n", *ch,
(rx ? "rx" : "tx"));
(*ch)++;
@@ -570,6 +612,9 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
while (*ch > ch_num) {
(*ch)--;

+ if (rx)
+ cpsw_xdp_rxq_unreg(cpsw, *ch);
+
ret = cpdma_chan_destroy(vec[*ch].ch);
if (ret)
return ret;
@@ -654,6 +699,7 @@ int cpsw_set_ringparam(struct net_device *ndev,
{
struct cpsw_priv *priv = netdev_priv(ndev);
struct cpsw_common *cpsw = priv->cpsw;
+ struct page_pool *pool;
int ret;

/* ignore ering->tx_pending - only rx_pending adjustment is supported */
@@ -666,6 +712,10 @@ int cpsw_set_ringparam(struct net_device *ndev,
if (ering->rx_pending == cpdma_get_num_rx_descs(cpsw->dma))
return 0;

+ pool = cpsw_create_rx_pool(cpsw);
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+
cpsw_suspend_data_pass(ndev);

cpdma_set_num_rx_descs(cpsw->dma, ering->rx_pending);
@@ -673,6 +723,9 @@ int cpsw_set_ringparam(struct net_device *ndev,
if (cpsw->usage_count)
cpdma_chan_split_pool(cpsw->dma);

+ page_pool_destroy(cpsw->rx_page_pool);
+ cpsw->rx_page_pool = pool;
+
ret = cpsw_resume_data_pass(ndev);
if (!ret)
return 0;
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 2ecb3af59fe9..884ce6343a7d 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -346,6 +346,7 @@ struct cpsw_common {
int rx_ch_num, tx_ch_num;
int speed;
int usage_count;
+ struct page_pool *rx_page_pool;
};

struct cpsw_priv {
@@ -360,6 +361,10 @@ struct cpsw_priv {
int shp_cfg_speed;
int tx_ts_enabled;
int rx_ts_enabled;
+ struct bpf_prog *xdp_prog;
+ struct xdp_rxq_info xdp_rxq[CPSW_MAX_QUEUES];
+ struct xdp_attachment_info xdpi;
+
u32 emac_port;
struct cpsw_common *cpsw;
};
@@ -391,6 +396,8 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv);
void cpsw_intr_enable(struct cpsw_common *cpsw);
void cpsw_intr_disable(struct cpsw_common *cpsw);
int cpsw_tx_handler(void *token, int len, int status);
+void cpsw_xdp_rxq_unreg(struct cpsw_common *cpsw, int ch);
+struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw);

/* ethtool */
u32 cpsw_get_msglevel(struct net_device *ndev);
--
2.17.1

2019-05-23 18:25:54

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH net-next 1/3] net: ethernet: ti: davinci_cpdma: add dma mapped submit

If an already DMA-mapped packet needs to be sent, as with the XDP
page pool, the "mapped" submit can be used. This patch adds a DMA
mapped submit based on the regular one.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/davinci_cpdma.c | 88 ++++++++++++++++++++-----
drivers/net/ethernet/ti/davinci_cpdma.h | 2 +
2 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 35bf14d8e7af..7f89b2299f05 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -125,6 +125,15 @@ struct cpdma_chan {
u32 rate;
};

+struct submit_info {
+ struct cpdma_chan *chan;
+ int directed;
+ void *token;
+ void *data;
+ int flags;
+ int len;
+};
+
struct cpdma_control_info {
u32 reg;
u32 shift, mask;
@@ -176,6 +185,8 @@ static struct cpdma_control_info controls[] = {
(directed << CPDMA_TO_PORT_SHIFT)); \
} while (0)

+#define CPDMA_DMA_EXT_MAP BIT(16)
+
static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
{
struct cpdma_desc_pool *pool = ctlr->pool;
@@ -1002,10 +1013,12 @@ static void __cpdma_chan_submit(struct cpdma_chan *chan,
}
}

-int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
- int len, int directed)
+static int cpdma_chan_submit_si(struct submit_info *si)
{
+ struct cpdma_chan *chan = si->chan;
struct cpdma_ctlr *ctlr = chan->ctlr;
+ int len = si->len;
+ int swlen = len;
struct cpdma_desc __iomem *desc;
dma_addr_t buffer;
unsigned long flags;
@@ -1037,16 +1050,22 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
chan->stats.runt_transmit_buff++;
}

- buffer = dma_map_single(ctlr->dev, data, len, chan->dir);
- ret = dma_mapping_error(ctlr->dev, buffer);
- if (ret) {
- cpdma_desc_free(ctlr->pool, desc, 1);
- ret = -EINVAL;
- goto unlock_ret;
- }
-
mode = CPDMA_DESC_OWNER | CPDMA_DESC_SOP | CPDMA_DESC_EOP;
- cpdma_desc_to_port(chan, mode, directed);
+ cpdma_desc_to_port(chan, mode, si->directed);
+
+ if (si->flags & CPDMA_DMA_EXT_MAP) {
+ buffer = (dma_addr_t)si->data;
+ dma_sync_single_for_device(ctlr->dev, buffer, len, chan->dir);
+ swlen |= CPDMA_DMA_EXT_MAP;
+ } else {
+ buffer = dma_map_single(ctlr->dev, si->data, len, chan->dir);
+ ret = dma_mapping_error(ctlr->dev, buffer);
+ if (ret) {
+ cpdma_desc_free(ctlr->pool, desc, 1);
+ ret = -EINVAL;
+ goto unlock_ret;
+ }
+ }

/* Relaxed IO accessors can be used here as there is read barrier
* at the end of write sequence.
@@ -1055,9 +1074,9 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
writel_relaxed(buffer, &desc->hw_buffer);
writel_relaxed(len, &desc->hw_len);
writel_relaxed(mode | len, &desc->hw_mode);
- writel_relaxed((uintptr_t)token, &desc->sw_token);
+ writel_relaxed((uintptr_t)si->token, &desc->sw_token);
writel_relaxed(buffer, &desc->sw_buffer);
- writel_relaxed(len, &desc->sw_len);
+ writel_relaxed(swlen, &desc->sw_len);
desc_read(desc, sw_len);

__cpdma_chan_submit(chan, desc);
@@ -1072,6 +1091,38 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
return ret;
}

+int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data, int len,
+ int directed)
+{
+ struct submit_info si;
+
+ si.chan = chan;
+ si.token = token;
+ si.data = data;
+ si.len = len;
+ si.directed = directed;
+ si.flags = 0;
+
+ return cpdma_chan_submit_si(&si);
+}
+EXPORT_SYMBOL_GPL(cpdma_chan_submit);
+
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token,
+ dma_addr_t data, int len, int directed)
+{
+ struct submit_info si;
+
+ si.chan = chan;
+ si.token = token;
+ si.data = (void *)data;
+ si.len = len;
+ si.directed = directed;
+ si.flags = CPDMA_DMA_EXT_MAP;
+
+ return cpdma_chan_submit_si(&si);
+}
+EXPORT_SYMBOL_GPL(cpdma_chan_submit_mapped);
+
bool cpdma_check_free_tx_desc(struct cpdma_chan *chan)
{
struct cpdma_ctlr *ctlr = chan->ctlr;
@@ -1097,10 +1148,17 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
uintptr_t token;

token = desc_read(desc, sw_token);
- buff_dma = desc_read(desc, sw_buffer);
origlen = desc_read(desc, sw_len);

- dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+ buff_dma = desc_read(desc, sw_buffer);
+ if (origlen & CPDMA_DMA_EXT_MAP) {
+ origlen &= ~CPDMA_DMA_EXT_MAP;
+ dma_sync_single_for_cpu(ctlr->dev, buff_dma, origlen,
+ chan->dir);
+ } else {
+ dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+ }
+
cpdma_desc_free(pool, desc, 1);
(*chan->handler)((void *)token, outlen, status);
}
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 10376062dafa..8f6f27185c63 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -77,6 +77,8 @@ int cpdma_chan_stop(struct cpdma_chan *chan);

int cpdma_chan_get_stats(struct cpdma_chan *chan,
struct cpdma_chan_stats *stats);
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token,
+ dma_addr_t data, int len, int directed);
int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
int cpdma_chan_process(struct cpdma_chan *chan, int quota);
--
2.17.1

2019-05-24 09:43:50

by Ilias Apalodimas

[permalink] [raw]
Subject: Re: [PATCH net-next 0/3] net: ethernet: ti: cpsw: Add XDP support

Hi Ivan,

More XDP drivers, that's good!
> This patchset add XDP support for TI cpsw driver and base it on
> page_pool allocator. It was verified on af_xdp socket drop,
> af_xdp l2f, ebpf XDP_DROP, XDP_REDIRECT, XDP_PASS, XDP_TX.
>
> It was verified with following configs enabled:
> CONFIG_JIT=y
> CONFIG_BPFILTER=y
> CONFIG_BPF_SYSCALL=y
> CONFIG_XDP_SOCKETS=y
> CONFIG_BPF_EVENTS=y
> CONFIG_HAVE_EBPF_JIT=y
> CONFIG_BPF_JIT=y
> CONFIG_CGROUP_BPF=y
>
> Link on previous RFC:
> https://lkml.org/lkml/2019/4/17/861
>
The recycling pattern has changed; I'll have a closer look over the weekend and
let you know.
> Also regular tests with iperf2 were done in order to verify impact on
> regular netstack performance, compared with base commit:
> https://pastebin.com/JSMT0iZ4
Do you have any XDP related numbers?
>
> Based on net-next/master
>
> Ivan Khoronzhuk (3):
> net: ethernet: ti: davinci_cpdma: add dma mapped submit
> net: ethernet: ti: davinci_cpdma: return handler status
> net: ethernet: ti: cpsw: add XDP support
>
> drivers/net/ethernet/ti/Kconfig | 1 +
> drivers/net/ethernet/ti/cpsw.c | 570 +++++++++++++++++++++---
> drivers/net/ethernet/ti/cpsw_ethtool.c | 55 ++-
> drivers/net/ethernet/ti/cpsw_priv.h | 9 +-
> drivers/net/ethernet/ti/davinci_cpdma.c | 122 +++--
> drivers/net/ethernet/ti/davinci_cpdma.h | 6 +-
> drivers/net/ethernet/ti/davinci_emac.c | 18 +-
> 7 files changed, 675 insertions(+), 106 deletions(-)
>
> --
> 2.17.1
>
Thanks
/Ilias

2019-05-24 11:09:01

by Ilias Apalodimas

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

On Thu, May 23, 2019 at 09:20:35PM +0300, Ivan Khoronzhuk wrote:
> Add XDP support based on rx page_pool allocator, one frame per page.
> Page pool allocator is used with assumption that only one rx_handler
> is running simultaneously. DMA map/unmap is reused from page pool
> despite there is no need to map whole page.
>
> Due to specific of cpsw, the same TX/RX handler can be used by 2
> network devices, so special fields in buffer are added to identify
> an interface the frame is destined to. Thus XDP works for both
> interfaces, that allows to test xdp redirect between two interfaces
> easily.
>
> XDP prog is common for all channels till appropriate changes are added
> in XDP infrastructure.
>
> Signed-off-by: Ivan Khoronzhuk <[email protected]>
> ---
> drivers/net/ethernet/ti/Kconfig | 1 +
> drivers/net/ethernet/ti/cpsw.c | 555 ++++++++++++++++++++++---
> drivers/net/ethernet/ti/cpsw_ethtool.c | 53 +++
> drivers/net/ethernet/ti/cpsw_priv.h | 7 +
> 4 files changed, 554 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
> index bd05a977ee7e..3cb8c5214835 100644
> --- a/drivers/net/ethernet/ti/Kconfig
> +++ b/drivers/net/ethernet/ti/Kconfig
> @@ -50,6 +50,7 @@ config TI_CPSW
> depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
> select TI_DAVINCI_MDIO
> select MFD_SYSCON
> + select PAGE_POOL
> select REGMAP
> ---help---
> This driver supports TI's CPSW Ethernet Switch.
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index 87a600aeee4a..274e6b64ea9e 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -31,6 +31,10 @@
> #include <linux/if_vlan.h>
> #include <linux/kmemleak.h>
> #include <linux/sys_soc.h>
> +#include <net/page_pool.h>
> +#include <linux/bpf.h>
> +#include <linux/bpf_trace.h>
> +#include <linux/filter.h>
>
> #include <linux/pinctrl/consumer.h>
> #include <net/pkt_cls.h>
> @@ -60,6 +64,10 @@ static int descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
> module_param(descs_pool_size, int, 0444);
> MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
>
> +/* The buf includes headroom compatible with both skb and xdpf */
> +#define CPSW_HEADROOM_NA (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN)
> +#define CPSW_HEADROOM ALIGN(CPSW_HEADROOM_NA, sizeof(long))
> +
> #define for_each_slave(priv, func, arg...) \
> do { \
> struct cpsw_slave *slave; \
> @@ -74,6 +82,8 @@ MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
> (func)(slave++, ##arg); \
> } while (0)
>
> +#define CPSW_XMETA_OFFSET ALIGN(sizeof(struct xdp_frame), sizeof(long))
> +
> static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
> __be16 proto, u16 vid);
>
> @@ -337,24 +347,58 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
> return;
> }
>
> +static int cpsw_is_xdpf_handle(void *handle)
> +{
> + return (unsigned long)handle & BIT(0);
> +}
> +
> +static void *cpsw_xdpf_to_handle(struct xdp_frame *xdpf)
> +{
> + return (void *)((unsigned long)xdpf | BIT(0));
> +}
> +
> +static struct xdp_frame *cpsw_handle_to_xdpf(void *handle)
> +{
> + return (struct xdp_frame *)((unsigned long)handle & ~BIT(0));
> +}
> +
> +struct __aligned(sizeof(long)) cpsw_meta_xdp {
> + struct net_device *ndev;
> + int ch;
> +};
> +
> int cpsw_tx_handler(void *token, int len, int status)
> {
> + struct cpsw_meta_xdp *xmeta;
> + struct xdp_frame *xdpf;
> + struct net_device *ndev;
> struct netdev_queue *txq;
> - struct sk_buff *skb = token;
> - struct net_device *ndev = skb->dev;
> - struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
> + struct sk_buff *skb;
> + int ch;
> +
> + if (cpsw_is_xdpf_handle(token)) {
> + xdpf = cpsw_handle_to_xdpf(token);
> + xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
> + ndev = xmeta->ndev;
> + ch = xmeta->ch;
> + xdp_return_frame_rx_napi(xdpf);
> + } else {
> + skb = token;
> + ndev = skb->dev;
> + ch = skb_get_queue_mapping(skb);
> + cpts_tx_timestamp(ndev_to_cpsw(ndev)->cpts, skb);
> + dev_kfree_skb_any(skb);
> + }
>
> /* Check whether the queue is stopped due to stalled tx dma, if the
> * queue is stopped then start the queue as we have free desc for tx
> */
> - txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
> + txq = netdev_get_tx_queue(ndev, ch);
> if (unlikely(netif_tx_queue_stopped(txq)))
> netif_tx_wake_queue(txq);
>
> - cpts_tx_timestamp(cpsw->cpts, skb);
> ndev->stats.tx_packets++;
> ndev->stats.tx_bytes += len;
> - dev_kfree_skb_any(skb);
> return 0;
> }
>
> @@ -401,22 +445,229 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
> }
> }
>
> +static int cpsw_xdp_tx_frame_mapped(struct cpsw_priv *priv,
> + struct xdp_frame *xdpf, struct page *page)
> +{
> + struct cpsw_common *cpsw = priv->cpsw;
> + struct cpsw_meta_xdp *xmeta;
> + struct netdev_queue *txq;
> + struct cpdma_chan *txch;
> + dma_addr_t dma;
> + int ret;
> +
> + xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
> + xmeta->ch = 0;
> +
> + txch = cpsw->txv[0].ch;
> + dma = (xdpf->data - (void *)xdpf) + page->dma_addr;
> + ret = cpdma_chan_submit_mapped(txch, cpsw_xdpf_to_handle(xdpf), dma,
> + xdpf->len,
> + priv->emac_port + cpsw->data.dual_emac);
> + if (ret) {
> + xdp_return_frame_rx_napi(xdpf);
> + goto stop;
> + }
> +
> + /* no tx desc - stop sending us tx frames */
> + if (unlikely(!cpdma_check_free_tx_desc(txch)))
> + goto stop;
> +
> + return ret;
> +stop:
> + txq = netdev_get_tx_queue(priv->ndev, 0);
> + netif_tx_stop_queue(txq);
> +
> + /* Barrier, so that stop_queue visible to other cpus */
> + smp_mb__after_atomic();
> +
> + if (cpdma_check_free_tx_desc(txch))
> + netif_tx_wake_queue(txq);
> +
> + return ret;
> +}
> +
> +static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *xdpf)
> +{
> + struct cpsw_common *cpsw = priv->cpsw;
> + struct cpsw_meta_xdp *xmeta;
> + struct netdev_queue *txq;
> + struct cpdma_chan *txch;
> + int ret;
> +
> + xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
> + if (sizeof(*xmeta) > xdpf->headroom)
> + return -EINVAL;
> +
> + xmeta->ndev = priv->ndev;
> + xmeta->ch = 0;
> +
> + txch = cpsw->txv[0].ch;
> + ret = cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
> + xdpf->len,
> + priv->emac_port + cpsw->data.dual_emac);
> + if (ret) {
> + xdp_return_frame_rx_napi(xdpf);
> + goto stop;
> + }
> +
> + /* no tx desc - stop sending us tx frames */
> + if (unlikely(!cpdma_check_free_tx_desc(txch)))
> + goto stop;
> +
> + return ret;
> +stop:
> + txq = netdev_get_tx_queue(priv->ndev, 0);
> + netif_tx_stop_queue(txq);
> +
> + /* Barrier, so that stop_queue visible to other cpus */
> + smp_mb__after_atomic();
> +
> + if (cpdma_check_free_tx_desc(txch))
> + netif_tx_wake_queue(txq);
> +
> + return ret;
> +}
> +
> +static int cpsw_run_xdp(struct cpsw_priv *priv, struct cpsw_vector *rxv,
> + struct xdp_buff *xdp, struct page *page)
> +{
> + struct net_device *ndev = priv->ndev;
> + struct xdp_frame *xdpf;
> + struct bpf_prog *prog;
> + int ret = 1;
> + u32 act;
> +
> + rcu_read_lock();
> +
> + prog = READ_ONCE(priv->xdp_prog);
> + if (!prog) {
> + ret = 0;
> + goto out;
> + }
> +
> + act = bpf_prog_run_xdp(prog, xdp);
> + switch (act) {
> + case XDP_PASS:
> + ret = 0;
> + break;
> + case XDP_TX:
> + xdpf = convert_to_xdp_frame(xdp);
> + if (unlikely(!xdpf))
> + xdp_return_buff(xdp);
> + else
> + cpsw_xdp_tx_frame_mapped(priv, xdpf, page);
> + break;
> + case XDP_REDIRECT:
> + if (xdp_do_redirect(ndev, xdp, prog))
> + xdp_return_buff(xdp);
> + else
> + ret = 2;
> + break;
> + default:
> + bpf_warn_invalid_xdp_action(act);
> + /* fall through */
> + case XDP_ABORTED:
> + trace_xdp_exception(ndev, prog, act);
> + /* fall through -- handle aborts by dropping packet */
> + case XDP_DROP:
> + xdp_return_buff(xdp);
> + break;
> + }
> +out:
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +static unsigned int cpsw_rxbuf_total_len(unsigned int len)
> +{
> + len += CPSW_HEADROOM;
> + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> + return SKB_DATA_ALIGN(len);
> +}
> +
> +struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw)
> +{
> + struct page_pool_params pp_params = { 0 };
> + struct page_pool *pool;
> +
> + pp_params.order = 0;
> + pp_params.flags = PP_FLAG_DMA_MAP;
> +
> + /* set it to number of RX descriptors, but can be more */
> + pp_params.pool_size = cpdma_get_num_rx_descs(cpsw->dma);
> + pp_params.nid = NUMA_NO_NODE;
> + pp_params.dma_dir = DMA_BIDIRECTIONAL;
> + pp_params.dev = cpsw->dev;
> +
> + pool = page_pool_create(&pp_params);
> + if (IS_ERR(pool))
> + dev_err(cpsw->dev, "cannot create rx page pool\n");
> +
> + return pool;
> +}
> +
> +static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
> +{
> + struct page_pool *pool = cpsw->rx_page_pool;
> + struct page *page, *prev_page = NULL;
> + int try = pool->p.pool_size << 2;
> + int start_free = 0, ret;
> +
> + do {
> + page = page_pool_dev_alloc_pages(pool);
> + if (!page)
> + return NULL;
> +
> + /* if netstack has page_pool recycling remove the rest */
> + if (page_ref_count(page) == 1)
> + break;
> +
> + /* start free pages in use, shouldn't happen */
> + if (prev_page == page || start_free) {
> + /* dma unmap/puts page if rfcnt != 1 */
> + page_pool_recycle_direct(pool, page);
> + start_free = 1;
> + continue;
> + }
> +
> + /* if refcnt > 1, page has been holding by netstack, it's pity,
> + * so put it to the ring to be consumed later when fast cash is
s/cash/cache

> + * empty. If ring is full then free page by recycling as above.
> + */
> + ret = ptr_ring_produce(&pool->ring, page);
> + if (ret) {
> + page_pool_recycle_direct(pool, page);
> + continue;
> + }
Although this should be fine since this part won't be called during the driver
init, I think I'd prefer unmapping the buffer and letting the network stack
free it, instead of pushing it for recycling. The occurrence should be pretty
low, so allocating a buffer every once in a while shouldn't have a noticeable
performance impact.

> +
> + if (!prev_page)
> + prev_page = page;
> + } while (try--);
> +
> + return page;
> +}
> +
> static int cpsw_rx_handler(void *token, int len, int status)
> {
> - struct cpdma_chan *ch;
> - struct sk_buff *skb = token;
> - struct sk_buff *new_skb;
> - struct net_device *ndev = skb->dev;
> - int ret = 0, port;
> - struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
> + struct page *new_page, *page = token;
> + void *pa = page_address(page);
> + struct cpsw_meta_xdp *xmeta = pa + CPSW_XMETA_OFFSET;
> + struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
> + int pkt_size = cpsw->rx_packet_max;
> + int ret = 0, port, ch = xmeta->ch;
> + struct page_pool *pool = cpsw->rx_page_pool;
> + int headroom = CPSW_HEADROOM;
> + struct net_device *ndev = xmeta->ndev;
> + int flush = 0;
> struct cpsw_priv *priv;
> + struct sk_buff *skb;
> + struct xdp_buff xdp;
> + dma_addr_t dma;
>
> if (cpsw->data.dual_emac) {
> port = CPDMA_RX_SOURCE_PORT(status);
> - if (port) {
> + if (port)
> ndev = cpsw->slaves[--port].ndev;
> - skb->dev = ndev;
> - }
> }
>
> if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
> @@ -429,47 +680,101 @@ static int cpsw_rx_handler(void *token, int len, int status)
> * in reducing of the number of rx descriptor in
> * DMA engine, requeue skb back to cpdma.
> */
> - new_skb = skb;
> + new_page = page;
> goto requeue;
> }
>
> /* the interface is going down, skbs are purged */
> - dev_kfree_skb_any(skb);
> + page_pool_recycle_direct(pool, page);
> return 0;
> }
>
> - new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
> - if (new_skb) {
> - skb_copy_queue_mapping(new_skb, skb);
> - skb_put(skb, len);
> - if (status & CPDMA_RX_VLAN_ENCAP)
> - cpsw_rx_vlan_encap(skb);
> - priv = netdev_priv(ndev);
> - if (priv->rx_ts_enabled)
> - cpts_rx_timestamp(cpsw->cpts, skb);
> - skb->protocol = eth_type_trans(skb, ndev);
> - netif_receive_skb(skb);
> - ndev->stats.rx_bytes += len;
> - ndev->stats.rx_packets++;
> - kmemleak_not_leak(new_skb);
> - } else {
> + new_page = cpsw_alloc_page(cpsw);
> + if (unlikely(!new_page)) {
> + new_page = page;
> ndev->stats.rx_dropped++;
> - new_skb = skb;
> + goto requeue;
> }
>
> + priv = netdev_priv(ndev);
> + if (priv->xdp_prog) {
> + if (status & CPDMA_RX_VLAN_ENCAP) {
> + xdp.data = (void *)pa + CPSW_HEADROOM +
> + CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> + xdp.data_end = xdp.data + len -
> + CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> + } else {
> + xdp.data = (void *)pa + CPSW_HEADROOM;
> + xdp.data_end = xdp.data + len;
> + }
> +
> + xdp_set_data_meta_invalid(&xdp);
> +
> + xdp.data_hard_start = pa;
> + xdp.rxq = &priv->xdp_rxq[ch];
> +
> + ret = cpsw_run_xdp(priv, &cpsw->rxv[ch], &xdp, page);
> + if (ret) {
> + if (ret == 2)
> + flush = 1;
> +
> + goto requeue;
> + }
> +
> + /* XDP prog might have changed packet data and boundaries */
> + len = xdp.data_end - xdp.data;
> + headroom = xdp.data - xdp.data_hard_start;
> + }
> +
> + /* Build skb and pass it to netstack if XDP off or XDP prog
> + * returned XDP_PASS
> + */
> + skb = build_skb(pa, cpsw_rxbuf_total_len(pkt_size));
> + if (!skb) {
> + ndev->stats.rx_dropped++;
> + page_pool_recycle_direct(pool, page);
> + goto requeue;
> + }
> +
> + skb_reserve(skb, headroom);
> + skb_put(skb, len);
> + skb->dev = ndev;
> + if (status & CPDMA_RX_VLAN_ENCAP)
> + cpsw_rx_vlan_encap(skb);
> + if (priv->rx_ts_enabled)
> + cpts_rx_timestamp(cpsw->cpts, skb);
> + skb->protocol = eth_type_trans(skb, ndev);
> +
> + /* recycle page before increasing refcounter, it allows to hold page in
> + * page pool cache improving allocation time, see cpsw_alloc_page().
> + */
> + page_pool_recycle_direct(pool, page);
> +
> + /* remove once ordinary netstack has page_pool recycling */
> + page_ref_inc(page);
> +
> + netif_receive_skb(skb);
> +
> + ndev->stats.rx_bytes += len;
> + ndev->stats.rx_packets++;
> +
> requeue:
> if (netif_dormant(ndev)) {
> - dev_kfree_skb_any(new_skb);
> - return 0;
> + page_pool_recycle_direct(pool, new_page);
> + return flush;
> }
>
> - ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
> - ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
> - skb_tailroom(new_skb), 0);
> + xmeta = page_address(new_page) + CPSW_XMETA_OFFSET;
> + xmeta->ndev = ndev;
> + xmeta->ch = ch;
> +
> + dma = new_page->dma_addr + CPSW_HEADROOM;
> + ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
> + pkt_size, 0);
> if (WARN_ON(ret < 0))
> - dev_kfree_skb_any(new_skb);
> + page_pool_recycle_direct(pool, new_page);
>
> - return 0;
> + return flush;
> }
>
> void cpsw_split_res(struct cpsw_common *cpsw)
> @@ -644,7 +949,7 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
> static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
> {
> u32 ch_map;
> - int num_rx, cur_budget, ch;
> + int num_rx, cur_budget, ch, flush;
> struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
> struct cpsw_vector *rxv;
>
> @@ -660,8 +965,12 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
> else
> cur_budget = rxv->budget;
>
> - cpdma_chan_process(rxv->ch, &cur_budget);
> + flush = cpdma_chan_process(rxv->ch, &cur_budget);
> num_rx += cur_budget;
> +
> + if (flush)
> + xdp_do_flush_map();
> +
> if (num_rx >= budget)
> break;
> }
> @@ -677,10 +986,15 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
> static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
> {
> struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
> - int num_rx;
> + struct cpsw_vector *rxv;
> + int num_rx, flush;
>
> num_rx = budget;
> - cpdma_chan_process(cpsw->rxv[0].ch, &num_rx);
> + rxv = &cpsw->rxv[0];
> + flush = cpdma_chan_process(rxv->ch, &num_rx);
> + if (flush)
> + xdp_do_flush_map();
> +
> if (num_rx < budget) {
> napi_complete_done(napi_rx, num_rx);
> writel(0xff, &cpsw->wr_regs->rx_en);
> @@ -1042,33 +1356,38 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
> int cpsw_fill_rx_channels(struct cpsw_priv *priv)
> {
> struct cpsw_common *cpsw = priv->cpsw;
> - struct sk_buff *skb;
> + struct cpsw_meta_xdp *xmeta;
> + struct page_pool *pool;
> + struct page *page;
> int ch_buf_num;
> int ch, i, ret;
> + dma_addr_t dma;
>
> + pool = cpsw->rx_page_pool;
> for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
> ch_buf_num = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
> for (i = 0; i < ch_buf_num; i++) {
> - skb = __netdev_alloc_skb_ip_align(priv->ndev,
> - cpsw->rx_packet_max,
> - GFP_KERNEL);
> - if (!skb) {
> - cpsw_err(priv, ifup, "cannot allocate skb\n");
> + page = cpsw_alloc_page(cpsw);
> + if (!page) {
> + cpsw_err(priv, ifup, "allocate rx page err\n");
> return -ENOMEM;
> }
>
> - skb_set_queue_mapping(skb, ch);
> - ret = cpdma_chan_submit(cpsw->rxv[ch].ch, skb,
> - skb->data, skb_tailroom(skb),
> - 0);
> + xmeta = page_address(page) + CPSW_XMETA_OFFSET;
> + xmeta->ndev = priv->ndev;
> + xmeta->ch = ch;
> +
> + dma = page->dma_addr + CPSW_HEADROOM;
> + ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, page,
> + dma, cpsw->rx_packet_max,
> + 0);
> if (ret < 0) {
> cpsw_err(priv, ifup,
> "cannot submit skb to channel %d rx, error %d\n",
> ch, ret);
> - kfree_skb(skb);
> + page_pool_recycle_direct(pool, page);
> return ret;
> }
> - kmemleak_not_leak(skb);
> }
>
> cpsw_info(priv, ifup, "ch %d rx, submitted %d descriptors\n",
> @@ -2011,6 +2330,64 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
> }
> }
>
> +static int cpsw_xdp_prog_setup(struct cpsw_priv *priv, struct netdev_bpf *bpf)
> +{
> + struct bpf_prog *prog = bpf->prog;
> +
> + if (!priv->xdpi.prog && !prog)
> + return 0;
> +
> + if (!xdp_attachment_flags_ok(&priv->xdpi, bpf))
> + return -EBUSY;
> +
> + WRITE_ONCE(priv->xdp_prog, prog);
> +
> + xdp_attachment_setup(&priv->xdpi, bpf);
> +
> + return 0;
> +}
> +
> +static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
> +{
> + struct cpsw_priv *priv = netdev_priv(ndev);
> +
> + switch (bpf->command) {
> + case XDP_SETUP_PROG:
> + return cpsw_xdp_prog_setup(priv, bpf);
> +
> + case XDP_QUERY_PROG:
> + return xdp_attachment_query(&priv->xdpi, bpf);
> +
> + default:
> + return -EINVAL;
> + }
> +}
> +
> +static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
> + struct xdp_frame **frames, u32 flags)
> +{
> + struct cpsw_priv *priv = netdev_priv(ndev);
> + struct xdp_frame *xdpf;
> + int i, drops = 0;
> +
> + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> + return -EINVAL;
> +
> + for (i = 0; i < n; i++) {
> + xdpf = frames[i];
> + if (xdpf->len < CPSW_MIN_PACKET_SIZE) {
> + xdp_return_frame_rx_napi(xdpf);
> + drops++;
> + continue;
> + }
> +
> + if (cpsw_xdp_tx_frame(priv, xdpf))
> + drops++;
> + }
> +
> + return n - drops;
> +}
> +
> #ifdef CONFIG_NET_POLL_CONTROLLER
> static void cpsw_ndo_poll_controller(struct net_device *ndev)
> {
> @@ -2039,6 +2416,8 @@ static const struct net_device_ops cpsw_netdev_ops = {
> .ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid,
> .ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid,
> .ndo_setup_tc = cpsw_ndo_setup_tc,
> + .ndo_bpf = cpsw_ndo_bpf,
> + .ndo_xdp_xmit = cpsw_ndo_xdp_xmit,
> };
>
> static void cpsw_get_drvinfo(struct net_device *ndev,
> @@ -2335,11 +2714,24 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
> ndev->netdev_ops = &cpsw_netdev_ops;
> ndev->ethtool_ops = &cpsw_ethtool_ops;
>
> + ret = xdp_rxq_info_reg(priv_sl2->xdp_rxq, ndev, 0);
> + if (ret)
> + return ret;
> +
> + ret = xdp_rxq_info_reg_mem_model(priv_sl2->xdp_rxq, MEM_TYPE_PAGE_POOL,
> + cpsw->rx_page_pool);
> + if (ret) {
> + xdp_rxq_info_unreg(priv_sl2->xdp_rxq);
> + return ret;
> + }
> +
> /* register the network device */
> SET_NETDEV_DEV(ndev, cpsw->dev);
> ret = register_netdev(ndev);
> - if (ret)
> + if (ret) {
> dev_err(cpsw->dev, "cpsw: error registering net device\n");
> + xdp_rxq_info_unreg(priv_sl2->xdp_rxq);
> + }
>
> return ret;
> }
> @@ -2457,19 +2849,25 @@ static int cpsw_probe(struct platform_device *pdev)
> if (ret)
> goto clean_dt_ret;
>
> + cpsw->rx_page_pool = cpsw_create_rx_pool(cpsw);
> + if (IS_ERR(cpsw->rx_page_pool)) {
> + ret = PTR_ERR(cpsw->rx_page_pool);
> + goto clean_cpts;
> + }
> +
> ch = cpsw->quirk_irq ? 0 : 7;
> cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
> if (IS_ERR(cpsw->txv[0].ch)) {
> dev_err(dev, "error initializing tx dma channel\n");
> ret = PTR_ERR(cpsw->txv[0].ch);
> - goto clean_cpts;
> + goto clean_pool;
> }
>
> cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
> if (IS_ERR(cpsw->rxv[0].ch)) {
> dev_err(dev, "error initializing rx dma channel\n");
> ret = PTR_ERR(cpsw->rxv[0].ch);
> - goto clean_cpts;
> + goto clean_pool;
> }
> cpsw_split_res(cpsw);
>
> @@ -2478,7 +2876,7 @@ static int cpsw_probe(struct platform_device *pdev)
> CPSW_MAX_QUEUES, CPSW_MAX_QUEUES);
> if (!ndev) {
> dev_err(dev, "error allocating net_device\n");
> - goto clean_cpts;
> + goto clean_pool;
> }
>
> platform_set_drvdata(pdev, ndev);
> @@ -2499,6 +2897,15 @@ static int cpsw_probe(struct platform_device *pdev)
>
> memcpy(ndev->dev_addr, priv->mac_addr, ETH_ALEN);
>
> + ret = xdp_rxq_info_reg(priv->xdp_rxq, ndev, 0);
> + if (ret)
> + goto clean_pool;
> +
> + ret = xdp_rxq_info_reg_mem_model(priv->xdp_rxq, MEM_TYPE_PAGE_POOL,
> + cpsw->rx_page_pool);
> + if (ret)
> + goto clean_rxq_info;
> +
> cpsw->slaves[0].ndev = ndev;
>
> ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_RX;
> @@ -2518,7 +2925,7 @@ static int cpsw_probe(struct platform_device *pdev)
> if (ret) {
> dev_err(dev, "error registering net device\n");
> ret = -ENODEV;
> - goto clean_cpts;
> + goto clean_rxq_info;
> }
>
> if (cpsw->data.dual_emac) {
> @@ -2561,6 +2968,10 @@ static int cpsw_probe(struct platform_device *pdev)
>
> clean_unregister_netdev_ret:
> unregister_netdev(ndev);
> +clean_rxq_info:
> + xdp_rxq_info_unreg(priv->xdp_rxq);
> +clean_pool:
> + page_pool_destroy(cpsw->rx_page_pool);
> clean_cpts:
> cpts_release(cpsw->cpts);
> cpdma_ctlr_destroy(cpsw->dma);
> @@ -2572,11 +2983,26 @@ static int cpsw_probe(struct platform_device *pdev)
> return ret;
> }
>
> +void cpsw_xdp_rxq_unreg(struct cpsw_common *cpsw, int ch)
> +{
> + struct cpsw_slave *slave;
> + struct cpsw_priv *priv;
> + int i;
> +
> + for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
> + if (!slave->ndev)
> + continue;
> +
> + priv = netdev_priv(slave->ndev);
> + xdp_rxq_info_unreg(&priv->xdp_rxq[ch]);
> + }
> +}
> +
> static int cpsw_remove(struct platform_device *pdev)
> {
> struct net_device *ndev = platform_get_drvdata(pdev);
> struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
> - int ret;
> + int i, ret;
>
> ret = pm_runtime_get_sync(&pdev->dev);
> if (ret < 0) {
> @@ -2590,6 +3016,11 @@ static int cpsw_remove(struct platform_device *pdev)
>
> cpts_release(cpsw->cpts);
> cpdma_ctlr_destroy(cpsw->dma);
> +
> + for (i = 0; i < cpsw->rx_ch_num; i++)
> + cpsw_xdp_rxq_unreg(cpsw, i);
> +
> + page_pool_destroy(cpsw->rx_page_pool);
> cpsw_remove_dt(pdev);
> pm_runtime_put_sync(&pdev->dev);
> pm_runtime_disable(&pdev->dev);
> diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
> index 0c08ec91635a..dbe4bc5513c6 100644
> --- a/drivers/net/ethernet/ti/cpsw_ethtool.c
> +++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
> @@ -14,6 +14,7 @@
> #include <linux/phy.h>
> #include <linux/pm_runtime.h>
> #include <linux/skbuff.h>
> +#include <net/page_pool.h>
>
> #include "cpsw.h"
> #include "cpts.h"
> @@ -531,6 +532,42 @@ static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
> return 0;
> }
>
> +static int cpsw_xdp_rxq_reg(struct cpsw_common *cpsw, int ch)
> +{
> + struct cpsw_slave *slave;
> + struct cpsw_priv *priv;
> + int i, ret;
> +
> + /* As channels are common for both ports sharing same queues, xdp_rxq
> + * information also becomes shared and used by every packet on this
> + * channel. But exch xdp_rxq holds link on netdev, which by the theory
> + * can have different memory model and so, network device must hold it's
> + * own set of rxq and thus both netdevs should be prepared
> + */
> + for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
> + if (!slave->ndev)
> + continue;
> +
> + priv = netdev_priv(slave->ndev);
> +
> + ret = xdp_rxq_info_reg(&priv->xdp_rxq[ch], priv->ndev, ch);
> + if (ret)
> + goto err;
> +
> + ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[ch],
> + MEM_TYPE_PAGE_POOL,
> + cpsw->rx_page_pool);
> + if (ret)
> + goto err;
> + }
> +
> + return ret;
> +
> +err:
> + cpsw_xdp_rxq_unreg(cpsw, ch);
> + return ret;
> +}
> +
> static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
> cpdma_handler_fn rx_handler)
> {
> @@ -562,6 +599,11 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
> if (!vec[*ch].ch)
> return -EINVAL;
>
> + if (rx && cpsw_xdp_rxq_reg(cpsw, *ch)) {
> + cpdma_chan_destroy(vec[*ch].ch);
> + return -EINVAL;
> + }
> +
> cpsw_info(priv, ifup, "created new %d %s channel\n", *ch,
> (rx ? "rx" : "tx"));
> (*ch)++;
> @@ -570,6 +612,9 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
> while (*ch > ch_num) {
> (*ch)--;
>
> + if (rx)
> + cpsw_xdp_rxq_unreg(cpsw, *ch);
> +
> ret = cpdma_chan_destroy(vec[*ch].ch);
> if (ret)
> return ret;
> @@ -654,6 +699,7 @@ int cpsw_set_ringparam(struct net_device *ndev,
> {
> struct cpsw_priv *priv = netdev_priv(ndev);
> struct cpsw_common *cpsw = priv->cpsw;
> + struct page_pool *pool;
> int ret;
>
> /* ignore ering->tx_pending - only rx_pending adjustment is supported */
> @@ -666,6 +712,10 @@ int cpsw_set_ringparam(struct net_device *ndev,
> if (ering->rx_pending == cpdma_get_num_rx_descs(cpsw->dma))
> return 0;
>
> + pool = cpsw_create_rx_pool(cpsw);
> + if (IS_ERR(pool))
> + return PTR_ERR(pool);
> +
> cpsw_suspend_data_pass(ndev);
>
> cpdma_set_num_rx_descs(cpsw->dma, ering->rx_pending);
> @@ -673,6 +723,9 @@ int cpsw_set_ringparam(struct net_device *ndev,
> if (cpsw->usage_count)
> cpdma_chan_split_pool(cpsw->dma);
>
> + page_pool_destroy(cpsw->rx_page_pool);
> + cpsw->rx_page_pool = pool;
> +
> ret = cpsw_resume_data_pass(ndev);
> if (!ret)
> return 0;
> diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
> index 2ecb3af59fe9..884ce6343a7d 100644
> --- a/drivers/net/ethernet/ti/cpsw_priv.h
> +++ b/drivers/net/ethernet/ti/cpsw_priv.h
> @@ -346,6 +346,7 @@ struct cpsw_common {
> int rx_ch_num, tx_ch_num;
> int speed;
> int usage_count;
> + struct page_pool *rx_page_pool;
> };
>
> struct cpsw_priv {
> @@ -360,6 +361,10 @@ struct cpsw_priv {
> int shp_cfg_speed;
> int tx_ts_enabled;
> int rx_ts_enabled;
> + struct bpf_prog *xdp_prog;
> + struct xdp_rxq_info xdp_rxq[CPSW_MAX_QUEUES];
> + struct xdp_attachment_info xdpi;
> +
> u32 emac_port;
> struct cpsw_common *cpsw;
> };
> @@ -391,6 +396,8 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv);
> void cpsw_intr_enable(struct cpsw_common *cpsw);
> void cpsw_intr_disable(struct cpsw_common *cpsw);
> int cpsw_tx_handler(void *token, int len, int status);
> +void cpsw_xdp_rxq_unreg(struct cpsw_common *cpsw, int ch);
> +struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw);
>
> /* ethtool */
> u32 cpsw_get_msglevel(struct net_device *ndev);
> --
> 2.17.1
>

2019-05-24 11:56:21

by Jesper Dangaard Brouer

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

On Thu, 23 May 2019 21:20:35 +0300
Ivan Khoronzhuk <[email protected]> wrote:

> Add XDP support based on rx page_pool allocator, one frame per page.
> Page pool allocator is used with assumption that only one rx_handler
> is running simultaneously. DMA map/unmap is reused from page pool
> despite there is no need to map whole page.

When using page_pool for DMA-mapping, your XDP-memory model must use
1-page per packet, which you state you do. This is because
__page_pool_put_page() fallback mode does a __page_pool_clean_page()
unmapping the DMA. Ilias and I are looking at options for removing this
restriction as Mlx5 would need it (when we extend the SKB to return
pages to page_pool).

Unfortunately, I've found another blocker for drivers using the DMA
mapping feature of page_pool. We don't properly handle the case where
a remote TX-driver has xdp_frames in flight and, simultaneously, the
sending driver is unloaded and takes down the page_pool. Nothing crashes,
but we end up calling put_page() on a page that is still DMA-mapped.

I'm working on different solutions for fixing this, see here:
https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool03_shutdown_inflight.org
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer

2019-05-24 17:51:42

by Grygorii Strashko

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

Hi Ivan,

On 23/05/2019 21:20, Ivan Khoronzhuk wrote:
> Add XDP support based on rx page_pool allocator, one frame per page.
> Page pool allocator is used with assumption that only one rx_handler
> is running simultaneously. DMA map/unmap is reused from page pool
> despite there is no need to map whole page.
>
> Due to specific of cpsw, the same TX/RX handler can be used by 2
> network devices, so special fields in buffer are added to identify
> an interface the frame is destined to. Thus XDP works for both
> interfaces, that allows to test xdp redirect between two interfaces
> easily.
>
> XDP prog is common for all channels till appropriate changes are added
> in XDP infrastructure.
>
> Signed-off-by: Ivan Khoronzhuk <[email protected]>
> ---
> drivers/net/ethernet/ti/Kconfig | 1 +
> drivers/net/ethernet/ti/cpsw.c | 555 ++++++++++++++++++++++---
> drivers/net/ethernet/ti/cpsw_ethtool.c | 53 +++
> drivers/net/ethernet/ti/cpsw_priv.h | 7 +
> 4 files changed, 554 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
> index bd05a977ee7e..3cb8c5214835 100644
> --- a/drivers/net/ethernet/ti/Kconfig
> +++ b/drivers/net/ethernet/ti/Kconfig
> @@ -50,6 +50,7 @@ config TI_CPSW
> depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
> select TI_DAVINCI_MDIO
> select MFD_SYSCON
> + select PAGE_POOL
> select REGMAP
> ---help---
> This driver supports TI's CPSW Ethernet Switch.
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index 87a600aeee4a..274e6b64ea9e 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -31,6 +31,10 @@
> #include <linux/if_vlan.h>
> #include <linux/kmemleak.h>
> #include <linux/sys_soc.h>
> +#include <net/page_pool.h>
> +#include <linux/bpf.h>
> +#include <linux/bpf_trace.h>
> +#include <linux/filter.h>
>
> #include <linux/pinctrl/consumer.h>
> #include <net/pkt_cls.h>
> @@ -60,6 +64,10 @@ static int descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
> module_param(descs_pool_size, int, 0444);
> MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
>
> +/* The buf includes headroom compatible with both skb and xdpf */
> +#define CPSW_HEADROOM_NA (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN)
> +#define CPSW_HEADROOM ALIGN(CPSW_HEADROOM_NA, sizeof(long))
> +
> #define for_each_slave(priv, func, arg...) \
> do { \
> struct cpsw_slave *slave; \
> @@ -74,6 +82,8 @@ MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
> (func)(slave++, ##arg); \
> } while (0)
>
> +#define CPSW_XMETA_OFFSET ALIGN(sizeof(struct xdp_frame), sizeof(long))
> +
> static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
> __be16 proto, u16 vid);
>
> @@ -337,24 +347,58 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
> return;
> }

[..]

>
> +static int cpsw_xdp_tx_frame_mapped(struct cpsw_priv *priv,
> + struct xdp_frame *xdpf, struct page *page)
> +{
> + struct cpsw_common *cpsw = priv->cpsw;
> + struct cpsw_meta_xdp *xmeta;
> + struct netdev_queue *txq;
> + struct cpdma_chan *txch;
> + dma_addr_t dma;
> + int ret;
> +
> + xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
> + xmeta->ch = 0;
> +
> + txch = cpsw->txv[0].ch;
> + dma = (xdpf->data - (void *)xdpf) + page->dma_addr;
> + ret = cpdma_chan_submit_mapped(txch, cpsw_xdpf_to_handle(xdpf), dma,
> + xdpf->len,
> + priv->emac_port + cpsw->data.dual_emac);
> + if (ret) {
> + xdp_return_frame_rx_napi(xdpf);
> + goto stop;
> + }
> +
> + /* no tx desc - stop sending us tx frames */
> + if (unlikely(!cpdma_check_free_tx_desc(txch)))
> + goto stop;
> +
> + return ret;
> +stop:
> + txq = netdev_get_tx_queue(priv->ndev, 0);
> + netif_tx_stop_queue(txq);
> +
> + /* Barrier, so that stop_queue visible to other cpus */
> + smp_mb__after_atomic();
> +
> + if (cpdma_check_free_tx_desc(txch))
> + netif_tx_wake_queue(txq);
> +
> + return ret;
> +}
> +
> +static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *xdpf)
> +{
> + struct cpsw_common *cpsw = priv->cpsw;
> + struct cpsw_meta_xdp *xmeta;
> + struct netdev_queue *txq;
> + struct cpdma_chan *txch;
> + int ret;
> +
> + xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
> + if (sizeof(*xmeta) > xdpf->headroom)
> + return -EINVAL;
> +
> + xmeta->ndev = priv->ndev;
> + xmeta->ch = 0;
> +
> + txch = cpsw->txv[0].ch;
> + ret = cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
> + xdpf->len,
> + priv->emac_port + cpsw->data.dual_emac);
> + if (ret) {
> + xdp_return_frame_rx_napi(xdpf);
> + goto stop;
> + }
> +
> + /* no tx desc - stop sending us tx frames */
> + if (unlikely(!cpdma_check_free_tx_desc(txch)))
> + goto stop;
> +
> + return ret;
> +stop:
> + txq = netdev_get_tx_queue(priv->ndev, 0);
> + netif_tx_stop_queue(txq);
> +
> + /* Barrier, so that stop_queue visible to other cpus */
> + smp_mb__after_atomic();
> +
> + if (cpdma_check_free_tx_desc(txch))
> + netif_tx_wake_queue(txq);
> +
> + return ret;
> +}

The two functions above are mostly identical - could you do something about that?

> +
> +static int cpsw_run_xdp(struct cpsw_priv *priv, struct cpsw_vector *rxv,
> + struct xdp_buff *xdp, struct page *page)
> +{
> + struct net_device *ndev = priv->ndev;
> + struct xdp_frame *xdpf;
> + struct bpf_prog *prog;
> + int ret = 1;
> + u32 act;
> +
> + rcu_read_lock();
> +
> + prog = READ_ONCE(priv->xdp_prog);
> + if (!prog) {
> + ret = 0;
> + goto out;
> + }
> +
> + act = bpf_prog_run_xdp(prog, xdp);
> + switch (act) {
> + case XDP_PASS:
> + ret = 0;
> + break;
> + case XDP_TX:
> + xdpf = convert_to_xdp_frame(xdp);
> + if (unlikely(!xdpf))
> + xdp_return_buff(xdp);
> + else
> + cpsw_xdp_tx_frame_mapped(priv, xdpf, page);
> + break;
> + case XDP_REDIRECT:
> + if (xdp_do_redirect(ndev, xdp, prog))
> + xdp_return_buff(xdp);
> + else
> + ret = 2;

Could we avoid using magic constants as return values?
Maybe use some informative defines/enum instead?

> + break;
> + default:
> + bpf_warn_invalid_xdp_action(act);
> + /* fall through */
> + case XDP_ABORTED:
> + trace_xdp_exception(ndev, prog, act);
> + /* fall through -- handle aborts by dropping packet */
> + case XDP_DROP:
> + xdp_return_buff(xdp);
> + break;
> + }
> +out:
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +static unsigned int cpsw_rxbuf_total_len(unsigned int len)
> +{
> + len += CPSW_HEADROOM;
> + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> + return SKB_DATA_ALIGN(len);
> +}
> +
> +struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw)
> +{
> + struct page_pool_params pp_params = { 0 };
> + struct page_pool *pool;
> +
> + pp_params.order = 0;
> + pp_params.flags = PP_FLAG_DMA_MAP;
> +
> + /* set it to number of RX descriptors, but can be more */
> + pp_params.pool_size = cpdma_get_num_rx_descs(cpsw->dma);
> + pp_params.nid = NUMA_NO_NODE;
> + pp_params.dma_dir = DMA_BIDIRECTIONAL;
> + pp_params.dev = cpsw->dev;
> +
> + pool = page_pool_create(&pp_params);
> + if (IS_ERR(pool))
> + dev_err(cpsw->dev, "cannot create rx page pool\n");
> +
> + return pool;
> +}
> +
> +static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
> +{
> + struct page_pool *pool = cpsw->rx_page_pool;
> + struct page *page, *prev_page = NULL;
> + int try = pool->p.pool_size << 2;
> + int start_free = 0, ret;
> +
> + do {
> + page = page_pool_dev_alloc_pages(pool);
> + if (!page)
> + return NULL;
> +
> + /* if netstack has page_pool recycling remove the rest */
> + if (page_ref_count(page) == 1)
> + break;
> +
> + /* start free pages in use, shouldn't happen */
> + if (prev_page == page || start_free) {
> + /* dma unmap/puts page if rfcnt != 1 */
> + page_pool_recycle_direct(pool, page);
> + start_free = 1;
> + continue;
> + }
> +
> + /* if refcnt > 1, page has been holding by netstack, it's pity,
> + * so put it to the ring to be consumed later when fast cash is
> + * empty. If ring is full then free page by recycling as above.
> + */
> + ret = ptr_ring_produce(&pool->ring, page);
> + if (ret) {
> + page_pool_recycle_direct(pool, page);
> + continue;
> + }
> +
> + if (!prev_page)
> + prev_page = page;
> + } while (try--);
> +
> + return page;
> +}
> +
> static int cpsw_rx_handler(void *token, int len, int status)
> {
> - struct cpdma_chan *ch;
> - struct sk_buff *skb = token;
> - struct sk_buff *new_skb;
> - struct net_device *ndev = skb->dev;
> - int ret = 0, port;
> - struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
> + struct page *new_page, *page = token;
> + void *pa = page_address(page);
> + struct cpsw_meta_xdp *xmeta = pa + CPSW_XMETA_OFFSET;
> + struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
> + int pkt_size = cpsw->rx_packet_max;
> + int ret = 0, port, ch = xmeta->ch;
> + struct page_pool *pool = cpsw->rx_page_pool;
> + int headroom = CPSW_HEADROOM;
> + struct net_device *ndev = xmeta->ndev;
> + int flush = 0;
> struct cpsw_priv *priv;
> + struct sk_buff *skb;
> + struct xdp_buff xdp;
> + dma_addr_t dma;
>
> if (cpsw->data.dual_emac) {
> port = CPDMA_RX_SOURCE_PORT(status);
> - if (port) {
> + if (port)
> ndev = cpsw->slaves[--port].ndev;
> - skb->dev = ndev;
> - }
> }
>
> if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
> @@ -429,47 +680,101 @@ static int cpsw_rx_handler(void *token, int len, int status)
> * in reducing of the number of rx descriptor in
> * DMA engine, requeue skb back to cpdma.
> */
> - new_skb = skb;
> + new_page = page;
> goto requeue;
> }
>
> /* the interface is going down, skbs are purged */
> - dev_kfree_skb_any(skb);
> + page_pool_recycle_direct(pool, page);
> return 0;
> }
>
> - new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
> - if (new_skb) {
> - skb_copy_queue_mapping(new_skb, skb);
> - skb_put(skb, len);
> - if (status & CPDMA_RX_VLAN_ENCAP)
> - cpsw_rx_vlan_encap(skb);
> - priv = netdev_priv(ndev);
> - if (priv->rx_ts_enabled)
> - cpts_rx_timestamp(cpsw->cpts, skb);
> - skb->protocol = eth_type_trans(skb, ndev);
> - netif_receive_skb(skb);
> - ndev->stats.rx_bytes += len;
> - ndev->stats.rx_packets++;
> - kmemleak_not_leak(new_skb);
> - } else {
> + new_page = cpsw_alloc_page(cpsw);
> + if (unlikely(!new_page)) {
> + new_page = page;
> ndev->stats.rx_dropped++;
> - new_skb = skb;
> + goto requeue;
> }
>
> + priv = netdev_priv(ndev);
> + if (priv->xdp_prog) {
> + if (status & CPDMA_RX_VLAN_ENCAP) {
> + xdp.data = (void *)pa + CPSW_HEADROOM +
> + CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> + xdp.data_end = xdp.data + len -
> + CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> + } else {
> + xdp.data = (void *)pa + CPSW_HEADROOM;
> + xdp.data_end = xdp.data + len;
> + }
> +
> + xdp_set_data_meta_invalid(&xdp);
> +
> + xdp.data_hard_start = pa;
> + xdp.rxq = &priv->xdp_rxq[ch];
> +
> + ret = cpsw_run_xdp(priv, &cpsw->rxv[ch], &xdp, page);
> + if (ret) {
> + if (ret == 2)
> + flush = 1;

const?

> +
> + goto requeue;
> + }
> +
> + /* XDP prog might have changed packet data and boundaries */
> + len = xdp.data_end - xdp.data;
> + headroom = xdp.data - xdp.data_hard_start;
> + }
> +
> + /* Build skb and pass it to netstack if XDP off or XDP prog
> + * returned XDP_PASS
> + */
> + skb = build_skb(pa, cpsw_rxbuf_total_len(pkt_size));
> + if (!skb) {
> + ndev->stats.rx_dropped++;
> + page_pool_recycle_direct(pool, page);
> + goto requeue;
> + }
> +
> + skb_reserve(skb, headroom);
> + skb_put(skb, len);
> + skb->dev = ndev;
> + if (status & CPDMA_RX_VLAN_ENCAP)
> + cpsw_rx_vlan_encap(skb);
> + if (priv->rx_ts_enabled)
> + cpts_rx_timestamp(cpsw->cpts, skb);
> + skb->protocol = eth_type_trans(skb, ndev);
> +
> + /* recycle page before increasing refcounter, it allows to hold page in
> + * page pool cache improving allocation time, see cpsw_alloc_page().
> + */
> + page_pool_recycle_direct(pool, page);
> +
> + /* remove once ordinary netstack has page_pool recycling */
> + page_ref_inc(page);
> +
> + netif_receive_skb(skb);
> +
> + ndev->stats.rx_bytes += len;
> + ndev->stats.rx_packets++;
> +
> requeue:
> if (netif_dormant(ndev)) {
> - dev_kfree_skb_any(new_skb);
> - return 0;
> + page_pool_recycle_direct(pool, new_page);
> + return flush;
> }
>
> - ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
> - ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
> - skb_tailroom(new_skb), 0);
> + xmeta = page_address(new_page) + CPSW_XMETA_OFFSET;
> + xmeta->ndev = ndev;
> + xmeta->ch = ch;
> +
> + dma = new_page->dma_addr + CPSW_HEADROOM;
> + ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
> + pkt_size, 0);
> if (WARN_ON(ret < 0))
> - dev_kfree_skb_any(new_skb);
> + page_pool_recycle_direct(pool, new_page);
>
> - return 0;
> + return flush;
> }
>
> void cpsw_split_res(struct cpsw_common *cpsw)
> @@ -644,7 +949,7 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
> static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
> {
> u32 ch_map;
> - int num_rx, cur_budget, ch;
> + int num_rx, cur_budget, ch, flush;
> struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
> struct cpsw_vector *rxv;
>
> @@ -660,8 +965,12 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
> else
> cur_budget = rxv->budget;
>
> - cpdma_chan_process(rxv->ch, &cur_budget);
> + flush = cpdma_chan_process(rxv->ch, &cur_budget);
> num_rx += cur_budget;
> +
> + if (flush)
> + xdp_do_flush_map();

const?

> +
> if (num_rx >= budget)
> break;
> }
> @@ -677,10 +986,15 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
> static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)

Thank you

--
Best regards,
grygorii

2019-05-27 07:21:54

by Dan Carpenter

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

Hi Ivan,

Thank you for the patch! Perhaps something to improve:

url: https://github.com/0day-ci/linux/commits/Ivan-Khoronzhuk/net-ethernet-ti-cpsw-Add-XDP-support/20190524-114123

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
Reported-by: Dan Carpenter <[email protected]>

smatch warnings:
drivers/net/ethernet/ti/cpsw_ethtool.c:564 cpsw_xdp_rxq_reg() error: uninitialized symbol 'ret'.

# https://github.com/0day-ci/linux/commit/3cf4eb125ed19d18340fd3b0c4d7eb2f1ebdfb28
git remote add linux-review https://github.com/0day-ci/linux
git remote update linux-review
git checkout 3cf4eb125ed19d18340fd3b0c4d7eb2f1ebdfb28
vim +/ret +564 drivers/net/ethernet/ti/cpsw_ethtool.c

c24eef28 Grygorii Strashko 2019-04-26 534
3cf4eb12 Ivan Khoronzhuk 2019-05-23 535 static int cpsw_xdp_rxq_reg(struct cpsw_common *cpsw, int ch)
3cf4eb12 Ivan Khoronzhuk 2019-05-23 536 {
3cf4eb12 Ivan Khoronzhuk 2019-05-23 537 struct cpsw_slave *slave;
3cf4eb12 Ivan Khoronzhuk 2019-05-23 538 struct cpsw_priv *priv;
3cf4eb12 Ivan Khoronzhuk 2019-05-23 539 int i, ret;
3cf4eb12 Ivan Khoronzhuk 2019-05-23 540
3cf4eb12 Ivan Khoronzhuk 2019-05-23 541 /* As channels are common for both ports sharing same queues, xdp_rxq
3cf4eb12 Ivan Khoronzhuk 2019-05-23 542 * information also becomes shared and used by every packet on this
3cf4eb12 Ivan Khoronzhuk 2019-05-23 543 * channel. But exch xdp_rxq holds link on netdev, which by the theory
3cf4eb12 Ivan Khoronzhuk 2019-05-23 544 * can have different memory model and so, network device must hold it's
3cf4eb12 Ivan Khoronzhuk 2019-05-23 545 * own set of rxq and thus both netdevs should be prepared
3cf4eb12 Ivan Khoronzhuk 2019-05-23 546 */
3cf4eb12 Ivan Khoronzhuk 2019-05-23 547 for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
3cf4eb12 Ivan Khoronzhuk 2019-05-23 548 if (!slave->ndev)
3cf4eb12 Ivan Khoronzhuk 2019-05-23 549 continue;

Smatch always complains that every loop iteration could continue. Or
that cpsw->data.slaves might be zero at the start... It seems
implausible.

3cf4eb12 Ivan Khoronzhuk 2019-05-23 550
3cf4eb12 Ivan Khoronzhuk 2019-05-23 551 priv = netdev_priv(slave->ndev);
3cf4eb12 Ivan Khoronzhuk 2019-05-23 552
3cf4eb12 Ivan Khoronzhuk 2019-05-23 553 ret = xdp_rxq_info_reg(&priv->xdp_rxq[ch], priv->ndev, ch);
3cf4eb12 Ivan Khoronzhuk 2019-05-23 554 if (ret)
3cf4eb12 Ivan Khoronzhuk 2019-05-23 555 goto err;
3cf4eb12 Ivan Khoronzhuk 2019-05-23 556
3cf4eb12 Ivan Khoronzhuk 2019-05-23 557 ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[ch],
3cf4eb12 Ivan Khoronzhuk 2019-05-23 558 MEM_TYPE_PAGE_POOL,
3cf4eb12 Ivan Khoronzhuk 2019-05-23 559 cpsw->rx_page_pool);
3cf4eb12 Ivan Khoronzhuk 2019-05-23 560 if (ret)
3cf4eb12 Ivan Khoronzhuk 2019-05-23 561 goto err;
3cf4eb12 Ivan Khoronzhuk 2019-05-23 562 }
3cf4eb12 Ivan Khoronzhuk 2019-05-23 563
3cf4eb12 Ivan Khoronzhuk 2019-05-23 @564 return ret;

This would be more readable as "return 0;" anyway.

3cf4eb12 Ivan Khoronzhuk 2019-05-23 565
3cf4eb12 Ivan Khoronzhuk 2019-05-23 566 err:
3cf4eb12 Ivan Khoronzhuk 2019-05-23 567 cpsw_xdp_rxq_unreg(cpsw, ch);
3cf4eb12 Ivan Khoronzhuk 2019-05-23 568 return ret;
3cf4eb12 Ivan Khoronzhuk 2019-05-23 569 }

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation

2019-05-27 18:12:12

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

On Fri, May 24, 2019 at 01:54:18PM +0200, Jesper Dangaard Brouer wrote:
>On Thu, 23 May 2019 21:20:35 +0300
>Ivan Khoronzhuk <[email protected]> wrote:
>
>> Add XDP support based on rx page_pool allocator, one frame per page.
>> Page pool allocator is used with assumption that only one rx_handler
>> is running simultaneously. DMA map/unmap is reused from page pool
>> despite there is no need to map whole page.
>
>When using page_pool for DMA-mapping, your XDP-memory model must use
>1-page per packet, which you state you do. This is because
>__page_pool_put_page() fallback mode does a __page_pool_clean_page()
>unmapping the DMA. Ilias and I are looking at options for removing this
>restriction as Mlx5 would need it (when we extend the SKB to return
>pages to page_pool).
Thanks for what you do, it can simplify a lot...

>
>Unfortunately, I've found another blocker for drivers using the DMA
>mapping feature of page_pool. We don't properly handle the case, where
>a remote TX-driver have xdp_frame's in-flight, and simultaneously the
>sending driver is unloaded and take down the page_pool. Nothing crash,
>but we end-up calling put_page() on a page that is still DMA-mapped.

Seems so, ... for the generic solution, but it looks like in the case of cpsw
there is no issue due to the "like direct" dma map by adding an offset, so
whether it is page_pool dma map or dma map/unmap per rx/xmit, there shouldn't
be a big difference. Not sure about all SoCs though...

Despite that, for cpsw I keep the page_pool across down/up, which I'm going to
change in v2.

>
>I'm working on different solutions for fixing this, see here:
> https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool03_shutdown_inflight.org
Hope there will be no changes in page_pool API.

>-- Best regards,
> Jesper Dangaard Brouer
> MSc.CS, Principal Kernel Engineer at Red Hat
> LinkedIn: http://www.linkedin.com/in/brouer

--
Regards,
Ivan Khoronzhuk

2019-05-27 18:22:40

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

On Fri, May 24, 2019 at 02:05:11PM +0300, Ilias Apalodimas wrote:
>On Thu, May 23, 2019 at 09:20:35PM +0300, Ivan Khoronzhuk wrote:
>> Add XDP support based on rx page_pool allocator, one frame per page.
>> Page pool allocator is used with assumption that only one rx_handler
>> is running simultaneously. DMA map/unmap is reused from page pool
>> despite there is no need to map whole page.
>>
>> Due to specific of cpsw, the same TX/RX handler can be used by 2
>> network devices, so special fields in buffer are added to identify
>> an interface the frame is destined to. Thus XDP works for both
>> interfaces, that allows to test xdp redirect between two interfaces
>> easily.
>>
>> XDP prog is common for all channels till appropriate changes are added
>> in XDP infrastructure.
>>
>> Signed-off-by: Ivan Khoronzhuk <[email protected]>
>> ---
>> drivers/net/ethernet/ti/Kconfig | 1 +
>> drivers/net/ethernet/ti/cpsw.c | 555 ++++++++++++++++++++++---
>> drivers/net/ethernet/ti/cpsw_ethtool.c | 53 +++
>> drivers/net/ethernet/ti/cpsw_priv.h | 7 +
>> 4 files changed, 554 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
>> index bd05a977ee7e..3cb8c5214835 100644
>> --- a/drivers/net/ethernet/ti/Kconfig
>> +++ b/drivers/net/ethernet/ti/Kconfig
>> @@ -50,6 +50,7 @@ config TI_CPSW
>> depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
>> select TI_DAVINCI_MDIO
>> select MFD_SYSCON
>> + select PAGE_POOL
>> select REGMAP
>> ---help---
>> This driver supports TI's CPSW Ethernet Switch.
>> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
>> index 87a600aeee4a..274e6b64ea9e 100644
>> --- a/drivers/net/ethernet/ti/cpsw.c
>> +++ b/drivers/net/ethernet/ti/cpsw.c
>> @@ -31,6 +31,10 @@
>> #include <linux/if_vlan.h>
>> #include <linux/kmemleak.h>
>> #include <linux/sys_soc.h>
>> +#include <net/page_pool.h>
>> +#include <linux/bpf.h>
>> +#include <linux/bpf_trace.h>
>> +#include <linux/filter.h>
>>
>> #include <linux/pinctrl/consumer.h>

[...]

>> + start_free = 1;
>> + continue;
>> + }
>> +
>> + /* if refcnt > 1, page has been holding by netstack, it's pity,
>> + * so put it to the ring to be consumed later when fast cash is
>s/cash/cache
>
>> + * empty. If ring is full then free page by recycling as above.
>> + */
>> + ret = ptr_ring_produce(&pool->ring, page);
>> + if (ret) {
>> + page_pool_recycle_direct(pool, page);
>> + continue;
>> + }
>Although this should be fine since this part won't be called during the driver
>init, i think i'd prefer unmapping the buffer and let the network stack free it,
>instead of pushing it for recycling. The occurence should be pretty low, so
>allocating a buffer every once in a while shouldn't have a noticeable
>performance impact
>

Ok, I will leave previous version from RFC.

--
Regards,
Ivan Khoronzhuk

2019-05-27 18:30:53

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

On Fri, May 24, 2019 at 08:49:38PM +0300, grygorii wrote:
>Hi Ivan,
>
>On 23/05/2019 21:20, Ivan Khoronzhuk wrote:
>>Add XDP support based on rx page_pool allocator, one frame per page.
>>Page pool allocator is used with assumption that only one rx_handler
>>is running simultaneously. DMA map/unmap is reused from page pool
>>despite there is no need to map whole page.
>>
>>Due to specific of cpsw, the same TX/RX handler can be used by 2
>>network devices, so special fields in buffer are added to identify
>>an interface the frame is destined to. Thus XDP works for both
>>interfaces, that allows to test xdp redirect between two interfaces
>>easily.
>>
>>XDP prog is common for all channels till appropriate changes are added
>>in XDP infrastructure.
>>
>>Signed-off-by: Ivan Khoronzhuk <[email protected]>
>>---
>> drivers/net/ethernet/ti/Kconfig | 1 +
>> drivers/net/ethernet/ti/cpsw.c | 555 ++++++++++++++++++++++---
>> drivers/net/ethernet/ti/cpsw_ethtool.c | 53 +++
>> drivers/net/ethernet/ti/cpsw_priv.h | 7 +
>> 4 files changed, 554 insertions(+), 62 deletions(-)
>>
>>diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
>>index bd05a977ee7e..3cb8c5214835 100644
>>--- a/drivers/net/ethernet/ti/Kconfig
>>+++ b/drivers/net/ethernet/ti/Kconfig
>>@@ -50,6 +50,7 @@ config TI_CPSW
>> depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
>> select TI_DAVINCI_MDIO
>> select MFD_SYSCON
>>+ select PAGE_POOL
>> select REGMAP
>> ---help---
>> This driver supports TI's CPSW Ethernet Switch.
>>diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
>>index 87a600aeee4a..274e6b64ea9e 100644
>>--- a/drivers/net/ethernet/ti/cpsw.c
>>+++ b/drivers/net/ethernet/ti/cpsw.c
>>@@ -31,6 +31,10 @@
>> #include <linux/if_vlan.h>
>> #include <linux/kmemleak.h>
>> #include <linux/sys_soc.h>
>>+#include <net/page_pool.h>
>>+#include <linux/bpf.h>
>>+#include <linux/bpf_trace.h>
>>+#include <linux/filter.h>
>> #include <linux/pinctrl/consumer.h>
>> #include <net/pkt_cls.h>
>>@@ -60,6 +64,10 @@ static int descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
>> module_param(descs_pool_size, int, 0444);
>> MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
>>+/* The buf includes headroom compatible with both skb and xdpf */
>>+#define CPSW_HEADROOM_NA (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN)
>>+#define CPSW_HEADROOM ALIGN(CPSW_HEADROOM_NA, sizeof(long))
>>+
>> #define for_each_slave(priv, func, arg...) \
>> do { \
>> struct cpsw_slave *slave; \
>>@@ -74,6 +82,8 @@ MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
>> (func)(slave++, ##arg); \
>> } while (0)
>>+#define CPSW_XMETA_OFFSET ALIGN(sizeof(struct xdp_frame), sizeof(long))
>>+
>> static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
>> __be16 proto, u16 vid);
>>@@ -337,24 +347,58 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
>> return;
>> }
>
>[..]
>
>>+static int cpsw_xdp_tx_frame_mapped(struct cpsw_priv *priv,
>>+ struct xdp_frame *xdpf, struct page *page)
>>+{
>>+ struct cpsw_common *cpsw = priv->cpsw;
>>+ struct cpsw_meta_xdp *xmeta;
>>+ struct netdev_queue *txq;
>>+ struct cpdma_chan *txch;
>>+ dma_addr_t dma;
>>+ int ret;
>>+
>>+ xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
>>+ xmeta->ch = 0;
>>+
>>+ txch = cpsw->txv[0].ch;
>>+ dma = (xdpf->data - (void *)xdpf) + page->dma_addr;
>>+ ret = cpdma_chan_submit_mapped(txch, cpsw_xdpf_to_handle(xdpf), dma,
>>+ xdpf->len,
>>+ priv->emac_port + cpsw->data.dual_emac);
>>+ if (ret) {
>>+ xdp_return_frame_rx_napi(xdpf);
>>+ goto stop;
>>+ }
>>+
>>+ /* no tx desc - stop sending us tx frames */
>>+ if (unlikely(!cpdma_check_free_tx_desc(txch)))
>>+ goto stop;
>>+
>>+ return ret;
>>+stop:
>>+ txq = netdev_get_tx_queue(priv->ndev, 0);
>>+ netif_tx_stop_queue(txq);
>>+
>>+ /* Barrier, so that stop_queue visible to other cpus */
>>+ smp_mb__after_atomic();
>>+
>>+ if (cpdma_check_free_tx_desc(txch))
>>+ netif_tx_wake_queue(txq);
>>+
>>+ return ret;
>>+}
>>+
>>+static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *xdpf)
>>+{
>>+ struct cpsw_common *cpsw = priv->cpsw;
>>+ struct cpsw_meta_xdp *xmeta;
>>+ struct netdev_queue *txq;
>>+ struct cpdma_chan *txch;
>>+ int ret;
>>+
>>+ xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
>>+ if (sizeof(*xmeta) > xdpf->headroom)
>>+ return -EINVAL;
>>+
>>+ xmeta->ndev = priv->ndev;
>>+ xmeta->ch = 0;
>>+
>>+ txch = cpsw->txv[0].ch;
>>+ ret = cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
>>+ xdpf->len,
>>+ priv->emac_port + cpsw->data.dual_emac);
>>+ if (ret) {
>>+ xdp_return_frame_rx_napi(xdpf);
>>+ goto stop;
>>+ }
>>+
>>+ /* no tx desc - stop sending us tx frames */
>>+ if (unlikely(!cpdma_check_free_tx_desc(txch)))
>>+ goto stop;
>>+
>>+ return ret;
>>+stop:
>>+ txq = netdev_get_tx_queue(priv->ndev, 0);
>>+ netif_tx_stop_queue(txq);
>>+
>>+ /* Barrier, so that stop_queue visible to other cpus */
>>+ smp_mb__after_atomic();
>>+
>>+ if (cpdma_check_free_tx_desc(txch))
>>+ netif_tx_wake_queue(txq);
>>+
>>+ return ret;
>>+}
>
>Above 2 functions are mostly identical - could you do smth. with it?
I know they should be combined, but I haven't found a better way to do it yet.

>
>>+
>>+static int cpsw_run_xdp(struct cpsw_priv *priv, struct cpsw_vector *rxv,
>>+ struct xdp_buff *xdp, struct page *page)
>>+{
>>+ struct net_device *ndev = priv->ndev;
>>+ struct xdp_frame *xdpf;
>>+ struct bpf_prog *prog;
>>+ int ret = 1;
>>+ u32 act;
>>+
>>+ rcu_read_lock();
>>+
>>+ prog = READ_ONCE(priv->xdp_prog);
>>+ if (!prog) {
>>+ ret = 0;
>>+ goto out;
>>+ }
>>+
>>+ act = bpf_prog_run_xdp(prog, xdp);
>>+ switch (act) {
>>+ case XDP_PASS:
>>+ ret = 0;
>>+ break;
>>+ case XDP_TX:
>>+ xdpf = convert_to_xdp_frame(xdp);
>>+ if (unlikely(!xdpf))
>>+ xdp_return_buff(xdp);
>>+ else
>>+ cpsw_xdp_tx_frame_mapped(priv, xdpf, page);
>>+ break;
>>+ case XDP_REDIRECT:
>>+ if (xdp_do_redirect(ndev, xdp, prog))
>>+ xdp_return_buff(xdp);
>>+ else
>>+ ret = 2;
>
>could we avoid using consts as return values?
>may be some informative defines/enum?
Ok, for all "const" cases.

--
Regards,
Ivan Khoronzhuk

2019-05-29 08:19:06

by Jesper Dangaard Brouer

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

On Thu, 23 May 2019 21:20:35 +0300
Ivan Khoronzhuk <[email protected]> wrote:

> +static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
> +{
> + struct page_pool *pool = cpsw->rx_page_pool;
> + struct page *page, *prev_page = NULL;
> + int try = pool->p.pool_size << 2;
> + int start_free = 0, ret;
> +
> + do {
> + page = page_pool_dev_alloc_pages(pool);
> + if (!page)
> + return NULL;
> +
> + /* if netstack has page_pool recycling remove the rest */
> + if (page_ref_count(page) == 1)
> + break;
> +
> + /* start free pages in use, shouldn't happen */
> + if (prev_page == page || start_free) {
> + /* dma unmap/puts page if rfcnt != 1 */
> + page_pool_recycle_direct(pool, page);
> + start_free = 1;
> + continue;
> + }
> +
> + /* if refcnt > 1, page has been holding by netstack, it's pity,
> + * so put it to the ring to be consumed later when fast cash is
> + * empty. If ring is full then free page by recycling as above.
> + */
> + ret = ptr_ring_produce(&pool->ring, page);

This looks very wrong to me! First of all, you are directly manipulating
the internal pool->ring instead of using the API, which makes this code
unmaintainable. Second, this is wrong because page_pool assumes the
invariant that pages on the ring have refcnt==1.

> + if (ret) {
> + page_pool_recycle_direct(pool, page);
> + continue;
> + }
> +
> + if (!prev_page)
> + prev_page = page;
> + } while (try--);
> +
> + return page;
> +}


--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer

2019-05-29 09:59:56

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] net: ethernet: ti: cpsw: add XDP support

On Wed, May 29, 2019 at 10:16:59AM +0200, Jesper Dangaard Brouer wrote:
>On Thu, 23 May 2019 21:20:35 +0300
>Ivan Khoronzhuk <[email protected]> wrote:
>
>> +static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
>> +{
>> + struct page_pool *pool = cpsw->rx_page_pool;
>> + struct page *page, *prev_page = NULL;
>> + int try = pool->p.pool_size << 2;
>> + int start_free = 0, ret;
>> +
>> + do {
>> + page = page_pool_dev_alloc_pages(pool);
>> + if (!page)
>> + return NULL;
>> +
>> + /* if netstack has page_pool recycling remove the rest */
>> + if (page_ref_count(page) == 1)
>> + break;
>> +
>> + /* start free pages in use, shouldn't happen */
>> + if (prev_page == page || start_free) {
>> + /* dma unmap/puts page if rfcnt != 1 */
>> + page_pool_recycle_direct(pool, page);
>> + start_free = 1;
>> + continue;
>> + }
>> +
>> + /* if refcnt > 1, page has been holding by netstack, it's pity,
>> + * so put it to the ring to be consumed later when fast cash is
>> + * empty. If ring is full then free page by recycling as above.
>> + */
>> + ret = ptr_ring_produce(&pool->ring, page);
>
>This looks very wrong to me! First of all you are manipulation
>directly with the internal pool->ring and not using the API, which
>makes this code un-maintainable.
Yes, I know it's a hack; it was added with the assumption that it would be
dropped once page_pool recycling is added.

>Second this is wrong, as page_pool
>assume the in-variance that pages on the ring have refcnt==1.
Yes, but this is without an obvious reason; it seems it can work with
refcnt > 1 if the restriction is removed and >= is used instead of ==.

As I answered in reply to Ilias' comment, I'm going to keep the version from
the RFC and drop this one.

>
>> + if (ret) {
>> + page_pool_recycle_direct(pool, page);
>> + continue;
>> + }
>> +
>> + if (!prev_page)
>> + prev_page = page;
>> + } while (try--);
>> +
>> + return page;
>> +}
>
>
>--
>Best regards,
> Jesper Dangaard Brouer
> MSc.CS, Principal Kernel Engineer at Red Hat
> LinkedIn: http://www.linkedin.com/in/brouer

--
Regards,
Ivan Khoronzhuk