This patchset is an RFC adding XDP support, based on the page_pool
allocator, to the TI cpsw driver. It was verified with af_xdp sockets
and with XDP drop. XDP redirect to another interface is still under
verification; I'm not yet sure about all the cases that should be verified.
Regular iperf2 tests were also run in order to measure the impact on
regular netstack performance, compared with the base commit from
net-next/master: 432bc230700f86801cffa5e159e05dea6229f722
It was verified with the following configs enabled:
CONFIG_JIT=y
CONFIG_BPFILTER=y
CONFIG_BPF_SYSCALL=y
CONFIG_XDP_SOCKETS=y
CONFIG_BPF_EVENTS=y
CONFIG_HAVE_EBPF_JIT=y
CONFIG_BPF_JIT=y
CONFIG_CGROUP_BPF=y
iperf2 UDP RX summary (packet size / Mbps):
+--------------------------------------------------------------+
| pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
|---------------+------+------+------+------+------+-----+-----|
| base commit | 561 | 470 | 796 | 35 | 70.1 | 281 | 140 |
|---------------+------+------+------+------+------+-----+-----|
| XDP patched | 563 | 455 | 808 | 35 | 70.2 | 282 | 141 |
+--------------------------------------------------------------+
iperf2 UDP TX summary (packet size / Mbps):
+--------------------------------------------------------------+
| pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
|---------------+------+------+------+------+------+-----+-----|
| base commit | 555 | 666 | 736 | 34.5 | 70.3 | 281 | 140 |
|---------------+------+------+------+------+------+-----+-----|
| XDP patched | 558 | 696 | 759 | 35.2 | 69.2 | 279 | 140 |
+--------------------------------------------------------------+
iperf2 TCP summary (window size / Mbps):
+------------------------------------------------------------+
| window size/rate | 16 | 32 | 64 | 128 | 8 | 256 |
|------------------+------+------+------+------+------+------|
| base commit | 753 | 887 | 931 | 932 | 676 | 932 |
|------------------+------+------+------+------+------+------|
| XDP patched | 823 | 888 | 932 | 933 | 669 | 933 |
+------------------------------------------------------------+
For af_xdp socket verification several generic changes have to be added;
they can be seen here (rough fixes; the sample-related ones should be
rebased, as the latest version of the samples seems to be more tightly
integrated with the libbpf API, and I will send them as a separate RFC):
https://github.com/ikhorn/af_xdp_stuff/tree/af_xdp_armv7
Ivan Khoronzhuk (3):
net: ethernet: ti: davinci_cpdma: add dma mapped submit
net: ethernet: ti: davinci_cpdma: return handler status
net: ethernet: ti: cpsw: add XDP support
drivers/net/ethernet/ti/Kconfig | 1 +
drivers/net/ethernet/ti/cpsw.c | 552 +++++++++++++++++++++---
drivers/net/ethernet/ti/davinci_cpdma.c | 117 +++--
drivers/net/ethernet/ti/davinci_cpdma.h | 6 +-
drivers/net/ethernet/ti/davinci_emac.c | 18 +-
5 files changed, 591 insertions(+), 103 deletions(-)
--
2.17.1
Add XDP support based on rx page_pool allocator, one frame per page.
This patch was verified with af_xdp and XDP drop. The page pool allocator
is used on the assumption that only one rx_handler is running at a
time. DMA map/unmap is reused from the page pool even though there
is no need to map the whole page.
Due to the specifics of cpsw, the same TX/RX handler can be used by 2
network devices, so special fields are added to the buffer to identify
the interface the frame is destined for.
The XDP prog is common to all channels until appropriate changes are
added to the XDP infrastructure.
Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/Kconfig | 1 +
drivers/net/ethernet/ti/cpsw.c | 535 ++++++++++++++++++++++++++++----
2 files changed, 474 insertions(+), 62 deletions(-)
diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index 8b21b40a9fe5..88b95dfed92c 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -66,6 +66,7 @@ config TI_CPSW
select TI_DAVINCI_CPDMA
select TI_DAVINCI_MDIO
select TI_CPSW_ALE
+ select PAGE_POOL
select MFD_SYSCON
select REGMAP
---help---
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 0fd1b3909333..2cd395d012f6 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -13,6 +13,7 @@
* GNU General Public License for more details.
*/
+#include <net/page_pool.h>
#include <linux/kernel.h>
#include <linux/io.h>
#include <linux/clk.h>
@@ -38,6 +39,9 @@
#include <linux/if_vlan.h>
#include <linux/kmemleak.h>
#include <linux/sys_soc.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/filter.h>
#include <linux/pinctrl/consumer.h>
#include <net/pkt_cls.h>
@@ -456,11 +460,13 @@ struct cpsw_common {
int rx_ch_num, tx_ch_num;
int speed;
int usage_count;
+ struct page_pool *rx_page_pool;
};
struct cpsw_priv {
struct net_device *ndev;
struct device *dev;
+ struct bpf_prog *xdp_prog;
u32 msg_enable;
u8 mac_addr[ETH_ALEN];
bool rx_pause;
@@ -470,6 +476,7 @@ struct cpsw_priv {
int shp_cfg_speed;
int tx_ts_enabled;
int rx_ts_enabled;
+ struct xdp_rxq_info xdp_rxq[CPSW_MAX_QUEUES];
u32 emac_port;
struct cpsw_common *cpsw;
};
@@ -481,6 +488,10 @@ struct cpsw_stats {
int stat_offset;
};
+/* The buf shall include headroom compatible with both skb and xdpf */
+#define CPSW_HEADROOM_NA (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN)
+#define CPSW_HEADROOM ALIGN(CPSW_HEADROOM_NA, sizeof(unsigned long))
+
enum {
CPSW_STATS,
CPDMA_RX_STATS,
@@ -838,24 +849,58 @@ static void cpsw_intr_disable(struct cpsw_common *cpsw)
return;
}
+static int cpsw_is_xdpf_handle(void *handle)
+{
+ return (unsigned long)handle & BIT(0);
+}
+
+static void *cpsw_xdpf_to_handle(struct xdp_frame *xdpf)
+{
+ return (void *)((unsigned long)xdpf | BIT(0));
+}
+
+static struct xdp_frame *cpsw_handle_to_xdpf(void *handle)
+{
+ return (struct xdp_frame *)((unsigned long)handle & ~BIT(0));
+}
+
+struct cpsw_meta_xdp {
+ struct net_device *ndev;
+ int ch;
+};
+
static int cpsw_tx_handler(void *token, int len, int status)
{
+ struct cpsw_meta_xdp *xmeta;
+ struct xdp_frame *xdpf;
+ struct net_device *ndev;
struct netdev_queue *txq;
- struct sk_buff *skb = token;
- struct net_device *ndev = skb->dev;
- struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+ struct sk_buff *skb;
+ int ch;
+
+ if (cpsw_is_xdpf_handle(token)) {
+ xdpf = cpsw_handle_to_xdpf(token);
+ xmeta = xdpf->data - xdpf->metasize;
+ ndev = xmeta->ndev;
+ ch = xmeta->ch;
+ xdp_return_frame_rx_napi(xdpf);
+ } else {
+ skb = token;
+ ndev = skb->dev;
+ ch = skb_get_queue_mapping(skb);
+ cpts_tx_timestamp(ndev_to_cpsw(ndev)->cpts, skb);
+ dev_kfree_skb_any(skb);
+ }
/* Check whether the queue is stopped due to stalled tx dma, if the
* queue is stopped then start the queue as we have free desc for tx
*/
- txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
+ txq = netdev_get_tx_queue(ndev, ch);
if (unlikely(netif_tx_queue_stopped(txq)))
netif_tx_wake_queue(txq);
- cpts_tx_timestamp(cpsw->cpts, skb);
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
- dev_kfree_skb_any(skb);
return 0;
}
@@ -902,22 +947,169 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
}
}
+static inline int cpsw_tx_submit_xdpf(struct cpsw_priv *priv,
+ struct xdp_frame *xdpf,
+ struct cpdma_chan *txch)
+{
+ struct cpsw_common *cpsw = priv->cpsw;
+
+ return cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
+ xdpf->len,
+ priv->emac_port + cpsw->data.dual_emac);
+}
+
+static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *frame)
+{
+ struct cpsw_common *cpsw = priv->cpsw;
+ struct cpsw_meta_xdp *xmeta;
+ struct cpdma_chan *txch;
+ int ret = 0;
+
+ frame->metasize = sizeof(struct cpsw_meta_xdp);
+ xmeta = frame->data - frame->metasize;
+ xmeta->ndev = priv->ndev;
+ xmeta->ch = 0;
+
+ txch = cpsw->txv[0].ch;
+ ret = cpsw_tx_submit_xdpf(priv, frame, txch);
+ if (ret) {
+ xdp_return_frame_rx_napi(frame);
+ ret = -1;
+ }
+
+ /* If there is no more tx desc left free then we need to
+ * tell the kernel to stop sending us tx frames.
+ */
+ if (unlikely(!cpdma_check_free_tx_desc(txch))) {
+ struct netdev_queue *txq = netdev_get_tx_queue(priv->ndev, 0);
+
+ netif_tx_stop_queue(txq);
+
+ /* Barrier, so that stop_queue visible to other cpus */
+ smp_mb__after_atomic();
+
+ if (cpdma_check_free_tx_desc(txch))
+ netif_tx_wake_queue(txq);
+ }
+
+ return ret;
+}
+
+static int cpsw_run_xdp(struct cpsw_priv *priv, struct cpsw_vector *rxv,
+ struct xdp_buff *xdp)
+{
+ struct net_device *ndev = priv->ndev;
+ struct xdp_frame *xdpf;
+ struct bpf_prog *prog;
+ int ret = 1;
+ u32 act;
+
+ rcu_read_lock();
+
+ prog = READ_ONCE(priv->xdp_prog);
+ if (!prog) {
+ ret = 0;
+ goto out;
+ }
+
+ act = bpf_prog_run_xdp(prog, xdp);
+ switch (act) {
+ case XDP_PASS:
+ ret = 0;
+ break;
+ case XDP_TX:
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ xdp_return_buff(xdp);
+ else
+ cpsw_xdp_tx_frame(priv, xdpf);
+ break;
+ case XDP_REDIRECT:
+ if (xdp_do_redirect(ndev, xdp, prog))
+ xdp_return_buff(xdp);
+ else
+ ret = 2;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(ndev, prog, act);
+ /* fall through -- handle aborts by dropping packet */
+ case XDP_DROP:
+ xdp_return_buff(xdp);
+ break;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static unsigned int cpsw_rxbuf_total_len(unsigned int len)
+{
+ len += CPSW_HEADROOM;
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ return SKB_DATA_ALIGN(len);
+}
+
+static struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw)
+{
+ struct page_pool_params pp_params = { 0 };
+
+ pp_params.order = 0;
+ pp_params.flags = PP_FLAG_DMA_MAP;
+
+ /* set it to number of descriptors to be cached from init? */
+ pp_params.pool_size = descs_pool_size;
+ pp_params.nid = NUMA_NO_NODE; /* no numa */
+ pp_params.dma_dir = DMA_FROM_DEVICE;
+ pp_params.dev = cpsw->dev;
+
+ return page_pool_create(&pp_params);
+}
+
+static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
+{
+ struct page_pool *pool = cpsw->rx_page_pool;
+ struct page *page;
+ int i = 0;
+
+ do {
+ page = page_pool_dev_alloc_pages(pool);
+ if (!page)
+ return NULL;
+
+ /* skip pages used by skb netstack */
+ if (page_ref_count(page) == 1)
+ break;
+
+ /* it's a pity, but free the page */
+ page_pool_recycle_direct(pool, page);
+ } while (++i < descs_pool_size);
+
+ return page;
+}
+
static int cpsw_rx_handler(void *token, int len, int status)
{
- struct cpdma_chan *ch;
- struct sk_buff *skb = token;
- struct sk_buff *new_skb;
- struct net_device *ndev = skb->dev;
- int ret = 0, port;
- struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+ struct page *new_page, *page = token;
+ struct cpsw_meta_xdp *new_xmeta, *xmeta = page_address(page);
+ struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
+ int pkt_size = cpsw->rx_packet_max;
+ int ret = 0, port, ch = xmeta->ch;
+ struct page_pool *pool = cpsw->rx_page_pool;
+ int headroom = CPSW_HEADROOM;
+ struct net_device *ndev = xmeta->ndev;
+ int flush = 0;
struct cpsw_priv *priv;
+ struct sk_buff *skb;
+ struct xdp_buff xdp;
+ dma_addr_t dma;
if (cpsw->data.dual_emac) {
port = CPDMA_RX_SOURCE_PORT(status);
- if (port) {
+ if (port)
ndev = cpsw->slaves[--port].ndev;
- skb->dev = ndev;
- }
}
if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
@@ -930,47 +1122,105 @@ static int cpsw_rx_handler(void *token, int len, int status)
* in reducing of the number of rx descriptor in
* DMA engine, requeue skb back to cpdma.
*/
- new_skb = skb;
+ new_page = page;
+ new_xmeta = xmeta;
goto requeue;
}
/* the interface is going down, skbs are purged */
- dev_kfree_skb_any(skb);
+ page_pool_recycle_direct(pool, page);
return 0;
}
- new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
- if (new_skb) {
- skb_copy_queue_mapping(new_skb, skb);
- skb_put(skb, len);
- if (status & CPDMA_RX_VLAN_ENCAP)
- cpsw_rx_vlan_encap(skb);
- priv = netdev_priv(ndev);
- if (priv->rx_ts_enabled)
- cpts_rx_timestamp(cpsw->cpts, skb);
- skb->protocol = eth_type_trans(skb, ndev);
- netif_receive_skb(skb);
- ndev->stats.rx_bytes += len;
- ndev->stats.rx_packets++;
- kmemleak_not_leak(new_skb);
- } else {
+ new_page = cpsw_alloc_page(cpsw);
+ if (unlikely(!new_page)) {
+ new_page = page;
+ new_xmeta = xmeta;
ndev->stats.rx_dropped++;
- new_skb = skb;
+ goto requeue;
}
+ new_xmeta = page_address(new_page);
+
+ priv = netdev_priv(ndev);
+ if (priv->xdp_prog) {
+ xdp_set_data_meta_invalid(&xdp);
+
+ if (status & CPDMA_RX_VLAN_ENCAP) {
+ xdp.data = (u8 *)xmeta + CPSW_HEADROOM +
+ CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+ xdp.data_end = xdp.data + len -
+ CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+ } else {
+ xdp.data = (u8 *)xmeta + CPSW_HEADROOM;
+ xdp.data_end = xdp.data + len;
+ }
+
+ xdp.data_hard_start = xmeta;
+ xdp.rxq = &priv->xdp_rxq[ch];
+
+ ret = cpsw_run_xdp(priv, &cpsw->rxv[ch], &xdp);
+ if (ret) {
+ if (ret == 2)
+ flush = 1;
+
+ goto requeue;
+ }
+
+ /* XDP prog might have changed packet data and boundaries */
+ len = xdp.data_end - xdp.data;
+ headroom = xdp.data - xdp.data_hard_start;
+ }
+
+ /* Build skb and pass it to networking stack if XDP off or XDP prog
+ * returned XDP_PASS
+ */
+ skb = build_skb(xmeta, cpsw_rxbuf_total_len(pkt_size));
+ if (!skb) {
+ ndev->stats.rx_dropped++;
+ page_pool_recycle_direct(pool, page);
+ goto requeue;
+ }
+
+ skb_reserve(skb, headroom);
+ skb_put(skb, len);
+ skb->dev = ndev;
+ if (status & CPDMA_RX_VLAN_ENCAP)
+ cpsw_rx_vlan_encap(skb);
+ if (priv->rx_ts_enabled)
+ cpts_rx_timestamp(cpsw->cpts, skb);
+ skb->protocol = eth_type_trans(skb, ndev);
+
+ /* as cpsw handles one packet per NAPI recycle page before increasing
+ * refcounter, holding this in page pool cache
+ */
+ page_pool_recycle_direct(pool, page);
+
+ /* the netstack drops this reference, after which the page can be
+ * handed out again by cpsw_alloc_page()
+ */
+ page_ref_inc(page);
+ netif_receive_skb(skb);
+
+ ndev->stats.rx_bytes += len;
+ ndev->stats.rx_packets++;
requeue:
if (netif_dormant(ndev)) {
- dev_kfree_skb_any(new_skb);
- return 0;
+ page_pool_recycle_direct(pool, new_page);
+ return flush;
}
- ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
- ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
- skb_tailroom(new_skb), 0);
+ new_xmeta->ndev = ndev;
+ new_xmeta->ch = ch;
+ dma = new_page->dma_addr + CPSW_HEADROOM;
+ ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, (void *)dma,
+ pkt_size, 0);
if (WARN_ON(ret < 0))
- dev_kfree_skb_any(new_skb);
+ page_pool_recycle_direct(pool, new_page);
+ else
+ kmemleak_not_leak(new_xmeta); /* Is it needed? */
- return 0;
+ return flush;
}
static void cpsw_split_res(struct net_device *ndev)
@@ -1147,7 +1397,7 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
{
u32 ch_map;
- int num_rx, cur_budget, ch;
+ int num_rx, cur_budget, ch, flush;
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
struct cpsw_vector *rxv;
@@ -1163,8 +1413,12 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
else
cur_budget = rxv->budget;
- cpdma_chan_process(rxv->ch, &cur_budget);
+ flush = cpdma_chan_process(rxv->ch, &cur_budget);
num_rx += cur_budget;
+
+ if (flush)
+ xdp_do_flush_map();
+
if (num_rx >= budget)
break;
}
@@ -1180,10 +1434,15 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
{
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
- int num_rx;
+ struct cpsw_vector *rxv;
+ int num_rx, flush;
num_rx = budget;
- cpdma_chan_process(cpsw->rxv[0].ch, &num_rx);
+ rxv = &cpsw->rxv[0];
+ flush = cpdma_chan_process(rxv->ch, &num_rx);
+ if (flush)
+ xdp_do_flush_map();
+
if (num_rx < budget) {
napi_complete_done(napi_rx, num_rx);
writel(0xff, &cpsw->wr_regs->rx_en);
@@ -1516,9 +1775,9 @@ static void cpsw_get_ethtool_stats(struct net_device *ndev,
}
}
-static inline int cpsw_tx_packet_submit(struct cpsw_priv *priv,
- struct sk_buff *skb,
- struct cpdma_chan *txch)
+static inline int cpsw_tx_submit_skb(struct cpsw_priv *priv,
+ struct sk_buff *skb,
+ struct cpdma_chan *txch)
{
struct cpsw_common *cpsw = priv->cpsw;
@@ -1706,33 +1965,39 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
static int cpsw_fill_rx_channels(struct cpsw_priv *priv)
{
struct cpsw_common *cpsw = priv->cpsw;
- struct sk_buff *skb;
+ struct cpsw_meta_xdp *xmeta;
+ struct page_pool *pool;
+ struct page *page;
int ch_buf_num;
int ch, i, ret;
+ dma_addr_t dma;
+ pool = cpsw->rx_page_pool;
for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
ch_buf_num = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
for (i = 0; i < ch_buf_num; i++) {
- skb = __netdev_alloc_skb_ip_align(priv->ndev,
- cpsw->rx_packet_max,
- GFP_KERNEL);
- if (!skb) {
- cpsw_err(priv, ifup, "cannot allocate skb\n");
+ page = cpsw_alloc_page(cpsw);
+ if (!page) {
+ cpsw_err(priv, ifup, "allocate rx page err\n");
return -ENOMEM;
}
- skb_set_queue_mapping(skb, ch);
- ret = cpdma_chan_submit(cpsw->rxv[ch].ch, skb,
- skb->data, skb_tailroom(skb),
- 0);
+ xmeta = page_address(page);
+ xmeta->ndev = priv->ndev;
+ xmeta->ch = ch;
+
+ dma = page->dma_addr + CPSW_HEADROOM;
+ ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, page,
+ (void *)dma,
+ cpsw->rx_packet_max, 0);
if (ret < 0) {
cpsw_err(priv, ifup,
"cannot submit skb to channel %d rx, error %d\n",
ch, ret);
- kfree_skb(skb);
+ page_pool_recycle_direct(pool, page);
return ret;
}
- kmemleak_not_leak(skb);
+ kmemleak_not_leak(xmeta); /* Is it needed? */
}
cpsw_info(priv, ifup, "ch %d rx, submitted %d descriptors\n",
@@ -2154,7 +2419,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
txch = cpsw->txv[q_idx].ch;
txq = netdev_get_tx_queue(ndev, q_idx);
- ret = cpsw_tx_packet_submit(priv, skb, txch);
+ ret = cpsw_tx_submit_skb(priv, skb, txch);
if (unlikely(ret != 0)) {
cpsw_err(priv, tx_err, "desc submit failed\n");
goto fail;
@@ -2684,6 +2949,63 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
}
}
+static int cpsw_xdp_prog_setup(struct net_device *ndev, struct bpf_prog *prog)
+{
+ struct cpsw_priv *priv = netdev_priv(ndev);
+ struct bpf_prog *old_prog;
+
+ if (!priv->xdp_prog && !prog)
+ return 0;
+
+ old_prog = xchg(&priv->xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+
+ return 0;
+}
+
+static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
+{
+ struct cpsw_priv *priv = netdev_priv(ndev);
+
+ switch (bpf->command) {
+ case XDP_SETUP_PROG:
+ return cpsw_xdp_prog_setup(ndev, bpf->prog);
+
+ case XDP_QUERY_PROG:
+ bpf->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
+ struct xdp_frame **frames, u32 flags)
+{
+ struct cpsw_priv *priv = netdev_priv(ndev);
+ struct xdp_frame *xdpf;
+ int i, drops = 0;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ xdpf = frames[i];
+ if (xdpf->len < CPSW_MIN_PACKET_SIZE) {
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ continue;
+ }
+
+ if (cpsw_xdp_tx_frame(priv, xdpf))
+ drops++;
+ }
+
+ return n - drops;
+}
+
static const struct net_device_ops cpsw_netdev_ops = {
.ndo_open = cpsw_ndo_open,
.ndo_stop = cpsw_ndo_stop,
@@ -2700,6 +3022,8 @@ static const struct net_device_ops cpsw_netdev_ops = {
.ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid,
.ndo_setup_tc = cpsw_ndo_setup_tc,
+ .ndo_bpf = cpsw_ndo_bpf,
+ .ndo_xdp_xmit = cpsw_ndo_xdp_xmit,
};
static int cpsw_get_regs_len(struct net_device *ndev)
@@ -2920,6 +3244,57 @@ static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
return 0;
}
+static void cpsw_xdp_rxq_unreg(struct cpsw_common *cpsw, int ch)
+{
+ struct cpsw_slave *slave;
+ struct cpsw_priv *priv;
+ int i;
+
+ for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
+ if (!slave->ndev)
+ continue;
+
+ priv = netdev_priv(slave->ndev);
+ xdp_rxq_info_unreg(&priv->xdp_rxq[ch]);
+ }
+}
+
+static int cpsw_xdp_rxq_reg(struct cpsw_common *cpsw, int ch)
+{
+ struct cpsw_slave *slave;
+ struct cpsw_priv *priv;
+ int i, ret;
+
+ /* As channels are common for both ports sharing same queues, xdp_rxq
+ * information also becomes shared and used by every packet on this
+ * channel. But each xdp_rxq holds link on netdev, which in theory
+ * can have different memory model and so, network device must hold its
+ * own set of rxq and thus both netdevs should be prepared
+ */
+ for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
+ if (!slave->ndev)
+ continue;
+
+ priv = netdev_priv(slave->ndev);
+
+ ret = xdp_rxq_info_reg(&priv->xdp_rxq[ch], priv->ndev, ch);
+ if (ret)
+ goto err;
+
+ ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[ch],
+ MEM_TYPE_PAGE_POOL,
+ cpsw->rx_page_pool);
+ if (ret)
+ goto err;
+ }
+
+ return ret;
+
+err:
+ cpsw_xdp_rxq_unreg(cpsw, ch);
+ return ret;
+}
+
static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
{
struct cpsw_common *cpsw = priv->cpsw;
@@ -2950,6 +3325,11 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
if (!vec[*ch].ch)
return -EINVAL;
+ if (rx && cpsw_xdp_rxq_reg(cpsw, *ch)) {
+ cpdma_chan_destroy(vec[*ch].ch);
+ return -EINVAL;
+ }
+
cpsw_info(priv, ifup, "created new %d %s channel\n", *ch,
(rx ? "rx" : "tx"));
(*ch)++;
@@ -2958,6 +3338,9 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
while (*ch > ch_num) {
(*ch)--;
+ if (rx)
+ cpsw_xdp_rxq_unreg(cpsw, *ch);
+
ret = cpdma_chan_destroy(vec[*ch].ch);
if (ret)
return ret;
@@ -3446,6 +3829,15 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
ndev->netdev_ops = &cpsw_netdev_ops;
ndev->ethtool_ops = &cpsw_ethtool_ops;
+ ret = xdp_rxq_info_reg(&priv_sl2->xdp_rxq[0], ndev, 0);
+ if (ret)
+ return ret;
+
+ ret = xdp_rxq_info_reg_mem_model(&priv_sl2->xdp_rxq[0],
+ MEM_TYPE_PAGE_SHARED, NULL);
+ if (ret)
+ return ret;
+
/* register the network device */
SET_NETDEV_DEV(ndev, cpsw->dev);
ret = register_netdev(ndev);
@@ -3517,6 +3909,12 @@ static int cpsw_probe(struct platform_device *pdev)
goto clean_ndev_ret;
}
+ cpsw->rx_page_pool = cpsw_create_rx_pool(cpsw);
+ if (IS_ERR(cpsw->rx_page_pool)) {
+ dev_err(&pdev->dev, "create rx page pool\n");
+ goto clean_ndev_ret;
+ }
+
/*
* This may be required here for child devices.
*/
@@ -3663,20 +4061,31 @@ static int cpsw_probe(struct platform_device *pdev)
cpsw->quirk_irq = 1;
ch = cpsw->quirk_irq ? 0 : 7;
- cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
+ cpsw->txv[0].ch =
+ cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
if (IS_ERR(cpsw->txv[0].ch)) {
dev_err(priv->dev, "error initializing tx dma channel\n");
ret = PTR_ERR(cpsw->txv[0].ch);
goto clean_dma_ret;
}
- cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
+ cpsw->rxv[0].ch =
+ cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
if (IS_ERR(cpsw->rxv[0].ch)) {
dev_err(priv->dev, "error initializing rx dma channel\n");
ret = PTR_ERR(cpsw->rxv[0].ch);
goto clean_dma_ret;
}
+ ret = xdp_rxq_info_reg(&priv->xdp_rxq[0], ndev, 0);
+ if (ret)
+ goto clean_dma_ret;
+
+ ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[0], MEM_TYPE_PAGE_POOL,
+ cpsw->rx_page_pool);
+ if (ret)
+ goto clean_dma_ret;
+
ale_params.dev = &pdev->dev;
ale_params.ale_ageout = ale_ageout;
ale_params.ale_entries = data->ale_entries;
@@ -3786,6 +4195,7 @@ static int cpsw_probe(struct platform_device *pdev)
pm_runtime_put_sync(&pdev->dev);
clean_runtime_disable_ret:
pm_runtime_disable(&pdev->dev);
+ page_pool_destroy(cpsw->rx_page_pool);
clean_ndev_ret:
free_netdev(priv->ndev);
return ret;
@@ -3809,6 +4219,7 @@ static int cpsw_remove(struct platform_device *pdev)
cpts_release(cpsw->cpts);
cpdma_ctlr_destroy(cpsw->dma);
+ page_pool_destroy(cpsw->rx_page_pool);
cpsw_remove_dt(pdev);
pm_runtime_put_sync(&pdev->dev);
pm_runtime_disable(&pdev->dev);
--
2.17.1
This change is needed to return the flush status of the rx handler, so
that redirected xdp frames can be flushed after processing channel packets.
It is done as a separate patch for simplicity.
Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/cpsw.c | 25 ++++++++++++-------
drivers/net/ethernet/ti/davinci_cpdma.c | 33 +++++++++++++++----------
drivers/net/ethernet/ti/davinci_cpdma.h | 4 +--
drivers/net/ethernet/ti/davinci_emac.c | 18 ++++++++------
4 files changed, 49 insertions(+), 31 deletions(-)
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index a591583d120e..0fd1b3909333 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -838,7 +838,7 @@ static void cpsw_intr_disable(struct cpsw_common *cpsw)
return;
}
-static void cpsw_tx_handler(void *token, int len, int status)
+static int cpsw_tx_handler(void *token, int len, int status)
{
struct netdev_queue *txq;
struct sk_buff *skb = token;
@@ -856,6 +856,7 @@ static void cpsw_tx_handler(void *token, int len, int status)
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
dev_kfree_skb_any(skb);
+ return 0;
}
static void cpsw_rx_vlan_encap(struct sk_buff *skb)
@@ -901,7 +902,7 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
}
}
-static void cpsw_rx_handler(void *token, int len, int status)
+static int cpsw_rx_handler(void *token, int len, int status)
{
struct cpdma_chan *ch;
struct sk_buff *skb = token;
@@ -935,7 +936,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
/* the interface is going down, skbs are purged */
dev_kfree_skb_any(skb);
- return;
+ return 0;
}
new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
@@ -960,7 +961,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
requeue:
if (netif_dormant(ndev)) {
dev_kfree_skb_any(new_skb);
- return;
+ return 0;
}
ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
@@ -968,6 +969,8 @@ static void cpsw_rx_handler(void *token, int len, int status)
skb_tailroom(new_skb), 0);
if (WARN_ON(ret < 0))
dev_kfree_skb_any(new_skb);
+
+ return 0;
}
static void cpsw_split_res(struct net_device *ndev)
@@ -1108,7 +1111,8 @@ static int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget)
else
cur_budget = txv->budget;
- num_tx += cpdma_chan_process(txv->ch, cur_budget);
+ cpdma_chan_process(txv->ch, &cur_budget);
+ num_tx += cur_budget;
if (num_tx >= budget)
break;
}
@@ -1126,7 +1130,8 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
struct cpsw_common *cpsw = napi_to_cpsw(napi_tx);
int num_tx;
- num_tx = cpdma_chan_process(cpsw->txv[0].ch, budget);
+ num_tx = budget;
+ cpdma_chan_process(cpsw->txv[0].ch, &num_tx);
if (num_tx < budget) {
napi_complete(napi_tx);
writel(0xff, &cpsw->wr_regs->tx_en);
@@ -1158,7 +1163,8 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
else
cur_budget = rxv->budget;
- num_rx += cpdma_chan_process(rxv->ch, cur_budget);
+ cpdma_chan_process(rxv->ch, &cur_budget);
+ num_rx += cur_budget;
if (num_rx >= budget)
break;
}
@@ -1176,7 +1182,8 @@ static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
int num_rx;
- num_rx = cpdma_chan_process(cpsw->rxv[0].ch, budget);
+ num_rx = budget;
+ cpdma_chan_process(cpsw->rxv[0].ch, &num_rx);
if (num_rx < budget) {
napi_complete_done(napi_rx, num_rx);
writel(0xff, &cpsw->wr_regs->rx_en);
@@ -2916,8 +2923,8 @@ static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
{
struct cpsw_common *cpsw = priv->cpsw;
- void (*handler)(void *, int, int);
struct netdev_queue *queue;
+ cpdma_handler_fn handler;
struct cpsw_vector *vec;
int ret, *ch, vch;
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 96ec1d9d8f47..95221721ac26 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -1161,15 +1161,16 @@ bool cpdma_check_free_tx_desc(struct cpdma_chan *chan)
}
EXPORT_SYMBOL_GPL(cpdma_check_free_tx_desc);
-static void __cpdma_chan_free(struct cpdma_chan *chan,
- struct cpdma_desc __iomem *desc,
- int outlen, int status)
+static int __cpdma_chan_free(struct cpdma_chan *chan,
+ struct cpdma_desc __iomem *desc,
+ int outlen, int status)
{
struct cpdma_ctlr *ctlr = chan->ctlr;
struct cpdma_desc_pool *pool = ctlr->pool;
dma_addr_t buff_dma;
int origlen;
uintptr_t token;
+ int ret;
token = desc_read(desc, sw_token);
origlen = desc_read(desc, sw_len);
@@ -1184,7 +1185,9 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
}
cpdma_desc_free(pool, desc, 1);
- (*chan->handler)((void *)token, outlen, status);
+ ret = (*chan->handler)((void *)token, outlen, status);
+
+ return ret;
}
static int __cpdma_chan_process(struct cpdma_chan *chan)
@@ -1196,13 +1199,14 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
struct cpdma_desc_pool *pool = ctlr->pool;
dma_addr_t desc_dma;
unsigned long flags;
+ int ret;
spin_lock_irqsave(&chan->lock, flags);
desc = chan->head;
if (!desc) {
chan->stats.empty_dequeue++;
- status = -ENOENT;
+ ret = -ENOENT;
goto unlock_ret;
}
desc_dma = desc_phys(pool, desc);
@@ -1211,7 +1215,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
outlen = status & 0x7ff;
if (status & CPDMA_DESC_OWNER) {
chan->stats.busy_dequeue++;
- status = -EBUSY;
+ ret = -EBUSY;
goto unlock_ret;
}
@@ -1237,28 +1241,31 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
else
cb_status = status;
- __cpdma_chan_free(chan, desc, outlen, cb_status);
- return status;
+ ret = __cpdma_chan_free(chan, desc, outlen, cb_status);
+ return ret;
unlock_ret:
spin_unlock_irqrestore(&chan->lock, flags);
- return status;
+ return ret;
}
-int cpdma_chan_process(struct cpdma_chan *chan, int quota)
+int cpdma_chan_process(struct cpdma_chan *chan, int *quota)
{
- int used = 0, ret = 0;
+ int used = 0, ret = 0, res = 0;
if (chan->state != CPDMA_STATE_ACTIVE)
return -EINVAL;
- while (used < quota) {
+ while (used < *quota) {
ret = __cpdma_chan_process(chan);
if (ret < 0)
break;
+ res += ret;
used++;
}
- return used;
+
+ *quota = used;
+ return res;
}
EXPORT_SYMBOL_GPL(cpdma_chan_process);
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 20e4d43df6d1..6ae86b1ed23c 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -68,7 +68,7 @@ struct cpdma_chan_stats {
struct cpdma_ctlr;
struct cpdma_chan;
-typedef void (*cpdma_handler_fn)(void *token, int len, int status);
+typedef int (*cpdma_handler_fn)(void *token, int len, int status);
struct cpdma_ctlr *cpdma_ctlr_create(struct cpdma_params *params);
int cpdma_ctlr_destroy(struct cpdma_ctlr *ctlr);
@@ -88,7 +88,7 @@ int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
-int cpdma_chan_process(struct cpdma_chan *chan, int quota);
+int cpdma_chan_process(struct cpdma_chan *chan, int *quota);
int cpdma_ctlr_int_ctrl(struct cpdma_ctlr *ctlr, bool enable);
void cpdma_ctlr_eoi(struct cpdma_ctlr *ctlr, u32 value);
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index 57450b174fc4..65211954436f 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -874,7 +874,7 @@ static struct sk_buff *emac_rx_alloc(struct emac_priv *priv)
return skb;
}
-static void emac_rx_handler(void *token, int len, int status)
+static int emac_rx_handler(void *token, int len, int status)
{
struct sk_buff *skb = token;
struct net_device *ndev = skb->dev;
@@ -885,7 +885,7 @@ static void emac_rx_handler(void *token, int len, int status)
/* free and bail if we are shutting down */
if (unlikely(!netif_running(ndev))) {
dev_kfree_skb_any(skb);
- return;
+ return 0;
}
/* recycle on receive error */
@@ -906,7 +906,7 @@ static void emac_rx_handler(void *token, int len, int status)
if (!skb) {
if (netif_msg_rx_err(priv) && net_ratelimit())
dev_err(emac_dev, "failed rx buffer alloc\n");
- return;
+ return 0;
}
recycle:
@@ -916,9 +916,10 @@ static void emac_rx_handler(void *token, int len, int status)
WARN_ON(ret == -ENOMEM);
if (unlikely(ret < 0))
dev_kfree_skb_any(skb);
+ return 0;
}
-static void emac_tx_handler(void *token, int len, int status)
+static int emac_tx_handler(void *token, int len, int status)
{
struct sk_buff *skb = token;
struct net_device *ndev = skb->dev;
@@ -931,6 +932,8 @@ static void emac_tx_handler(void *token, int len, int status)
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
dev_kfree_skb_any(skb);
+
+ return 0;
}
/**
@@ -1251,8 +1254,8 @@ static int emac_poll(struct napi_struct *napi, int budget)
mask = EMAC_DM646X_MAC_IN_VECTOR_TX_INT_VEC;
if (status & mask) {
- num_tx_pkts = cpdma_chan_process(priv->txchan,
- EMAC_DEF_TX_MAX_SERVICE);
+ num_tx_pkts = EMAC_DEF_TX_MAX_SERVICE;
+ cpdma_chan_process(priv->txchan, &num_tx_pkts);
} /* TX processing */
mask = EMAC_DM644X_MAC_IN_VECTOR_RX_INT_VEC;
@@ -1261,7 +1264,8 @@ static int emac_poll(struct napi_struct *napi, int budget)
mask = EMAC_DM646X_MAC_IN_VECTOR_RX_INT_VEC;
if (status & mask) {
- num_rx_pkts = cpdma_chan_process(priv->rxchan, budget);
+ num_rx_pkts = budget;
+ cpdma_chan_process(priv->rxchan, &num_rx_pkts);
} /* RX processing */
mask = EMAC_DM644X_MAC_IN_VECTOR_HOST_INT;
--
2.17.1
When an already DMA-mapped packet needs to be sent, as with the XDP
page pool, the "mapped" submit can be used. This patch adds a DMA-mapped
submit based on the regular one.
Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/davinci_cpdma.c | 84 ++++++++++++++++++++-----
drivers/net/ethernet/ti/davinci_cpdma.h | 2 +
2 files changed, 72 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 4236dcdd5634..96ec1d9d8f47 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -132,6 +132,15 @@ struct cpdma_chan {
u32 rate;
};
+struct submit_info {
+ struct cpdma_chan *chan;
+ int directed;
+ void *token;
+ void *data;
+ int flags;
+ int len;
+};
+
struct cpdma_control_info {
u32 reg;
u32 shift, mask;
@@ -183,6 +192,8 @@ static struct cpdma_control_info controls[] = {
(directed << CPDMA_TO_PORT_SHIFT)); \
} while (0)
+#define CPDMA_DMA_EXT_MAP BIT(16)
+
static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
{
struct cpdma_desc_pool *pool = ctlr->pool;
@@ -1026,10 +1037,11 @@ static void __cpdma_chan_submit(struct cpdma_chan *chan,
}
}
-int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
- int len, int directed)
+static int cpdma_chan_submit_si(struct submit_info *si)
{
+ struct cpdma_chan *chan = si->chan;
struct cpdma_ctlr *ctlr = chan->ctlr;
+ int len = si->len;
struct cpdma_desc __iomem *desc;
dma_addr_t buffer;
unsigned long flags;
@@ -1061,16 +1073,22 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
chan->stats.runt_transmit_buff++;
}
- buffer = dma_map_single(ctlr->dev, data, len, chan->dir);
- ret = dma_mapping_error(ctlr->dev, buffer);
- if (ret) {
- cpdma_desc_free(ctlr->pool, desc, 1);
- ret = -EINVAL;
- goto unlock_ret;
- }
-
mode = CPDMA_DESC_OWNER | CPDMA_DESC_SOP | CPDMA_DESC_EOP;
- cpdma_desc_to_port(chan, mode, directed);
+ cpdma_desc_to_port(chan, mode, si->directed);
+
+ if (si->flags & CPDMA_DMA_EXT_MAP) {
+ buffer = (dma_addr_t)si->data;
+ dma_sync_single_for_device(ctlr->dev, buffer, len, chan->dir);
+ len |= CPDMA_DMA_EXT_MAP;
+ } else {
+ buffer = dma_map_single(ctlr->dev, si->data, len, chan->dir);
+ ret = dma_mapping_error(ctlr->dev, buffer);
+ if (ret) {
+ cpdma_desc_free(ctlr->pool, desc, 1);
+ ret = -EINVAL;
+ goto unlock_ret;
+ }
+ }
/* Relaxed IO accessors can be used here as there is read barrier
* at the end of write sequence.
@@ -1079,7 +1097,7 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
writel_relaxed(buffer, &desc->hw_buffer);
writel_relaxed(len, &desc->hw_len);
writel_relaxed(mode | len, &desc->hw_mode);
- writel_relaxed((uintptr_t)token, &desc->sw_token);
+ writel_relaxed((uintptr_t)si->token, &desc->sw_token);
writel_relaxed(buffer, &desc->sw_buffer);
writel_relaxed(len, &desc->sw_len);
desc_read(desc, sw_len);
@@ -1095,8 +1113,39 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
spin_unlock_irqrestore(&chan->lock, flags);
return ret;
}
+
+int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data, int len,
+ int directed)
+{
+ struct submit_info si;
+
+ si.chan = chan;
+ si.token = token;
+ si.data = data;
+ si.len = len;
+ si.directed = directed;
+ si.flags = 0;
+
+ return cpdma_chan_submit_si(&si);
+}
EXPORT_SYMBOL_GPL(cpdma_chan_submit);
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token, void *data,
+ int len, int directed)
+{
+ struct submit_info si;
+
+ si.chan = chan;
+ si.token = token;
+ si.data = data;
+ si.len = len;
+ si.directed = directed;
+ si.flags = CPDMA_DMA_EXT_MAP;
+
+ return cpdma_chan_submit_si(&si);
+}
+EXPORT_SYMBOL_GPL(cpdma_chan_submit_mapped);
+
bool cpdma_check_free_tx_desc(struct cpdma_chan *chan)
{
struct cpdma_ctlr *ctlr = chan->ctlr;
@@ -1123,10 +1172,17 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
uintptr_t token;
token = desc_read(desc, sw_token);
- buff_dma = desc_read(desc, sw_buffer);
origlen = desc_read(desc, sw_len);
- dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+ buff_dma = desc_read(desc, sw_buffer);
+ if (origlen & CPDMA_DMA_EXT_MAP) {
+ origlen &= ~CPDMA_DMA_EXT_MAP;
+ dma_sync_single_for_cpu(ctlr->dev, buff_dma, origlen,
+ chan->dir);
+ } else {
+ dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+ }
+
cpdma_desc_free(pool, desc, 1);
(*chan->handler)((void *)token, outlen, status);
}
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index d399af5389b8..20e4d43df6d1 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -84,6 +84,8 @@ int cpdma_chan_stop(struct cpdma_chan *chan);
int cpdma_chan_get_stats(struct cpdma_chan *chan,
struct cpdma_chan_stats *stats);
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token, void *data,
+ int len, int directed);
int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
int cpdma_chan_process(struct cpdma_chan *chan, int quota);
--
2.17.1
On Wed, 17 Apr 2019 20:49:42 +0300, Ivan Khoronzhuk wrote:
> Add XDP support based on rx page_pool allocator, one frame per page.
> This patch was verified with af_xdp and xdp drop. Page pool allocator
> is used with assumption that only one rx_handler is running
> simultaneously. DMA map/unmap is reused from page pool despite there
> is no need to map whole page.
>
> Due to specific of cpsw, the same TX/RX handler can be used by 2
> network devices, so special fields in buffer are added to identify
> an interface the frame is destined to.
>
> XDP prog is common for all channels till appropriate changes are added
> in XDP infrastructure.
>
> Signed-off-by: Ivan Khoronzhuk <[email protected]>
> @@ -902,22 +947,169 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
> }
> }
>
> +static inline int cpsw_tx_submit_xdpf(struct cpsw_priv *priv,
> + struct xdp_frame *xdpf,
> + struct cpdma_chan *txch)
> +{
> + struct cpsw_common *cpsw = priv->cpsw;
> +
> + return cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
> + xdpf->len,
> + priv->emac_port + cpsw->data.dual_emac);
> +}
> +
> +static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *frame)
> +{
> + struct cpsw_common *cpsw = priv->cpsw;
> + struct cpsw_meta_xdp *xmeta;
> + struct cpdma_chan *txch;
> + int ret = 0;
> +
> + frame->metasize = sizeof(struct cpsw_meta_xdp);
> + xmeta = frame->data - frame->metasize;
> + xmeta->ndev = priv->ndev;
> + xmeta->ch = 0;
> +
> + txch = cpsw->txv[0].ch;
> + ret = cpsw_tx_submit_xdpf(priv, frame, txch);
> + if (ret) {
> + xdp_return_frame_rx_napi(frame);
> + ret = -1;
> + }
> +
> + /* If there is no more tx desc left free then we need to
> + * tell the kernel to stop sending us tx frames.
> + */
So you're using the same TX ring for XDP and stack? How does that
work? The stack's TX ring has a lock, and can be used from any CPU,
while XDP TX rings are per-CPU, no?
> + if (unlikely(!cpdma_check_free_tx_desc(txch))) {
> + struct netdev_queue *txq = netdev_get_tx_queue(priv->ndev, 0);
> +
> + netif_tx_stop_queue(txq);
> +
> + /* Barrier, so that stop_queue visible to other cpus */
> + smp_mb__after_atomic();
> +
> + if (cpdma_check_free_tx_desc(txch))
> + netif_tx_wake_queue(txq);
> + }
> +
> + return ret;
> +}
> +static struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw)
> +{
> + struct page_pool_params pp_params = { 0 };
> +
> + pp_params.order = 0;
> + pp_params.flags = PP_FLAG_DMA_MAP;
> +
> + /* set it to number of descriptors to be cached from init? */
> + pp_params.pool_size = descs_pool_size;
> + pp_params.nid = NUMA_NO_NODE; /* no numa */
> + pp_params.dma_dir = DMA_FROM_DEVICE;
DMA_FROM_DEVICE looks suspicious if you support TX, shouldn't this be
BIDIRECTIONAL?
> + pp_params.dev = cpsw->dev;
> +
> + return page_pool_create(&pp_params);
> +}
> static int cpsw_rx_handler(void *token, int len, int status)
> {
> - struct cpdma_chan *ch;
> - struct sk_buff *skb = token;
> - struct sk_buff *new_skb;
> - struct net_device *ndev = skb->dev;
> - int ret = 0, port;
> - struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
> + struct page *new_page, *page = token;
> + struct cpsw_meta_xdp *new_xmeta, *xmeta = page_address(page);
> + struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
> + int pkt_size = cpsw->rx_packet_max;
> + int ret = 0, port, ch = xmeta->ch;
> + struct page_pool *pool = cpsw->rx_page_pool;
> + int headroom = CPSW_HEADROOM;
> + struct net_device *ndev = xmeta->ndev;
> + int flush = 0;
> struct cpsw_priv *priv;
> + struct sk_buff *skb;
> + struct xdp_buff xdp;
> + dma_addr_t dma;
>
> if (cpsw->data.dual_emac) {
> port = CPDMA_RX_SOURCE_PORT(status);
> - if (port) {
> + if (port)
> ndev = cpsw->slaves[--port].ndev;
> - skb->dev = ndev;
> - }
> }
>
> if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
> @@ -930,47 +1122,105 @@ static int cpsw_rx_handler(void *token, int len, int status)
> * in reducing of the number of rx descriptor in
> * DMA engine, requeue skb back to cpdma.
> */
> - new_skb = skb;
> + new_page = page;
> + new_xmeta = xmeta;
> goto requeue;
> }
>
> /* the interface is going down, skbs are purged */
> - dev_kfree_skb_any(skb);
> + page_pool_recycle_direct(pool, page);
> return 0;
> }
>
> - new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
> - if (new_skb) {
> - skb_copy_queue_mapping(new_skb, skb);
> - skb_put(skb, len);
> - if (status & CPDMA_RX_VLAN_ENCAP)
> - cpsw_rx_vlan_encap(skb);
> - priv = netdev_priv(ndev);
> - if (priv->rx_ts_enabled)
> - cpts_rx_timestamp(cpsw->cpts, skb);
> - skb->protocol = eth_type_trans(skb, ndev);
> - netif_receive_skb(skb);
> - ndev->stats.rx_bytes += len;
> - ndev->stats.rx_packets++;
> - kmemleak_not_leak(new_skb);
> - } else {
> + new_page = cpsw_alloc_page(cpsw);
> + if (unlikely(!new_page)) {
> + new_page = page;
> + new_xmeta = xmeta;
> ndev->stats.rx_dropped++;
> - new_skb = skb;
> + goto requeue;
> }
> + new_xmeta = page_address(new_page);
> +
> + priv = netdev_priv(ndev);
> + if (priv->xdp_prog) {
> + xdp_set_data_meta_invalid(&xdp);
> +
> + if (status & CPDMA_RX_VLAN_ENCAP) {
> + xdp.data = (u8 *)xmeta + CPSW_HEADROOM +
> + CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> + xdp.data_end = xdp.data + len -
> + CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> + } else {
> + xdp.data = (u8 *)xmeta + CPSW_HEADROOM;
> + xdp.data_end = xdp.data + len;
> + }
> +
> + xdp.data_hard_start = xmeta;
> + xdp.rxq = &priv->xdp_rxq[ch];
> +
> + ret = cpsw_run_xdp(priv, &cpsw->rxv[ch], &xdp);
> + if (ret) {
> + if (ret == 2)
> + flush = 1;
> +
> + goto requeue;
> + }
> +
> + /* XDP prog might have changed packet data and boundaries */
> + len = xdp.data_end - xdp.data;
> + headroom = xdp.data - xdp.data_hard_start;
> + }
> +
> + /* Build skb and pass it to networking stack if XDP off or XDP prog
> + * returned XDP_PASS
> + */
> + skb = build_skb(xmeta, cpsw_rxbuf_total_len(pkt_size));
> + if (!skb) {
> + ndev->stats.rx_dropped++;
> + page_pool_recycle_direct(pool, page);
> + goto requeue;
> + }
> +
> + skb_reserve(skb, headroom);
> + skb_put(skb, len);
> + skb->dev = ndev;
> + if (status & CPDMA_RX_VLAN_ENCAP)
> + cpsw_rx_vlan_encap(skb);
> + if (priv->rx_ts_enabled)
> + cpts_rx_timestamp(cpsw->cpts, skb);
> + skb->protocol = eth_type_trans(skb, ndev);
> +
> + /* as cpsw handles one packet per NAPI recycle page before increasing
> + * refcounter, holding this in page pool cache
> + */
> + page_pool_recycle_direct(pool, page);
> +
> + /* it's decremented by netstack after what can be allocated
> + * in cpsw_alloc_page()
> + */
> + page_ref_inc(page);
> + netif_receive_skb(skb);
> +
> + ndev->stats.rx_bytes += len;
> + ndev->stats.rx_packets++;
>
> requeue:
> if (netif_dormant(ndev)) {
> - dev_kfree_skb_any(new_skb);
> - return 0;
> + page_pool_recycle_direct(pool, new_page);
> + return flush;
> }
>
> - ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
> - ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
> - skb_tailroom(new_skb), 0);
> + new_xmeta->ndev = ndev;
> + new_xmeta->ch = ch;
> + dma = new_page->dma_addr + CPSW_HEADROOM;
> + ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, (void *)dma,
> + pkt_size, 0);
> if (WARN_ON(ret < 0))
> - dev_kfree_skb_any(new_skb);
> + page_pool_recycle_direct(pool, new_page);
> + else
> + kmemleak_not_leak(new_xmeta); /* Is it needed? */
>
> - return 0;
> + return flush;
> }
On a quick scan I don't see DMA syncs, does the DMA driver take care
of making sure the DMA sync happens?
> static void cpsw_split_res(struct net_device *ndev)
> @@ -2684,6 +2949,63 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
> }
> }
>
> +static int cpsw_xdp_prog_setup(struct net_device *ndev, struct bpf_prog *prog)
> +{
> + struct cpsw_priv *priv = netdev_priv(ndev);
> + struct bpf_prog *old_prog;
> +
> + if (!priv->xdp_prog && !prog)
> + return 0;
> +
> + old_prog = xchg(&priv->xdp_prog, prog);
> + if (old_prog)
> + bpf_prog_put(old_prog);
> +
> + return 0;
> +}
> +
> +static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
> +{
> + struct cpsw_priv *priv = netdev_priv(ndev);
> +
> + switch (bpf->command) {
> + case XDP_SETUP_PROG:
> + return cpsw_xdp_prog_setup(ndev, bpf->prog);
> +
> + case XDP_QUERY_PROG:
> + bpf->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
Consider using xdp_attachment_query() and friends. This way you'll
also return the flags.
> + return 0;
> +
> + default:
> + return -EINVAL;
> + }
> +}
> +
> +static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
> + struct xdp_frame **frames, u32 flags)
> +{
> + struct cpsw_priv *priv = netdev_priv(ndev);
> + struct xdp_frame *xdpf;
> + int i, drops = 0;
> +
> + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> + return -EINVAL;
> +
> + for (i = 0; i < n; i++) {
> + xdpf = frames[i];
> + if (xdpf->len < CPSW_MIN_PACKET_SIZE) {
> + xdp_return_frame_rx_napi(xdpf);
> + drops++;
> + continue;
> + }
> +
> + if (cpsw_xdp_tx_frame(priv, xdpf))
> + drops++;
> + }
> +
> + return n - drops;
> +}
> +
> static const struct net_device_ops cpsw_netdev_ops = {
> .ndo_open = cpsw_ndo_open,
> .ndo_stop = cpsw_ndo_stop,
> @@ -2700,6 +3022,8 @@ static const struct net_device_ops cpsw_netdev_ops = {
> .ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid,
> .ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid,
> .ndo_setup_tc = cpsw_ndo_setup_tc,
> + .ndo_bpf = cpsw_ndo_bpf,
> + .ndo_xdp_xmit = cpsw_ndo_xdp_xmit,
> };
>
> static int cpsw_get_regs_len(struct net_device *ndev)
> @@ -2920,6 +3244,57 @@ static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
> return 0;
> }
>
> +static void cpsw_xdp_rxq_unreg(struct cpsw_common *cpsw, int ch)
> +{
> + struct cpsw_slave *slave;
> + struct cpsw_priv *priv;
> + int i;
> +
> + for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
> + if (!slave->ndev)
> + continue;
> +
> + priv = netdev_priv(slave->ndev);
> + xdp_rxq_info_unreg(&priv->xdp_rxq[ch]);
> + }
> +}
> +
> +static int cpsw_xdp_rxq_reg(struct cpsw_common *cpsw, int ch)
> +{
> + struct cpsw_slave *slave;
> + struct cpsw_priv *priv;
> + int i, ret;
> +
> + /* As channels are common for both ports sharing same queues, xdp_rxq
> + * information also becomes shared and used by every packet on this
> + * channel. But exch xdp_rxq holds link on netdev, which by the theory
> + * can have different memory model and so, network device must hold it's
> + * own set of rxq and thus both netdevs should be prepared
> + */
> + for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
> + if (!slave->ndev)
> + continue;
> +
> + priv = netdev_priv(slave->ndev);
> +
> + ret = xdp_rxq_info_reg(&priv->xdp_rxq[ch], priv->ndev, ch);
> + if (ret)
> + goto err;
> +
> + ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[ch],
> + MEM_TYPE_PAGE_POOL,
> + cpsw->rx_page_pool);
> + if (ret)
> + goto err;
> + }
> +
> + return ret;
> +
> +err:
> + cpsw_xdp_rxq_unreg(cpsw, ch);
> + return ret;
> +}
> +
> static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
> {
> struct cpsw_common *cpsw = priv->cpsw;
> @@ -2950,6 +3325,11 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
> if (!vec[*ch].ch)
> return -EINVAL;
>
> + if (rx && cpsw_xdp_rxq_reg(cpsw, *ch)) {
> + cpdma_chan_destroy(vec[*ch].ch);
> + return -EINVAL;
> + }
> +
> cpsw_info(priv, ifup, "created new %d %s channel\n", *ch,
> (rx ? "rx" : "tx"));
> (*ch)++;
> @@ -2958,6 +3338,9 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
> while (*ch > ch_num) {
> (*ch)--;
>
> + if (rx)
> + cpsw_xdp_rxq_unreg(cpsw, *ch);
> +
> ret = cpdma_chan_destroy(vec[*ch].ch);
> if (ret)
> return ret;
> @@ -3446,6 +3829,15 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
> ndev->netdev_ops = &cpsw_netdev_ops;
> ndev->ethtool_ops = &cpsw_ethtool_ops;
>
> + ret = xdp_rxq_info_reg(&priv_sl2->xdp_rxq[0], ndev, 0);
> + if (ret)
> + return ret;
> +
> + ret = xdp_rxq_info_reg_mem_model(&priv_sl2->xdp_rxq[0],
> + MEM_TYPE_PAGE_SHARED, NULL);
> + if (ret)
> + return ret;
> +
> /* register the network device */
> SET_NETDEV_DEV(ndev, cpsw->dev);
> ret = register_netdev(ndev);
> @@ -3517,6 +3909,12 @@ static int cpsw_probe(struct platform_device *pdev)
> goto clean_ndev_ret;
> }
>
> + cpsw->rx_page_pool = cpsw_create_rx_pool(cpsw);
> + if (IS_ERR(cpsw->rx_page_pool)) {
> + dev_err(&pdev->dev, "create rx page pool\n");
> + goto clean_ndev_ret;
> + }
> +
> /*
> * This may be required here for child devices.
> */
> @@ -3663,20 +4061,31 @@ static int cpsw_probe(struct platform_device *pdev)
> cpsw->quirk_irq = 1;
>
> ch = cpsw->quirk_irq ? 0 : 7;
> - cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
> + cpsw->txv[0].ch =
> + cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
> if (IS_ERR(cpsw->txv[0].ch)) {
> dev_err(priv->dev, "error initializing tx dma channel\n");
> ret = PTR_ERR(cpsw->txv[0].ch);
> goto clean_dma_ret;
> }
>
> - cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
> + cpsw->rxv[0].ch =
> + cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
> if (IS_ERR(cpsw->rxv[0].ch)) {
> dev_err(priv->dev, "error initializing rx dma channel\n");
> ret = PTR_ERR(cpsw->rxv[0].ch);
> goto clean_dma_ret;
> }
>
> + ret = xdp_rxq_info_reg(&priv->xdp_rxq[0], ndev, 0);
> + if (ret)
> + goto clean_dma_ret;
> +
> + ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[0], MEM_TYPE_PAGE_POOL,
> + cpsw->rx_page_pool);
> + if (ret)
> + goto clean_dma_ret;
> +
> ale_params.dev = &pdev->dev;
> ale_params.ale_ageout = ale_ageout;
> ale_params.ale_entries = data->ale_entries;
I think you need to unreg the mem model somewhere on the failure path,
no?
> @@ -3786,6 +4195,7 @@ static int cpsw_probe(struct platform_device *pdev)
> pm_runtime_put_sync(&pdev->dev);
> clean_runtime_disable_ret:
> pm_runtime_disable(&pdev->dev);
> + page_pool_destroy(cpsw->rx_page_pool);
> clean_ndev_ret:
> free_netdev(priv->ndev);
> return ret;
> @@ -3809,6 +4219,7 @@ static int cpsw_remove(struct platform_device *pdev)
>
> cpts_release(cpsw->cpts);
> cpdma_ctlr_destroy(cpsw->dma);
> + page_pool_destroy(cpsw->rx_page_pool);
> cpsw_remove_dt(pdev);
> pm_runtime_put_sync(&pdev->dev);
> pm_runtime_disable(&pdev->dev);
On Wed, 17 Apr 2019 at 19:51, Ivan Khoronzhuk
<[email protected]> wrote:
>
> This patchset is RFC adding XDP support for TI cpsw driver that is
> based on page_pool allocator. It was verified with af_xdp sockets
> and on xdp drop. For XDP redirect to another interface it's under
> verification, still not sure about all cases that should be verified.
> Also regular tests with iperf2 were done in order to verify impact on
> regular netstack performance, compared with base commit from
> net-next/master: 432bc230700f86801cffa5e159e05dea6229f722
>
> It was verified with following configs enabled:
> CONFIG_JIT=y
> CONFIG_BPFILTER=y
> CONFIG_BPF_SYSCALL=y
> CONFIG_XDP_SOCKETS=y
> CONFIG_BPF_EVENTS=y
> CONFIG_HAVE_EBPF_JIT=y
> CONFIG_BPF_JIT=y
> CONFIG_CGROUP_BPF=y
>
> iperf2 UDP RX summary (packet size / Mbps):
> +--------------------------------------------------------------+
> | pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
> |---------------+------+------+------+------+------+-----+-----|
> | base commit | 561 | 470 | 796 | 35 | 70.1 | 281 | 140 |
> |---------------+------+------+------+------+------+-----+-----|
> | XDP patched | 563 | 455 | 808 | 35 | 70.2 | 282 | 141 |
> +--------------------------------------------------------------+
>
> iperf2 UDP TX summary (packet size / Mbps):
> +--------------------------------------------------------------+
> | pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
> |---------------+------+------+------+------+------+-----+-----|
> | base commit | 555 | 666 | 736 | 34.5 | 70.3 | 281 | 140 |
> |---------------+------+------+------+------+------+-----+-----|
> | XDP patched | 558 | 696 | 759 | 35.2 | 69.2 | 279 | 140 |
> +--------------------------------------------------------------+
>
> iperf2 TCP summary (window size / Mbps):
> +------------------------------------------------------------+
> | window size/rate | 16 | 32 | 64 | 128 | 8 | 256 |
> |------------------+------+------+------+------+------+------|
> | base commit | 753 | 887 | 931 | 932 | 676 | 932 |
> |------------------+------+------+------+------+------+------|
> | XDP patched | 823 | 888 | 932 | 933 | 669 | 933 |
> +------------------------------------------------------------+
>
> For af_xdp socket type verification several generic changes should be added
> that can be seen here (rough fixes, for samples related seems like last version
> of samples is more integrated with libbpf api, so should be rebased,
> which I will send as RFC separately):
> https://github.com/ikhorn/af_xdp_stuff/tree/af_xdp_armv7
>
More XDP support, yay!
As for mmap/AF_XDP on 32-bit systems: instead of hacking if_xdp.h,
mmap2 should be used. Have a look at the libbpf code here [1].
Björn
[1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n81
> Ivan Khoronzhuk (3):
> net: ethernet: ti: davinci_cpdma: add dma mapped submit
> net: ethernet: ti: davinci_cpdma: return handler status
> net: ethernet: ti: cpsw: add XDP support
>
> drivers/net/ethernet/ti/Kconfig | 1 +
> drivers/net/ethernet/ti/cpsw.c | 552 +++++++++++++++++++++---
> drivers/net/ethernet/ti/davinci_cpdma.c | 117 +++--
> drivers/net/ethernet/ti/davinci_cpdma.h | 6 +-
> drivers/net/ethernet/ti/davinci_emac.c | 18 +-
> 5 files changed, 591 insertions(+), 103 deletions(-)
>
> --
> 2.17.1
>
On Wed, Apr 17, 2019 at 03:46:56PM -0700, Jakub Kicinski wrote:
>On Wed, 17 Apr 2019 20:49:42 +0300, Ivan Khoronzhuk wrote:
>> Add XDP support based on rx page_pool allocator, one frame per page.
>> This patch was verified with af_xdp and xdp drop. Page pool allocator
>> is used with assumption that only one rx_handler is running
>> simultaneously. DMA map/unmap is reused from page pool despite there
>> is no need to map whole page.
>>
>> Due to specific of cpsw, the same TX/RX handler can be used by 2
>> network devices, so special fields in buffer are added to identify
>> an interface the frame is destined to.
>>
>> XDP prog is common for all channels till appropriate changes are added
>> in XDP infrastructure.
>>
>> Signed-off-by: Ivan Khoronzhuk <[email protected]>
>
>> @@ -902,22 +947,169 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
>> }
>> }
>>
>> +static inline int cpsw_tx_submit_xdpf(struct cpsw_priv *priv,
>> + struct xdp_frame *xdpf,
>> + struct cpdma_chan *txch)
>> +{
>> + struct cpsw_common *cpsw = priv->cpsw;
>> +
>> + return cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
>> + xdpf->len,
>> + priv->emac_port + cpsw->data.dual_emac);
>> +}
>> +
>> +static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *frame)
>> +{
>> + struct cpsw_common *cpsw = priv->cpsw;
>> + struct cpsw_meta_xdp *xmeta;
>> + struct cpdma_chan *txch;
>> + int ret = 0;
>> +
>> + frame->metasize = sizeof(struct cpsw_meta_xdp);
>> + xmeta = frame->data - frame->metasize;
>> + xmeta->ndev = priv->ndev;
>> + xmeta->ch = 0;
>> +
>> + txch = cpsw->txv[0].ch;
>> + ret = cpsw_tx_submit_xdpf(priv, frame, txch);
>> + if (ret) {
>> + xdp_return_frame_rx_napi(frame);
>> + ret = -1;
>> + }
>> +
>> + /* If there is no more tx desc left free then we need to
>> + * tell the kernel to stop sending us tx frames.
>> + */
>
>So you're using the same TX ring for XDP and stack? How does that
Yes.
>work? The stack's TX ring has a lock, and can be used from any CPU,
>while XDP TX rings are per-CPU, no?
Yes and no.
am572 has more queues than CPUs, so how can I choose a tx queue not based on
the CPU number? It's always shared and has to have a lock, and cpdma is done
this way. Another thing is bothering me here: I always send to queue 0 instead
of taking the CPU number. I'm not sure about this, but I expect to have some
tx queue that is not bound to a CPU, and I didn't find a way it can be changed
dynamically in redirect.
>
>> + if (unlikely(!cpdma_check_free_tx_desc(txch))) {
>> + struct netdev_queue *txq = netdev_get_tx_queue(priv->ndev, 0);
>> +
>> + netif_tx_stop_queue(txq);
>> +
>> + /* Barrier, so that stop_queue visible to other cpus */
>> + smp_mb__after_atomic();
>> +
>> + if (cpdma_check_free_tx_desc(txch))
>> + netif_tx_wake_queue(txq);
>> + }
>> +
>> + return ret;
>> +}
>
>> +static struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw)
>> +{
>> + struct page_pool_params pp_params = { 0 };
>> +
>> + pp_params.order = 0;
>> + pp_params.flags = PP_FLAG_DMA_MAP;
>> +
>> + /* set it to number of descriptors to be cached from init? */
>> + pp_params.pool_size = descs_pool_size;
>> + pp_params.nid = NUMA_NO_NODE; /* no numa */
>> + pp_params.dma_dir = DMA_FROM_DEVICE;
>
>DMA_FROM_DEVICE looks suspicious if you support TX, shouldn't this be
>BIDIRECTIONAL?
Not sure about this. DMA_FROM_DEVICE is used for RX and fits redirect to
another interface. In case of redirect each dev uses its own DMA map; for TX,
maybe it's better to behave in a similar way? If not, then you are probably
right and I can't avoid this in the TX case. I need to test this case properly
for sure, thanks!
>
>> + pp_params.dev = cpsw->dev;
>> +
>> + return page_pool_create(&pp_params);
[...]
>> + new_xmeta->ndev = ndev;
>> + new_xmeta->ch = ch;
>> + dma = new_page->dma_addr + CPSW_HEADROOM;
>> + ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, (void *)dma,
>> + pkt_size, 0);
>> if (WARN_ON(ret < 0))
>> - dev_kfree_skb_any(new_skb);
>> + page_pool_recycle_direct(pool, new_page);
>> + else
>> + kmemleak_not_leak(new_xmeta); /* Is it needed? */
>>
>> - return 0;
>> + return flush;
>> }
>
>On a quick scan I don't see DMA syncs, does the DMA driver takes care
>of making sure the DMA sync happens?
In the previous patch to the cpdma layer:
[RFC PATCH 1/3] net: ethernet: ti: davinci_cpdma: add dma mapped submit
>
>> static void cpsw_split_res(struct net_device *ndev)
>
>> @@ -2684,6 +2949,63 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
>> }
>> }
>>
>> +static int cpsw_xdp_prog_setup(struct net_device *ndev, struct bpf_prog *prog)
>> +{
>> + struct cpsw_priv *priv = netdev_priv(ndev);
>> + struct bpf_prog *old_prog;
>> +
>> + if (!priv->xdp_prog && !prog)
>> + return 0;
>> +
>> + old_prog = xchg(&priv->xdp_prog, prog);
>> + if (old_prog)
>> + bpf_prog_put(old_prog);
>> +
>> + return 0;
>> +}
>> +
>> +static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
>> +{
>> + struct cpsw_priv *priv = netdev_priv(ndev);
>> +
>> + switch (bpf->command) {
>> + case XDP_SETUP_PROG:
>> + return cpsw_xdp_prog_setup(ndev, bpf->prog);
>> +
>> + case XDP_QUERY_PROG:
>> + bpf->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
>
>Consider using xdp_attachment_query() and friends. This way you'll
>also return the flags.
I will.
>
>> + return 0;
>> +
>> + default:
[...]
>> - cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
>> + cpsw->rxv[0].ch =
>> + cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
>> if (IS_ERR(cpsw->rxv[0].ch)) {
>> dev_err(priv->dev, "error initializing rx dma channel\n");
>> ret = PTR_ERR(cpsw->rxv[0].ch);
>> goto clean_dma_ret;
>> }
>>
>> + ret = xdp_rxq_info_reg(&priv->xdp_rxq[0], ndev, 0);
>> + if (ret)
>> + goto clean_dma_ret;
>> +
>> + ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[0], MEM_TYPE_PAGE_POOL,
>> + cpsw->rx_page_pool);
>> + if (ret)
>> + goto clean_dma_ret;
>> +
>> ale_params.dev = &pdev->dev;
>> ale_params.ale_ageout = ale_ageout;
>> ale_params.ale_entries = data->ale_entries;
>
>I think you need to unreg the mem model somewhere on the failure path,
>no?
yes, seems so. Thanks.
>
>
>> @@ -3786,6 +4195,7 @@ static int cpsw_probe(struct platform_device *pdev)
>> pm_runtime_put_sync(&pdev->dev);
>> clean_runtime_disable_ret:
>> pm_runtime_disable(&pdev->dev);
>> + page_pool_destroy(cpsw->rx_page_pool);
>> clean_ndev_ret:
>> free_netdev(priv->ndev);
>> return ret;
>> @@ -3809,6 +4219,7 @@ static int cpsw_remove(struct platform_device *pdev)
>>
>> cpts_release(cpsw->cpts);
>> cpdma_ctlr_destroy(cpsw->dma);
>> + page_pool_destroy(cpsw->rx_page_pool);
>> cpsw_remove_dt(pdev);
>> pm_runtime_put_sync(&pdev->dev);
>> pm_runtime_disable(&pdev->dev);
--
Regards,
Ivan Khoronzhuk
On Thu, Apr 18, 2019 at 08:12:06AM +0200, Björn Töpel wrote:
>On Wed, 17 Apr 2019 at 19:51, Ivan Khoronzhuk
><[email protected]> wrote:
>>
>> This patchset is RFC adding XDP support for TI cpsw driver that is
>> based on page_pool allocator. It was verified with af_xdp sockets
>> and on xdp drop. For XDP redirect to another interface it's under
>> verification, still not sure about all cases that should be verified.
>> Also regular tests with iperf2 were done in order to verify impact on
>> regular netstack performance, compared with base commit from
>> net-next/master: 432bc230700f86801cffa5e159e05dea6229f722
>>
>> It was verified with following configs enabled:
>> CONFIG_JIT=y
>> CONFIG_BPFILTER=y
>> CONFIG_BPF_SYSCALL=y
>> CONFIG_XDP_SOCKETS=y
>> CONFIG_BPF_EVENTS=y
>> CONFIG_HAVE_EBPF_JIT=y
>> CONFIG_BPF_JIT=y
>> CONFIG_CGROUP_BPF=y
>>
>> iperf2 UDP RX summary (packet size / Mbps):
>> +--------------------------------------------------------------+
>> | pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
>> |---------------+------+------+------+------+------+-----+-----|
>> | base commit | 561 | 470 | 796 | 35 | 70.1 | 281 | 140 |
>> |---------------+------+------+------+------+------+-----+-----|
>> | XDP patched | 563 | 455 | 808 | 35 | 70.2 | 282 | 141 |
>> +--------------------------------------------------------------+
>>
>> iperf2 UDP TX summary (packet size / Mbps):
>> +--------------------------------------------------------------+
>> | pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
>> |---------------+------+------+------+------+------+-----+-----|
>> | base commit | 555 | 666 | 736 | 34.5 | 70.3 | 281 | 140 |
>> |---------------+------+------+------+------+------+-----+-----|
>> | XDP patched | 558 | 696 | 759 | 35.2 | 69.2 | 279 | 140 |
>> +--------------------------------------------------------------+
>>
>> iperf2 TCP summary (window size / Mbps):
>> +------------------------------------------------------------+
>> | window size/rate | 16 | 32 | 64 | 128 | 8 | 256 |
>> |------------------+------+------+------+------+------+------|
>> | base commit | 753 | 887 | 931 | 932 | 676 | 932 |
>> |------------------+------+------+------+------+------+------|
>> | XDP patched | 823 | 888 | 932 | 933 | 669 | 933 |
>> +------------------------------------------------------------+
>>
>> For af_xdp socket type verification several generic changes should be added
>> that can be seen here (rough fixes, for samples related seems like last version
>> of samples is more integrated with libbpf api, so should be rebased,
>> which I will send as RFC separately):
>> https://github.com/ikhorn/af_xdp_stuff/tree/af_xdp_armv7
>>
>
>More XDP support, yay!
>
>As for mmap/AF_XDP on 32-bit systems; Instead of hacking the if_xdp.h,
>mmap2 should be used. Have a look at the libbpf code here [1]
I'm more bothered about this quick fix (not investigated deeply enough):
https://github.com/ikhorn/af_xdp_stuff/commit/f77cd4faf95ad744bbf37d0ba81d0ec79c1b0d29
That's about the case when page_address(umem->pgs[i]) returns nothing.
>
>
>Björn
>
>[1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n81
>
>
>> Ivan Khoronzhuk (3):
>> net: ethernet: ti: davinci_cpdma: add dma mapped submit
>> net: ethernet: ti: davinci_cpdma: return handler status
>> net: ethernet: ti: cpsw: add XDP support
>>
>> drivers/net/ethernet/ti/Kconfig | 1 +
>> drivers/net/ethernet/ti/cpsw.c | 552 +++++++++++++++++++++---
>> drivers/net/ethernet/ti/davinci_cpdma.c | 117 +++--
>> drivers/net/ethernet/ti/davinci_cpdma.h | 6 +-
>> drivers/net/ethernet/ti/davinci_emac.c | 18 +-
>> 5 files changed, 591 insertions(+), 103 deletions(-)
>>
>> --
>> 2.17.1
>>
--
Regards,
Ivan Khoronzhuk
On Thu, 18 Apr 2019 12:40:10 +0300, Ivan Khoronzhuk wrote:
> >work? The stack's TX ring has a lock, and can be used from any CPU,
> >while XDP TX rings are per-CPU, no?
> Yes and no.
> am572 has more queues than CPUs. How can I choose a tx queue not based on the CPU
> num? It's always shared and has to have a lock, and cpdma is done this way.
Oh, I see, you already have another lock.
> Here is another thing bothering me: I always send it to queue 0 instead of
> taking the CPU num. Not sure about this, but I expect to have some tx queue
> not bound to a CPU, and didn't find a way it can be changed dynamically in
> redirect.
Not sure I understand :)
On Thu, Apr 18, 2019 at 10:41:11AM -0700, Jakub Kicinski wrote:
>On Thu, 18 Apr 2019 12:40:10 +0300, Ivan Khoronzhuk wrote:
>> >work? The stack's TX ring has a lock, and can be used from any CPU,
>> >while XDP TX rings are per-CPU, no?
>> Yes and no.
>> am572 has more queues than CPUs. How can I choose a tx queue not based on the CPU
>> num? It's always shared and has to have a lock, and cpdma is done this way.
>
>Oh, I see, you already have another lock.
>
>> Here is another thing bothering me: I always send it to queue 0 instead of
>> taking the CPU num. Not sure about this, but I expect to have some tx queue
>> not bound to a CPU, and didn't find a way it can be changed dynamically in
>> redirect.
>
>Not sure I understand :)
I mean, is there a capability to choose the tx queue during XDP_REDIRECT or XDP_TX?
From what I've seen it's taken by:
qidx = smp_processor_id();
What if I need to send classified traffic to hw queue 4 for instance...
--
Regards,
Ivan Khoronzhuk
On Thu, 18 Apr 2019 21:30:44 +0300, Ivan Khoronzhuk wrote:
> >> Here is another thing bothering me: I always send it to queue 0 instead of
> >> taking the CPU num. Not sure about this, but I expect to have some tx queue
> >> not bound to a CPU, and didn't find a way it can be changed dynamically in
> >> redirect.
> >
> >Not sure I understand :)
>
> I mean, is there a capability to choose the tx queue during XDP_REDIRECT or XDP_TX?
>
> From what I've seen it's taken by:
> qidx = smp_processor_id();
>
> What if I need to send classified traffic to hw queue 4 for instance...
I see. No, we don't have any API for that yet. I think all NICs which
implement XDP REDIRECT today will allocate a single normal prio queue
per CPU.
Hi Ivan,
> +static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
> +{
> + struct page_pool *pool = cpsw->rx_page_pool;
> + struct page *page;
> + int i = 0;
> +
> + do {
> + page = page_pool_dev_alloc_pages(pool);
> + if (!page)
> + return NULL;
> +
> + /* skip pages used by skb netstack */
I think the comment here is wrong and might confuse people.
The page ref cnt is 1, which means the packet was *processed* and netstack is
done with it, hence you can re-use it.
If it's !=1 then you correctly unmap the buffer and decrease the ref cnt, so it
will eventually be freed and not returned to the pool, right?
> + if (page_ref_count(page) == 1)
> + break;
> +
> + /* it's a pitty, but free page */
> + page_pool_recycle_direct(pool, page);
> + } while (++i < descs_pool_size);
> +
> + return page;
> +}
> +
/Ilias
On Fri, Apr 19, 2019 at 11:31:56AM +0300, Ilias Apalodimas wrote:
>Hi Ivan,
>
>> +static struct page *cpsw_alloc_page(struct cpsw_common *cpsw)
>> +{
>> + struct page_pool *pool = cpsw->rx_page_pool;
>> + struct page *page;
>> + int i = 0;
>> +
>> + do {
>> + page = page_pool_dev_alloc_pages(pool);
>> + if (!page)
>> + return NULL;
>> +
>> + /* skip pages used by skb netstack */
>I think the comment here is wrong and might confuse people.
>The page ref cnt is 1, which means the packet was *processed* and netstack is
>done with it, hence you can re-use it.
>If it's !=1 then you correctly unmap the buffer and decrease the ref cnt, so it
>will eventually be freed and not returned to the pool, right?
It's a compensating substitute for page_pool support in the skb netstack,
and should be considered in combination with:
skb = build_skb(xmeta, cpsw_rxbuf_total_len(pkt_size));
...
page_pool_recycle_direct(pool, page);
page_ref_inc(page);
netif_receive_skb(skb);
Here order is important.
I will correct comments in final version (w/o overloading) ofc, leaving
thinking environment for people. I think it's fair enough about this.
>> + if (page_ref_count(page) == 1)
>> + break;
>> +
>> + /* it's a pitty, but free page */
>> + page_pool_recycle_direct(pool, page);
>> + } while (++i < descs_pool_size);
>> +
>> + return page;
>> +}
>> +
>
> /Ilias
--
Regards,
Ivan Khoronzhuk
Hi Ivan,
On 17.04.19 20:49, Ivan Khoronzhuk wrote:
> This patchset is RFC adding XDP support for TI cpsw driver that is
> based on page_pool allocator. It was verified with af_xdp sockets
> and on xdp drop. For XDP redirect to another interface it's under
> verification, still not sure about all cases that should be verified.
> Also regular tests with iperf2 were done in order to verify impact on
> regular netstack performance, compared with base commit from
> net-next/master: 432bc230700f86801cffa5e159e05dea6229f722
>
> It was verified with following configs enabled:
> CONFIG_JIT=y
> CONFIG_BPFILTER=y
> CONFIG_BPF_SYSCALL=y
> CONFIG_XDP_SOCKETS=y
> CONFIG_BPF_EVENTS=y
> CONFIG_HAVE_EBPF_JIT=y
> CONFIG_BPF_JIT=y
> CONFIG_CGROUP_BPF=y
>
> iperf2 UDP RX summary (packet size / Mbps):
> +--------------------------------------------------------------+
> | pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
> |---------------+------+------+------+------+------+-----+-----|
> | base commit | 561 | 470 | 796 | 35 | 70.1 | 281 | 140 |
> |---------------+------+------+------+------+------+-----+-----|
> | XDP patched | 563 | 455 | 808 | 35 | 70.2 | 282 | 141 |
> +--------------------------------------------------------------+
>
> iperf2 UDP TX summary (packet size / Mbps):
> +--------------------------------------------------------------+
> | pkt_size/rate | 1024 | 1500 | 1470 | 64 | 128 | 512 | 256 |
> |---------------+------+------+------+------+------+-----+-----|
> | base commit | 555 | 666 | 736 | 34.5 | 70.3 | 281 | 140 |
> |---------------+------+------+------+------+------+-----+-----|
> | XDP patched | 558 | 696 | 759 | 35.2 | 69.2 | 279 | 140 |
> +--------------------------------------------------------------+
>
> iperf2 TCP summary (window size / Mbps):
> +------------------------------------------------------------+
> | window size/rate | 16 | 32 | 64 | 128 | 8 | 256 |
> |------------------+------+------+------+------+------+------|
> | base commit | 753 | 887 | 931 | 932 | 676 | 932 |
> |------------------+------+------+------+------+------+------|
> | XDP patched | 823 | 888 | 932 | 933 | 669 | 933 |
> +------------------------------------------------------------+
>
> For af_xdp socket type verification several generic changes should be added
> that can be seen here (rough fixes, for samples related seems like last version
> of samples is more integrated with libbpf api, so should be rebased,
> which I will send as RFC separately):
> https://github.com/ikhorn/af_xdp_stuff/tree/af_xdp_armv7
>
> Ivan Khoronzhuk (3):
> net: ethernet: ti: davinci_cpdma: add dma mapped submit
> net: ethernet: ti: davinci_cpdma: return handler status
> net: ethernet: ti: cpsw: add XDP support
>
> drivers/net/ethernet/ti/Kconfig | 1 +
> drivers/net/ethernet/ti/cpsw.c | 552 +++++++++++++++++++++---
> drivers/net/ethernet/ti/davinci_cpdma.c | 117 +++--
> drivers/net/ethernet/ti/davinci_cpdma.h | 6 +-
> drivers/net/ethernet/ti/davinci_emac.c | 18 +-
> 5 files changed, 591 insertions(+), 103 deletions(-)
>
Thanks for your patches. It's great to have XDP support.
I have no objection to this series in general, but I'm not an XDP expert, so
it can be moved forward when you resolve comments from other reviewers.
--
Best regards,
grygorii