2019-06-25 19:49:33

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH v4 net-next 0/4] net: ethernet: ti: cpsw: Add XDP support

This patchset adds XDP support for TI cpsw driver and base it on
page_pool allocator. It was verified on af_xdp socket drop,
af_xdp l2f, ebpf XDP_DROP, XDP_REDIRECT, XDP_PASS, XDP_TX.

It was verified with following configs enabled:
CONFIG_JIT=y
CONFIG_BPFILTER=y
CONFIG_BPF_SYSCALL=y
CONFIG_XDP_SOCKETS=y
CONFIG_BPF_EVENTS=y
CONFIG_HAVE_EBPF_JIT=y
CONFIG_BPF_JIT=y
CONFIG_CGROUP_BPF=y

Link on previous v3:
https://lkml.org/lkml/2019/6/5/446

Also regular tests with iperf2 were done in order to verify impact on
regular netstack performance, compared with base commit:
https://pastebin.com/JSMT0iZ4

v3..v4:
- added page pool user counter
- use same pool for ndevs in dual mac
- restructured page pool create/destroy according to the last changes in API

v2..v3:
- each rxq and ndev has its own page pool

v1..v2:
- combined xdp_xmit functions
- used page allocation w/o refcnt juggle
- unmapped page for skb netstack
- moved rxq/page pool allocation to open/close pair
- added several preliminary patches:
net: page_pool: add helper function to retrieve dma addresses
net: page_pool: add helper function to unmap dma addresses
net: ethernet: ti: cpsw: use cpsw as drv data
net: ethernet: ti: cpsw_ethtool: simplify slave loops


Based on net-next/master

Ivan Khoronzhuk (4):
net: core: page_pool: add user cnt preventing pool deletion
net: ethernet: ti: davinci_cpdma: add dma mapped submit
net: ethernet: ti: davinci_cpdma: return handler status
net: ethernet: ti: cpsw: add XDP support

.../net/ethernet/mellanox/mlx5/core/en_main.c | 8 +-
drivers/net/ethernet/ti/Kconfig | 1 +
drivers/net/ethernet/ti/cpsw.c | 536 ++++++++++++++++--
drivers/net/ethernet/ti/cpsw_ethtool.c | 25 +-
drivers/net/ethernet/ti/cpsw_priv.h | 9 +-
drivers/net/ethernet/ti/davinci_cpdma.c | 123 +++-
drivers/net/ethernet/ti/davinci_cpdma.h | 8 +-
drivers/net/ethernet/ti/davinci_emac.c | 18 +-
include/net/page_pool.h | 7 +
net/core/page_pool.c | 7 +
net/core/xdp.c | 3 +
11 files changed, 640 insertions(+), 105 deletions(-)

--
2.17.1


2019-06-25 19:49:58

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH v4 net-next 2/4] net: ethernet: ti: davinci_cpdma: add dma mapped submit

In case if dma mapped packet needs to be sent, like with XDP
page pool, the "mapped" submit can be used. This patch adds dma
mapped submit based on regular one.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/davinci_cpdma.c | 89 ++++++++++++++++++++++---
drivers/net/ethernet/ti/davinci_cpdma.h | 4 ++
2 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 5cf1758d425b..8da46394c0e7 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -139,6 +139,7 @@ struct submit_info {
int directed;
void *token;
void *data;
+ int flags;
int len;
};

@@ -184,6 +185,8 @@ static struct cpdma_control_info controls[] = {
(directed << CPDMA_TO_PORT_SHIFT)); \
} while (0)

+#define CPDMA_DMA_EXT_MAP BIT(16)
+
static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
{
struct cpdma_desc_pool *pool = ctlr->pool;
@@ -1015,6 +1018,7 @@ static int cpdma_chan_submit_si(struct submit_info *si)
struct cpdma_chan *chan = si->chan;
struct cpdma_ctlr *ctlr = chan->ctlr;
int len = si->len;
+ int swlen = len;
struct cpdma_desc __iomem *desc;
dma_addr_t buffer;
u32 mode;
@@ -1036,16 +1040,22 @@ static int cpdma_chan_submit_si(struct submit_info *si)
chan->stats.runt_transmit_buff++;
}

- buffer = dma_map_single(ctlr->dev, si->data, len, chan->dir);
- ret = dma_mapping_error(ctlr->dev, buffer);
- if (ret) {
- cpdma_desc_free(ctlr->pool, desc, 1);
- return -EINVAL;
- }
-
mode = CPDMA_DESC_OWNER | CPDMA_DESC_SOP | CPDMA_DESC_EOP;
cpdma_desc_to_port(chan, mode, si->directed);

+ if (si->flags & CPDMA_DMA_EXT_MAP) {
+ buffer = (u32)si->data;
+ dma_sync_single_for_device(ctlr->dev, buffer, len, chan->dir);
+ swlen |= CPDMA_DMA_EXT_MAP;
+ } else {
+ buffer = dma_map_single(ctlr->dev, si->data, len, chan->dir);
+ ret = dma_mapping_error(ctlr->dev, buffer);
+ if (ret) {
+ cpdma_desc_free(ctlr->pool, desc, 1);
+ return -EINVAL;
+ }
+ }
+
/* Relaxed IO accessors can be used here as there is read barrier
* at the end of write sequence.
*/
@@ -1055,7 +1065,7 @@ static int cpdma_chan_submit_si(struct submit_info *si)
writel_relaxed(mode | len, &desc->hw_mode);
writel_relaxed((uintptr_t)si->token, &desc->sw_token);
writel_relaxed(buffer, &desc->sw_buffer);
- writel_relaxed(len, &desc->sw_len);
+ writel_relaxed(swlen, &desc->sw_len);
desc_read(desc, sw_len);

__cpdma_chan_submit(chan, desc);
@@ -1079,6 +1089,32 @@ int cpdma_chan_idle_submit(struct cpdma_chan *chan, void *token, void *data,
si.data = data;
si.len = len;
si.directed = directed;
+ si.flags = 0;
+
+ spin_lock_irqsave(&chan->lock, flags);
+ if (chan->state == CPDMA_STATE_TEARDOWN) {
+ spin_unlock_irqrestore(&chan->lock, flags);
+ return -EINVAL;
+ }
+
+ ret = cpdma_chan_submit_si(&si);
+ spin_unlock_irqrestore(&chan->lock, flags);
+ return ret;
+}
+
+int cpdma_chan_idle_submit_mapped(struct cpdma_chan *chan, void *token,
+ dma_addr_t data, int len, int directed)
+{
+ struct submit_info si;
+ unsigned long flags;
+ int ret;
+
+ si.chan = chan;
+ si.token = token;
+ si.data = (void *)(u32)data;
+ si.len = len;
+ si.directed = directed;
+ si.flags = CPDMA_DMA_EXT_MAP;

spin_lock_irqsave(&chan->lock, flags);
if (chan->state == CPDMA_STATE_TEARDOWN) {
@@ -1103,6 +1139,32 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
si.data = data;
si.len = len;
si.directed = directed;
+ si.flags = 0;
+
+ spin_lock_irqsave(&chan->lock, flags);
+ if (chan->state != CPDMA_STATE_ACTIVE) {
+ spin_unlock_irqrestore(&chan->lock, flags);
+ return -EINVAL;
+ }
+
+ ret = cpdma_chan_submit_si(&si);
+ spin_unlock_irqrestore(&chan->lock, flags);
+ return ret;
+}
+
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token,
+ dma_addr_t data, int len, int directed)
+{
+ struct submit_info si;
+ unsigned long flags;
+ int ret;
+
+ si.chan = chan;
+ si.token = token;
+ si.data = (void *)(u32)data;
+ si.len = len;
+ si.directed = directed;
+ si.flags = CPDMA_DMA_EXT_MAP;

spin_lock_irqsave(&chan->lock, flags);
if (chan->state != CPDMA_STATE_ACTIVE) {
@@ -1140,10 +1202,17 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
uintptr_t token;

token = desc_read(desc, sw_token);
- buff_dma = desc_read(desc, sw_buffer);
origlen = desc_read(desc, sw_len);

- dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+ buff_dma = desc_read(desc, sw_buffer);
+ if (origlen & CPDMA_DMA_EXT_MAP) {
+ origlen &= ~CPDMA_DMA_EXT_MAP;
+ dma_sync_single_for_cpu(ctlr->dev, buff_dma, origlen,
+ chan->dir);
+ } else {
+ dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+ }
+
cpdma_desc_free(pool, desc, 1);
(*chan->handler)((void *)token, outlen, status);
}
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 9343c8c73c1b..0271a20c2e09 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -77,8 +77,12 @@ int cpdma_chan_stop(struct cpdma_chan *chan);

int cpdma_chan_get_stats(struct cpdma_chan *chan,
struct cpdma_chan_stats *stats);
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token,
+ dma_addr_t data, int len, int directed);
int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
+int cpdma_chan_idle_submit_mapped(struct cpdma_chan *chan, void *token,
+ dma_addr_t data, int len, int directed);
int cpdma_chan_idle_submit(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
int cpdma_chan_process(struct cpdma_chan *chan, int quota);
--
2.17.1

2019-06-25 19:50:17

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH v4 net-next 3/4] net: ethernet: ti: davinci_cpdma: return handler status

This change is needed to return flush status of rx handler for
flushing redirected xdp frames after processing channel packets.
Do it as separate patch for simplicity.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/cpsw.c | 21 ++++++++++-----
drivers/net/ethernet/ti/cpsw_ethtool.c | 2 +-
drivers/net/ethernet/ti/cpsw_priv.h | 2 +-
drivers/net/ethernet/ti/davinci_cpdma.c | 34 +++++++++++++++----------
drivers/net/ethernet/ti/davinci_cpdma.h | 4 +--
drivers/net/ethernet/ti/davinci_emac.c | 18 ++++++++-----
6 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 32b7b3b74a6b..726925df8d97 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -337,7 +337,7 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
return;
}

-void cpsw_tx_handler(void *token, int len, int status)
+int cpsw_tx_handler(void *token, int len, int status)
{
struct netdev_queue *txq;
struct sk_buff *skb = token;
@@ -355,6 +355,7 @@ void cpsw_tx_handler(void *token, int len, int status)
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
dev_kfree_skb_any(skb);
+ return 0;
}

static void cpsw_rx_vlan_encap(struct sk_buff *skb)
@@ -400,7 +401,7 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
}
}

-static void cpsw_rx_handler(void *token, int len, int status)
+static int cpsw_rx_handler(void *token, int len, int status)
{
struct cpdma_chan *ch;
struct sk_buff *skb = token;
@@ -434,7 +435,7 @@ static void cpsw_rx_handler(void *token, int len, int status)

/* the interface is going down, skbs are purged */
dev_kfree_skb_any(skb);
- return;
+ return 0;
}

new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
@@ -464,6 +465,8 @@ static void cpsw_rx_handler(void *token, int len, int status)
WARN_ON(ret == -ENOMEM);
dev_kfree_skb_any(new_skb);
}
+
+ return 0;
}

void cpsw_split_res(struct cpsw_common *cpsw)
@@ -602,7 +605,8 @@ static int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget)
else
cur_budget = txv->budget;

- num_tx += cpdma_chan_process(txv->ch, cur_budget);
+ cpdma_chan_process(txv->ch, &cur_budget);
+ num_tx += cur_budget;
if (num_tx >= budget)
break;
}
@@ -620,7 +624,8 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
struct cpsw_common *cpsw = napi_to_cpsw(napi_tx);
int num_tx;

- num_tx = cpdma_chan_process(cpsw->txv[0].ch, budget);
+ num_tx = budget;
+ cpdma_chan_process(cpsw->txv[0].ch, &num_tx);
if (num_tx < budget) {
napi_complete(napi_tx);
writel(0xff, &cpsw->wr_regs->tx_en);
@@ -652,7 +657,8 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
else
cur_budget = rxv->budget;

- num_rx += cpdma_chan_process(rxv->ch, cur_budget);
+ cpdma_chan_process(rxv->ch, &cur_budget);
+ num_rx += cur_budget;
if (num_rx >= budget)
break;
}
@@ -670,7 +676,8 @@ static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
int num_rx;

- num_rx = cpdma_chan_process(cpsw->rxv[0].ch, budget);
+ num_rx = budget;
+ cpdma_chan_process(cpsw->rxv[0].ch, &num_rx);
if (num_rx < budget) {
napi_complete_done(napi_rx, num_rx);
writel(0xff, &cpsw->wr_regs->rx_en);
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index f60dc1dfc443..7c19eebbabcc 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -532,8 +532,8 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
cpdma_handler_fn rx_handler)
{
struct cpsw_common *cpsw = priv->cpsw;
- void (*handler)(void *, int, int);
struct netdev_queue *queue;
+ cpdma_handler_fn handler;
struct cpsw_vector *vec;
int ret, *ch, vch;

diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 04795b97ee71..2ecb3af59fe9 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -390,7 +390,7 @@ void cpsw_split_res(struct cpsw_common *cpsw);
int cpsw_fill_rx_channels(struct cpsw_priv *priv);
void cpsw_intr_enable(struct cpsw_common *cpsw);
void cpsw_intr_disable(struct cpsw_common *cpsw);
-void cpsw_tx_handler(void *token, int len, int status);
+int cpsw_tx_handler(void *token, int len, int status);

/* ethtool */
u32 cpsw_get_msglevel(struct net_device *ndev);
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 8da46394c0e7..b96f5ae974ba 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -1191,15 +1191,16 @@ bool cpdma_check_free_tx_desc(struct cpdma_chan *chan)
return free_tx_desc;
}

-static void __cpdma_chan_free(struct cpdma_chan *chan,
- struct cpdma_desc __iomem *desc,
- int outlen, int status)
+static int __cpdma_chan_free(struct cpdma_chan *chan,
+ struct cpdma_desc __iomem *desc, int outlen,
+ int status)
{
struct cpdma_ctlr *ctlr = chan->ctlr;
struct cpdma_desc_pool *pool = ctlr->pool;
dma_addr_t buff_dma;
int origlen;
uintptr_t token;
+ int ret;

token = desc_read(desc, sw_token);
origlen = desc_read(desc, sw_len);
@@ -1214,14 +1215,16 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
}

cpdma_desc_free(pool, desc, 1);
- (*chan->handler)((void *)token, outlen, status);
+ ret = (*chan->handler)((void *)token, outlen, status);
+
+ return ret;
}

static int __cpdma_chan_process(struct cpdma_chan *chan)
{
+ int status, outlen, ret;
struct cpdma_ctlr *ctlr = chan->ctlr;
struct cpdma_desc __iomem *desc;
- int status, outlen;
int cb_status = 0;
struct cpdma_desc_pool *pool = ctlr->pool;
dma_addr_t desc_dma;
@@ -1232,7 +1235,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
desc = chan->head;
if (!desc) {
chan->stats.empty_dequeue++;
- status = -ENOENT;
+ ret = -ENOENT;
goto unlock_ret;
}
desc_dma = desc_phys(pool, desc);
@@ -1241,7 +1244,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
outlen = status & 0x7ff;
if (status & CPDMA_DESC_OWNER) {
chan->stats.busy_dequeue++;
- status = -EBUSY;
+ ret = -EBUSY;
goto unlock_ret;
}

@@ -1267,28 +1270,31 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
else
cb_status = status;

- __cpdma_chan_free(chan, desc, outlen, cb_status);
- return status;
+ ret = __cpdma_chan_free(chan, desc, outlen, cb_status);
+ return ret;

unlock_ret:
spin_unlock_irqrestore(&chan->lock, flags);
- return status;
+ return ret;
}

-int cpdma_chan_process(struct cpdma_chan *chan, int quota)
+int cpdma_chan_process(struct cpdma_chan *chan, int *quota)
{
- int used = 0, ret = 0;
+ int used = 0, ret = 0, res = 0;

if (chan->state != CPDMA_STATE_ACTIVE)
return -EINVAL;

- while (used < quota) {
+ while (used < *quota) {
ret = __cpdma_chan_process(chan);
if (ret < 0)
break;
+ res |= ret;
used++;
}
- return used;
+
+ *quota = used;
+ return res;
}

int cpdma_chan_start(struct cpdma_chan *chan)
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 0271a20c2e09..69074738bef0 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -61,7 +61,7 @@ struct cpdma_chan_stats {
struct cpdma_ctlr;
struct cpdma_chan;

-typedef void (*cpdma_handler_fn)(void *token, int len, int status);
+typedef int (*cpdma_handler_fn)(void *token, int len, int status);

struct cpdma_ctlr *cpdma_ctlr_create(struct cpdma_params *params);
int cpdma_ctlr_destroy(struct cpdma_ctlr *ctlr);
@@ -85,7 +85,7 @@ int cpdma_chan_idle_submit_mapped(struct cpdma_chan *chan, void *token,
dma_addr_t data, int len, int directed);
int cpdma_chan_idle_submit(struct cpdma_chan *chan, void *token, void *data,
int len, int directed);
-int cpdma_chan_process(struct cpdma_chan *chan, int quota);
+int cpdma_chan_process(struct cpdma_chan *chan, int *quota);

int cpdma_ctlr_int_ctrl(struct cpdma_ctlr *ctlr, bool enable);
void cpdma_ctlr_eoi(struct cpdma_ctlr *ctlr, u32 value);
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index 5f4ece0d5a73..0b768a426848 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -860,7 +860,7 @@ static struct sk_buff *emac_rx_alloc(struct emac_priv *priv)
return skb;
}

-static void emac_rx_handler(void *token, int len, int status)
+static int emac_rx_handler(void *token, int len, int status)
{
struct sk_buff *skb = token;
struct net_device *ndev = skb->dev;
@@ -871,7 +871,7 @@ static void emac_rx_handler(void *token, int len, int status)
/* free and bail if we are shutting down */
if (unlikely(!netif_running(ndev))) {
dev_kfree_skb_any(skb);
- return;
+ return 0;
}

/* recycle on receive error */
@@ -892,7 +892,7 @@ static void emac_rx_handler(void *token, int len, int status)
if (!skb) {
if (netif_msg_rx_err(priv) && net_ratelimit())
dev_err(emac_dev, "failed rx buffer alloc\n");
- return;
+ return 0;
}

recycle:
@@ -902,9 +902,11 @@ static void emac_rx_handler(void *token, int len, int status)
WARN_ON(ret == -ENOMEM);
if (unlikely(ret < 0))
dev_kfree_skb_any(skb);
+
+ return 0;
}

-static void emac_tx_handler(void *token, int len, int status)
+static int emac_tx_handler(void *token, int len, int status)
{
struct sk_buff *skb = token;
struct net_device *ndev = skb->dev;
@@ -917,6 +919,7 @@ static void emac_tx_handler(void *token, int len, int status)
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
dev_kfree_skb_any(skb);
+ return 0;
}

/**
@@ -1237,8 +1240,8 @@ static int emac_poll(struct napi_struct *napi, int budget)
mask = EMAC_DM646X_MAC_IN_VECTOR_TX_INT_VEC;

if (status & mask) {
- num_tx_pkts = cpdma_chan_process(priv->txchan,
- EMAC_DEF_TX_MAX_SERVICE);
+ num_tx_pkts = EMAC_DEF_TX_MAX_SERVICE;
+ cpdma_chan_process(priv->txchan, &num_tx_pkts);
} /* TX processing */

mask = EMAC_DM644X_MAC_IN_VECTOR_RX_INT_VEC;
@@ -1247,7 +1250,8 @@ static int emac_poll(struct napi_struct *napi, int budget)
mask = EMAC_DM646X_MAC_IN_VECTOR_RX_INT_VEC;

if (status & mask) {
- num_rx_pkts = cpdma_chan_process(priv->rxchan, budget);
+ num_rx_pkts = budget;
+ cpdma_chan_process(priv->rxchan, &num_rx_pkts);
} /* RX processing */

mask = EMAC_DM644X_MAC_IN_VECTOR_HOST_INT;
--
2.17.1

2019-06-25 19:51:41

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

Add user counter allowing to delete pool only when no users.
It doesn't prevent pool from flush, only prevents freeing the
pool instance. Helps when no need to delete the pool and now
it's user responsibility to free it by calling page_pool_free()
while destroying procedure. It also makes to use page_pool_free()
explicitly, not fully hidden in xdp unreg, which looks more
correct after page pool "create" routine.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 +++++---
include/net/page_pool.h | 7 +++++++
net/core/page_pool.c | 7 +++++++
net/core/xdp.c | 3 +++
4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5e40db8f92e6..cb028de64a1d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -545,10 +545,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
}
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_PAGE_POOL, rq->page_pool);
- if (err) {
- page_pool_free(rq->page_pool);
+ if (err)
goto err_free;
- }

for (i = 0; i < wq_sz; i++) {
if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
@@ -613,6 +611,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
if (rq->xdp_prog)
bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq);
+ if (rq->page_pool)
+ page_pool_free(rq->page_pool);
mlx5_wq_destroy(&rq->wq_ctrl);

return err;
@@ -643,6 +643,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
}

xdp_rxq_info_unreg(&rq->xdp_rxq);
+ if (rq->page_pool)
+ page_pool_free(rq->page_pool);
mlx5_wq_destroy(&rq->wq_ctrl);
}

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index f07c518ef8a5..1ec838e9927e 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -101,6 +101,7 @@ struct page_pool {
struct ptr_ring ring;

atomic_t pages_state_release_cnt;
+ atomic_t user_cnt;
};

struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -183,6 +184,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
return page->dma_addr;
}

+/* used to prevent pool from deallocation */
+static inline void page_pool_get(struct page_pool *pool)
+{
+ atomic_inc(&pool->user_cnt);
+}
+
static inline bool is_page_pool_compiled_in(void)
{
#ifdef CONFIG_PAGE_POOL
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index b366f59885c1..169b0e3c870e 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -48,6 +48,7 @@ static int page_pool_init(struct page_pool *pool,
return -ENOMEM;

atomic_set(&pool->pages_state_release_cnt, 0);
+ atomic_set(&pool->user_cnt, 0);

if (pool->p.flags & PP_FLAG_DMA_MAP)
get_device(pool->p.dev);
@@ -70,6 +71,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
kfree(pool);
return ERR_PTR(err);
}
+
+ page_pool_get(pool);
return pool;
}
EXPORT_SYMBOL(page_pool_create);
@@ -356,6 +359,10 @@ static void __warn_in_flight(struct page_pool *pool)

void __page_pool_free(struct page_pool *pool)
{
+ /* free only if no users */
+ if (!atomic_dec_and_test(&pool->user_cnt))
+ return;
+
WARN(pool->alloc.count, "API usage violation");
WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");

diff --git a/net/core/xdp.c b/net/core/xdp.c
index 829377cc83db..04bdcd784d2e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -372,6 +372,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,

mutex_unlock(&mem_id_lock);

+ if (type == MEM_TYPE_PAGE_POOL)
+ page_pool_get(xdp_alloc->page_pool);
+
trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
err:
--
2.17.1

2019-06-25 19:51:41

by Ivan Khoronzhuk

[permalink] [raw]
Subject: [PATCH v4 net-next 4/4] net: ethernet: ti: cpsw: add XDP support

Add XDP support based on rx page_pool allocator, one frame per page.
Page pool allocator is used with assumption that only one rx_handler
is running simultaneously. DMA map/unmap is reused from page pool
despite there is no need to map whole page.

Due to specific of cpsw, the same TX/RX handler can be used by 2
network devices, so special fields in buffer are added to identify
an interface the frame is destined to. Thus XDP works for both
interfaces, that allows to test xdp redirect between two interfaces
easily. Aslo, each rx queue have own page pools, but common for both
netdevs.

XDP prog is common for all channels till appropriate changes are added
in XDP infrastructure. Also, once page_pool recycling becomes part of
skb netstack some simplifications can be added, like removing marked
with comments.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
drivers/net/ethernet/ti/Kconfig | 1 +
drivers/net/ethernet/ti/cpsw.c | 521 ++++++++++++++++++++++---
drivers/net/ethernet/ti/cpsw_ethtool.c | 23 +-
drivers/net/ethernet/ti/cpsw_priv.h | 7 +
4 files changed, 489 insertions(+), 63 deletions(-)

diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index a800d3417411..834afca3a019 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -50,6 +50,7 @@ config TI_CPSW
depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
select TI_DAVINCI_MDIO
select MFD_SYSCON
+ select PAGE_POOL
select REGMAP
---help---
This driver supports TI's CPSW Ethernet Switch.
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 726925df8d97..b3c526b3a1c7 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -31,6 +31,10 @@
#include <linux/if_vlan.h>
#include <linux/kmemleak.h>
#include <linux/sys_soc.h>
+#include <net/page_pool.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/filter.h>

#include <linux/pinctrl/consumer.h>
#include <net/pkt_cls.h>
@@ -60,6 +64,10 @@ static int descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
module_param(descs_pool_size, int, 0444);
MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");

+/* The buf includes headroom compatible with both skb and xdpf */
+#define CPSW_HEADROOM_NA (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN)
+#define CPSW_HEADROOM ALIGN(CPSW_HEADROOM_NA, sizeof(long))
+
#define for_each_slave(priv, func, arg...) \
do { \
struct cpsw_slave *slave; \
@@ -74,6 +82,13 @@ MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
(func)(slave++, ##arg); \
} while (0)

+#define CPSW_XMETA_OFFSET ALIGN(sizeof(struct xdp_frame), sizeof(long))
+
+#define CPSW_XDP_CONSUMED 1
+#define CPSW_XDP_CONSUMED_FLUSH 2
+#define CPSW_XDP_PASS 0
+#define CPSW_FLUSH_XDP_MAP 1
+
static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
__be16 proto, u16 vid);

@@ -337,24 +352,58 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
return;
}

+static int cpsw_is_xdpf_handle(void *handle)
+{
+ return (unsigned long)handle & BIT(0);
+}
+
+static void *cpsw_xdpf_to_handle(struct xdp_frame *xdpf)
+{
+ return (void *)((unsigned long)xdpf | BIT(0));
+}
+
+static struct xdp_frame *cpsw_handle_to_xdpf(void *handle)
+{
+ return (struct xdp_frame *)((unsigned long)handle & ~BIT(0));
+}
+
+struct __aligned(sizeof(long)) cpsw_meta_xdp {
+ struct net_device *ndev;
+ int ch;
+};
+
int cpsw_tx_handler(void *token, int len, int status)
{
+ struct cpsw_meta_xdp *xmeta;
+ struct xdp_frame *xdpf;
+ struct net_device *ndev;
struct netdev_queue *txq;
- struct sk_buff *skb = token;
- struct net_device *ndev = skb->dev;
- struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+ struct sk_buff *skb;
+ int ch;
+
+ if (cpsw_is_xdpf_handle(token)) {
+ xdpf = cpsw_handle_to_xdpf(token);
+ xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
+ ndev = xmeta->ndev;
+ ch = xmeta->ch;
+ xdp_return_frame(xdpf);
+ } else {
+ skb = token;
+ ndev = skb->dev;
+ ch = skb_get_queue_mapping(skb);
+ cpts_tx_timestamp(ndev_to_cpsw(ndev)->cpts, skb);
+ dev_kfree_skb_any(skb);
+ }

/* Check whether the queue is stopped due to stalled tx dma, if the
* queue is stopped then start the queue as we have free desc for tx
*/
- txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
+ txq = netdev_get_tx_queue(ndev, ch);
if (unlikely(netif_tx_queue_stopped(txq)))
netif_tx_wake_queue(txq);

- cpts_tx_timestamp(cpsw->cpts, skb);
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += len;
- dev_kfree_skb_any(skb);
return 0;
}

@@ -401,24 +450,249 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
}
}

+static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *xdpf,
+ struct page *page)
+{
+ struct cpsw_common *cpsw = priv->cpsw;
+ struct cpsw_meta_xdp *xmeta;
+ struct cpdma_chan *txch;
+ dma_addr_t dma;
+ int ret, port;
+
+ xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
+ xmeta->ndev = priv->ndev;
+ xmeta->ch = 0;
+ txch = cpsw->txv[0].ch;
+
+ port = priv->emac_port + cpsw->data.dual_emac;
+ if (page) {
+ dma = page_pool_get_dma_addr(page);
+ dma += xdpf->data - (void *)xdpf;
+ ret = cpdma_chan_submit_mapped(txch, cpsw_xdpf_to_handle(xdpf),
+ dma, xdpf->len, port);
+ } else {
+ if (sizeof(*xmeta) > xdpf->headroom) {
+ xdp_return_frame_rx_napi(xdpf);
+ return -EINVAL;
+ }
+
+ ret = cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf),
+ xdpf->data, xdpf->len, port);
+ }
+
+ if (ret)
+ xdp_return_frame_rx_napi(xdpf);
+
+ return ret;
+}
+
+static int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp,
+ struct page *page)
+{
+ struct cpsw_common *cpsw = priv->cpsw;
+ struct net_device *ndev = priv->ndev;
+ int ret = CPSW_XDP_CONSUMED;
+ struct xdp_frame *xdpf;
+ struct bpf_prog *prog;
+ u32 act;
+
+ rcu_read_lock();
+
+ prog = READ_ONCE(priv->xdp_prog);
+ if (!prog) {
+ ret = CPSW_XDP_PASS;
+ goto out;
+ }
+
+ act = bpf_prog_run_xdp(prog, xdp);
+ switch (act) {
+ case XDP_PASS:
+ ret = CPSW_XDP_PASS;
+ break;
+ case XDP_TX:
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ goto drop;
+
+ cpsw_xdp_tx_frame(priv, xdpf, page);
+ break;
+ case XDP_REDIRECT:
+ if (xdp_do_redirect(ndev, xdp, prog))
+ goto drop;
+
+ ret = CPSW_XDP_CONSUMED_FLUSH;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(ndev, prog, act);
+ /* fall through -- handle aborts by dropping packet */
+ case XDP_DROP:
+ goto drop;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+drop:
+ rcu_read_unlock();
+ page_pool_recycle_direct(cpsw->page_pool[ch], page);
+ return ret;
+}
+
+static unsigned int cpsw_rxbuf_total_len(unsigned int len)
+{
+ len += CPSW_HEADROOM;
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ return SKB_DATA_ALIGN(len);
+}
+
+static struct page_pool *cpsw_create_page_pool(struct cpsw_common *cpsw,
+ int size)
+{
+ struct page_pool_params pp_params;
+ struct page_pool *pool;
+
+ pp_params.order = 0;
+ pp_params.flags = PP_FLAG_DMA_MAP;
+ pp_params.pool_size = size;
+ pp_params.nid = NUMA_NO_NODE;
+ pp_params.dma_dir = DMA_BIDIRECTIONAL;
+ pp_params.dev = cpsw->dev;
+
+ pool = page_pool_create(&pp_params);
+ if (IS_ERR(pool))
+ dev_err(cpsw->dev, "cannot create rx page pool\n");
+
+ return pool;
+}
+
+static int cpsw_ndev_create_xdp_rxq(struct cpsw_priv *priv, int ch)
+{
+ struct cpsw_common *cpsw = priv->cpsw;
+ struct xdp_rxq_info *rxq;
+ struct page_pool *pool;
+ int ret;
+
+ pool = cpsw->page_pool[ch];
+ rxq = &priv->xdp_rxq[ch];
+
+ ret = xdp_rxq_info_reg(rxq, priv->ndev, ch);
+ if (ret)
+ return ret;
+
+ ret = xdp_rxq_info_reg_mem_model(rxq, MEM_TYPE_PAGE_POOL, pool);
+ if (ret)
+ xdp_rxq_info_unreg(rxq);
+
+ return ret;
+}
+
+static void cpsw_ndev_destroy_xdp_rxq(struct cpsw_priv *priv, int ch)
+{
+ struct xdp_rxq_info *rxq = &priv->xdp_rxq[ch];
+
+ if (!xdp_rxq_info_is_reg(rxq))
+ return;
+
+ xdp_rxq_info_unreg(rxq);
+}
+
+static int cpsw_create_rx_pool(struct cpsw_common *cpsw, int ch)
+{
+ struct page_pool *pool;
+ int ret = 0, pool_size;
+
+ pool_size = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
+ pool = cpsw_create_page_pool(cpsw, pool_size);
+ if (IS_ERR(pool))
+ ret = PTR_ERR(pool);
+ else
+ cpsw->page_pool[ch] = pool;
+
+ return ret;
+}
+
+void cpsw_destroy_rx_pools(struct cpsw_common *cpsw)
+{
+ struct net_device *ndev;
+ int i, ch;
+
+ for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
+ if (!cpsw->page_pool[ch])
+ continue;
+
+ for (i = 0; i < cpsw->data.slaves; i++) {
+ ndev = cpsw->slaves[i].ndev;
+ if (!ndev)
+ continue;
+
+ cpsw_ndev_destroy_xdp_rxq(netdev_priv(ndev), ch);
+ }
+
+ page_pool_free(cpsw->page_pool[ch]);
+ cpsw->page_pool[ch] = NULL;
+ }
+}
+
+int cpsw_create_rx_pools(struct cpsw_common *cpsw)
+{
+ struct net_device *ndev;
+ int i, ch, ret;
+
+ for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
+ ret = cpsw_create_rx_pool(cpsw, ch);
+ if (ret)
+ goto err_cleanup;
+
+ /* using same page pool is allowed as no running rx handlers
+ * simultaneously for both ndevs
+ */
+ for (i = 0; i < cpsw->data.slaves; i++) {
+ ndev = cpsw->slaves[i].ndev;
+ if (!ndev)
+ continue;
+
+ ret = cpsw_ndev_create_xdp_rxq(netdev_priv(ndev), ch);
+ if (ret)
+ goto err_cleanup;
+ }
+ }
+
+ return 0;
+
+err_cleanup:
+ cpsw_destroy_rx_pools(cpsw);
+
+ return ret;
+}
+
static int cpsw_rx_handler(void *token, int len, int status)
{
- struct cpdma_chan *ch;
- struct sk_buff *skb = token;
- struct sk_buff *new_skb;
- struct net_device *ndev = skb->dev;
- int ret = 0, port;
- struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+ struct page *new_page, *page = token;
+ void *pa = page_address(page);
+ struct cpsw_meta_xdp *xmeta = pa + CPSW_XMETA_OFFSET;
+ struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
+ int pkt_size = cpsw->rx_packet_max;
+ int ret = 0, port, ch = xmeta->ch;
+ int headroom = CPSW_HEADROOM;
+ struct net_device *ndev = xmeta->ndev;
+ int res = 0;
struct cpsw_priv *priv;
+ struct page_pool *pool;
+ struct sk_buff *skb;
+ struct xdp_buff xdp;
+ dma_addr_t dma;

- if (cpsw->data.dual_emac) {
+ if (cpsw->data.dual_emac && status >= 0) {
port = CPDMA_RX_SOURCE_PORT(status);
- if (port) {
+ if (port)
ndev = cpsw->slaves[--port].ndev;
- skb->dev = ndev;
- }
}

+ priv = netdev_priv(ndev);
+ pool = cpsw->page_pool[ch];
if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
/* In dual emac mode check for all interfaces */
if (cpsw->data.dual_emac && cpsw->usage_count &&
@@ -427,46 +701,94 @@ static int cpsw_rx_handler(void *token, int len, int status)
* is already down and the other interface is up
* and running, instead of freeing which results
* in reducing of the number of rx descriptor in
- * DMA engine, requeue skb back to cpdma.
+ * DMA engine, requeue page back to cpdma.
*/
- new_skb = skb;
+ new_page = page;
goto requeue;
}

- /* the interface is going down, skbs are purged */
- dev_kfree_skb_any(skb);
+ /* the interface is going down, pages are purged */
+ page_pool_recycle_direct(pool, page);
return 0;
}

- new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
- if (new_skb) {
- skb_copy_queue_mapping(new_skb, skb);
- skb_put(skb, len);
- if (status & CPDMA_RX_VLAN_ENCAP)
- cpsw_rx_vlan_encap(skb);
- priv = netdev_priv(ndev);
- if (priv->rx_ts_enabled)
- cpts_rx_timestamp(cpsw->cpts, skb);
- skb->protocol = eth_type_trans(skb, ndev);
- netif_receive_skb(skb);
- ndev->stats.rx_bytes += len;
- ndev->stats.rx_packets++;
- kmemleak_not_leak(new_skb);
- } else {
+ new_page = page_pool_dev_alloc_pages(pool);
+ if (unlikely(!new_page)) {
+ new_page = page;
+ ndev->stats.rx_dropped++;
+ goto requeue;
+ }
+
+ if (priv->xdp_prog) {
+ if (status & CPDMA_RX_VLAN_ENCAP) {
+ xdp.data = pa + CPSW_HEADROOM +
+ CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+ xdp.data_end = xdp.data + len -
+ CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+ } else {
+ xdp.data = pa + CPSW_HEADROOM;
+ xdp.data_end = xdp.data + len;
+ }
+
+ xdp_set_data_meta_invalid(&xdp);
+
+ xdp.data_hard_start = pa;
+ xdp.rxq = &priv->xdp_rxq[ch];
+
+ ret = cpsw_run_xdp(priv, ch, &xdp, page);
+ if (ret != CPSW_XDP_PASS) {
+ if (ret == CPSW_XDP_CONSUMED_FLUSH)
+ res = CPSW_FLUSH_XDP_MAP;
+
+ goto requeue;
+ }
+
+ /* XDP prog might have changed packet data and boundaries */
+ len = xdp.data_end - xdp.data;
+ headroom = xdp.data - xdp.data_hard_start;
+
+ /* XDP prog can modify vlan tag, so can't use encap header */
+ status &= ~CPDMA_RX_VLAN_ENCAP;
+ }
+
+ /* pass skb to netstack if no XDP prog or returned XDP_PASS */
+ skb = build_skb(pa, cpsw_rxbuf_total_len(pkt_size));
+ if (!skb) {
ndev->stats.rx_dropped++;
- new_skb = skb;
+ page_pool_recycle_direct(pool, page);
+ goto requeue;
}

+ skb_reserve(skb, headroom);
+ skb_put(skb, len);
+ skb->dev = ndev;
+ if (status & CPDMA_RX_VLAN_ENCAP)
+ cpsw_rx_vlan_encap(skb);
+ if (priv->rx_ts_enabled)
+ cpts_rx_timestamp(cpsw->cpts, skb);
+ skb->protocol = eth_type_trans(skb, ndev);
+
+ /* unmap page as no netstack skb page recycling */
+ page_pool_release_page(pool, page);
+ netif_receive_skb(skb);
+
+ ndev->stats.rx_bytes += len;
+ ndev->stats.rx_packets++;
+
requeue:
- ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
- ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
- skb_tailroom(new_skb), 0);
+ xmeta = page_address(new_page) + CPSW_XMETA_OFFSET;
+ xmeta->ndev = ndev;
+ xmeta->ch = ch;
+
+ dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM;
+ ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
+ pkt_size, 0);
if (ret < 0) {
WARN_ON(ret == -ENOMEM);
- dev_kfree_skb_any(new_skb);
+ page_pool_recycle_direct(pool, new_page);
}

- return 0;
+ return res;
}

void cpsw_split_res(struct cpsw_common *cpsw)
@@ -641,8 +963,8 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
{
u32 ch_map;
- int num_rx, cur_budget, ch;
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
+ int num_rx, cur_budget, ch, res;
struct cpsw_vector *rxv;

/* process every unprocessed channel */
@@ -657,8 +979,12 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
else
cur_budget = rxv->budget;

- cpdma_chan_process(rxv->ch, &cur_budget);
+ res = cpdma_chan_process(rxv->ch, &cur_budget);
num_rx += cur_budget;
+
+ if (res & CPSW_FLUSH_XDP_MAP)
+ xdp_do_flush_map();
+
if (num_rx >= budget)
break;
}
@@ -674,10 +1000,15 @@ static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
{
struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
- int num_rx;
+ struct cpsw_vector *rxv;
+ int num_rx, res;

num_rx = budget;
- cpdma_chan_process(cpsw->rxv[0].ch, &num_rx);
+ rxv = &cpsw->rxv[0];
+ res = cpdma_chan_process(rxv->ch, &num_rx);
+ if (res & CPSW_FLUSH_XDP_MAP)
+ xdp_do_flush_map();
+
if (num_rx < budget) {
napi_complete_done(napi_rx, num_rx);
writel(0xff, &cpsw->wr_regs->rx_en);
@@ -1039,33 +1370,39 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
int cpsw_fill_rx_channels(struct cpsw_priv *priv)
{
struct cpsw_common *cpsw = priv->cpsw;
- struct sk_buff *skb;
+ struct cpsw_meta_xdp *xmeta;
+ struct page_pool *pool;
+ struct page *page;
int ch_buf_num;
int ch, i, ret;
+ dma_addr_t dma;

for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
+ pool = cpsw->page_pool[ch];
ch_buf_num = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
for (i = 0; i < ch_buf_num; i++) {
- skb = __netdev_alloc_skb_ip_align(priv->ndev,
- cpsw->rx_packet_max,
- GFP_KERNEL);
- if (!skb) {
- cpsw_err(priv, ifup, "cannot allocate skb\n");
+ page = page_pool_dev_alloc_pages(pool);
+ if (!page) {
+ cpsw_err(priv, ifup, "allocate rx page err\n");
return -ENOMEM;
}

- skb_set_queue_mapping(skb, ch);
- ret = cpdma_chan_idle_submit(cpsw->rxv[ch].ch, skb,
- skb->data,
- skb_tailroom(skb), 0);
+ xmeta = page_address(page) + CPSW_XMETA_OFFSET;
+ xmeta->ndev = priv->ndev;
+ xmeta->ch = ch;
+
+ dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM;
+ ret = cpdma_chan_idle_submit_mapped(cpsw->rxv[ch].ch,
+ page, dma,
+ cpsw->rx_packet_max,
+ 0);
if (ret < 0) {
cpsw_err(priv, ifup,
- "cannot submit skb to channel %d rx, error %d\n",
+ "cannot submit page to channel %d rx, error %d\n",
ch, ret);
- kfree_skb(skb);
+ page_pool_recycle_direct(pool, page);
return ret;
}
- kmemleak_not_leak(skb);
}

cpsw_info(priv, ifup, "ch %d rx, submitted %d descriptors\n",
@@ -1401,6 +1738,10 @@ static int cpsw_ndo_open(struct net_device *ndev)
enable_irq(cpsw->irqs_table[0]);
}

+ ret = cpsw_create_rx_pools(cpsw);
+ if (ret < 0)
+ goto err_cleanup;
+
ret = cpsw_fill_rx_channels(priv);
if (ret < 0)
goto err_cleanup;
@@ -1429,9 +1770,10 @@ static int cpsw_ndo_open(struct net_device *ndev)
err_cleanup:
if (!cpsw->usage_count) {
cpdma_ctlr_stop(cpsw->dma);
- for_each_slave(priv, cpsw_slave_stop, cpsw);
+ cpsw_destroy_rx_pools(cpsw);
}

+ for_each_slave(priv, cpsw_slave_stop, cpsw);
pm_runtime_put_sync(cpsw->dev);
netif_carrier_off(priv->ndev);
return ret;
@@ -1454,6 +1796,7 @@ static int cpsw_ndo_stop(struct net_device *ndev)
cpsw_intr_disable(cpsw);
cpdma_ctlr_stop(cpsw->dma);
cpsw_ale_stop(cpsw->ale);
+ cpsw_destroy_rx_pools(cpsw);
}
for_each_slave(priv, cpsw_slave_stop, cpsw);

@@ -2011,6 +2354,64 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
}
}

+static int cpsw_xdp_prog_setup(struct cpsw_priv *priv, struct netdev_bpf *bpf)
+{
+ struct bpf_prog *prog = bpf->prog;
+
+ if (!priv->xdpi.prog && !prog)
+ return 0;
+
+ if (!xdp_attachment_flags_ok(&priv->xdpi, bpf))
+ return -EBUSY;
+
+ WRITE_ONCE(priv->xdp_prog, prog);
+
+ xdp_attachment_setup(&priv->xdpi, bpf);
+
+ return 0;
+}
+
+static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
+{
+ struct cpsw_priv *priv = netdev_priv(ndev);
+
+ switch (bpf->command) {
+ case XDP_SETUP_PROG:
+ return cpsw_xdp_prog_setup(priv, bpf);
+
+ case XDP_QUERY_PROG:
+ return xdp_attachment_query(&priv->xdpi, bpf);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
+ struct xdp_frame **frames, u32 flags)
+{
+ struct cpsw_priv *priv = netdev_priv(ndev);
+ struct xdp_frame *xdpf;
+ int i, drops = 0;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ xdpf = frames[i];
+ if (xdpf->len < CPSW_MIN_PACKET_SIZE) {
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ continue;
+ }
+
+ if (cpsw_xdp_tx_frame(priv, xdpf, NULL))
+ drops++;
+ }
+
+ return n - drops;
+}
+
#ifdef CONFIG_NET_POLL_CONTROLLER
static void cpsw_ndo_poll_controller(struct net_device *ndev)
{
@@ -2039,6 +2440,8 @@ static const struct net_device_ops cpsw_netdev_ops = {
.ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid,
.ndo_setup_tc = cpsw_ndo_setup_tc,
+ .ndo_bpf = cpsw_ndo_bpf,
+ .ndo_xdp_xmit = cpsw_ndo_xdp_xmit,
};

static void cpsw_get_drvinfo(struct net_device *ndev,
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index 7c19eebbabcc..d2007f5f988e 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -585,7 +585,7 @@ int cpsw_set_channels_common(struct net_device *ndev,
struct cpsw_priv *priv = netdev_priv(ndev);
struct cpsw_common *cpsw = priv->cpsw;
struct net_device *sl_ndev;
- int i, ret;
+ int i, new_pools, ret;

ret = cpsw_check_ch_settings(cpsw, chs);
if (ret < 0)
@@ -593,6 +593,10 @@ int cpsw_set_channels_common(struct net_device *ndev,

cpsw_suspend_data_pass(ndev);

+ new_pools = (chs->rx_count != cpsw->rx_ch_num) && cpsw->usage_count;
+ if (new_pools)
+ cpsw_destroy_rx_pools(cpsw);
+
ret = cpsw_update_channels_res(priv, chs->rx_count, 1, rx_handler);
if (ret)
goto err;
@@ -623,6 +627,12 @@ int cpsw_set_channels_common(struct net_device *ndev,
if (cpsw->usage_count)
cpsw_split_res(cpsw);

+ if (new_pools) {
+ ret = cpsw_create_rx_pools(cpsw);
+ if (ret)
+ goto err;
+ }
+
ret = cpsw_resume_data_pass(ndev);
if (!ret)
return 0;
@@ -648,8 +658,7 @@ void cpsw_get_ringparam(struct net_device *ndev,
int cpsw_set_ringparam(struct net_device *ndev,
struct ethtool_ringparam *ering)
{
- struct cpsw_priv *priv = netdev_priv(ndev);
- struct cpsw_common *cpsw = priv->cpsw;
+ struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
int ret;

/* ignore ering->tx_pending - only rx_pending adjustment is supported */
@@ -664,15 +673,21 @@ int cpsw_set_ringparam(struct net_device *ndev,

cpsw_suspend_data_pass(ndev);

+ cpsw_destroy_rx_pools(cpsw);
+
cpdma_set_num_rx_descs(cpsw->dma, ering->rx_pending);

if (cpsw->usage_count)
cpdma_chan_split_pool(cpsw->dma);

+ ret = cpsw_create_rx_pools(cpsw);
+ if (ret)
+ goto err;
+
ret = cpsw_resume_data_pass(ndev);
if (!ret)
return 0;
-
+err:
dev_err(cpsw->dev, "cannot set ring params, closing device\n");
dev_close(ndev);
return ret;
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 2ecb3af59fe9..8eeda3456f37 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -346,6 +346,7 @@ struct cpsw_common {
int rx_ch_num, tx_ch_num;
int speed;
int usage_count;
+ struct page_pool *page_pool[CPSW_MAX_QUEUES];
};

struct cpsw_priv {
@@ -360,6 +361,10 @@ struct cpsw_priv {
int shp_cfg_speed;
int tx_ts_enabled;
int rx_ts_enabled;
+ struct bpf_prog *xdp_prog;
+ struct xdp_rxq_info xdp_rxq[CPSW_MAX_QUEUES];
+ struct xdp_attachment_info xdpi;
+
u32 emac_port;
struct cpsw_common *cpsw;
};
@@ -391,6 +396,8 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv);
void cpsw_intr_enable(struct cpsw_common *cpsw);
void cpsw_intr_disable(struct cpsw_common *cpsw);
int cpsw_tx_handler(void *token, int len, int status);
+int cpsw_create_rx_pools(struct cpsw_common *cpsw);
+void cpsw_destroy_rx_pools(struct cpsw_common *cpsw);

/* ethtool */
u32 cpsw_get_msglevel(struct net_device *ndev);
--
2.17.1

2019-06-25 20:46:38

by David Miller

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 0/4] net: ethernet: ti: cpsw: Add XDP support

From: Ivan Khoronzhuk <[email protected]>
Date: Tue, 25 Jun 2019 20:59:44 +0300

> This patchset adds XDP support for TI cpsw driver and base it on
> page_pool allocator. It was verified on af_xdp socket drop,
> af_xdp l2f, ebpf XDP_DROP, XDP_REDIRECT, XDP_PASS, XDP_TX.

So happy to watch the progress of this patch set.

Jesper and others, please review.

Thank you.

2019-06-26 01:37:41

by Willem de Bruijn

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Tue, Jun 25, 2019 at 2:00 PM Ivan Khoronzhuk
<[email protected]> wrote:
>
> Add user counter allowing to delete pool only when no users.
> It doesn't prevent pool from flush, only prevents freeing the
> pool instance. Helps when no need to delete the pool and now
> it's user responsibility to free it by calling page_pool_free()
> while destroying procedure. It also makes to use page_pool_free()
> explicitly, not fully hidden in xdp unreg, which looks more
> correct after page pool "create" routine.
>
> Signed-off-by: Ivan Khoronzhuk <[email protected]>
> ---

> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index f07c518ef8a5..1ec838e9927e 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -101,6 +101,7 @@ struct page_pool {
> struct ptr_ring ring;
>
> atomic_t pages_state_release_cnt;
> + atomic_t user_cnt;

refcount_t?

> };
>
> struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
> @@ -183,6 +184,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
> return page->dma_addr;
> }
>
> +/* used to prevent pool from deallocation */
> +static inline void page_pool_get(struct page_pool *pool)
> +{
> + atomic_inc(&pool->user_cnt);
> +}
> +
> static inline bool is_page_pool_compiled_in(void)
> {
> #ifdef CONFIG_PAGE_POOL
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index b366f59885c1..169b0e3c870e 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -48,6 +48,7 @@ static int page_pool_init(struct page_pool *pool,
> return -ENOMEM;
>
> atomic_set(&pool->pages_state_release_cnt, 0);
> + atomic_set(&pool->user_cnt, 0);
>
> if (pool->p.flags & PP_FLAG_DMA_MAP)
> get_device(pool->p.dev);
> @@ -70,6 +71,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
> kfree(pool);
> return ERR_PTR(err);
> }
> +
> + page_pool_get(pool);
> return pool;
> }
> EXPORT_SYMBOL(page_pool_create);
> @@ -356,6 +359,10 @@ static void __warn_in_flight(struct page_pool *pool)
>
> void __page_pool_free(struct page_pool *pool)
> {
> + /* free only if no users */
> + if (!atomic_dec_and_test(&pool->user_cnt))
> + return;
> +
> WARN(pool->alloc.count, "API usage violation");
> WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 829377cc83db..04bdcd784d2e 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -372,6 +372,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
>
> mutex_unlock(&mem_id_lock);
>
> + if (type == MEM_TYPE_PAGE_POOL)
> + page_pool_get(xdp_alloc->page_pool);
> +

need an analogous page_pool_put in xdp_rxq_info_unreg_mem_model? mlx5
does not use that inverse function, but intel drivers do.

> trace_mem_connect(xdp_alloc, xdp_rxq);
> return 0;
> err:
> --
> 2.17.1
>

2019-06-26 02:18:11

by Willem de Bruijn

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 3/4] net: ethernet: ti: davinci_cpdma: return handler status

On Tue, Jun 25, 2019 at 2:00 PM Ivan Khoronzhuk
<[email protected]> wrote:
>
> This change is needed to return flush status of rx handler for
> flushing redirected xdp frames after processing channel packets.
> Do it as separate patch for simplicity.
>
> Signed-off-by: Ivan Khoronzhuk <[email protected]>

> @@ -602,7 +605,8 @@ static int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget)
> else
> cur_budget = txv->budget;
>
> - num_tx += cpdma_chan_process(txv->ch, cur_budget);
> + cpdma_chan_process(txv->ch, &cur_budget);
> + num_tx += cur_budget;

Less code change to add a new argument int *flush to communicate the
new state and leave the existing argument and return values as is?

2019-06-26 10:42:50

by Jesper Dangaard Brouer

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Tue, 25 Jun 2019 20:59:45 +0300
Ivan Khoronzhuk <[email protected]> wrote:

> Add user counter allowing to delete pool only when no users.
> It doesn't prevent pool from flush, only prevents freeing the
> pool instance. Helps when no need to delete the pool and now
> it's user responsibility to free it by calling page_pool_free()
> while destroying procedure. It also makes to use page_pool_free()
> explicitly, not fully hidden in xdp unreg, which looks more
> correct after page pool "create" routine.

No, this is wrong.

> Signed-off-by: Ivan Khoronzhuk <[email protected]>
> ---
> drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 +++++---
> include/net/page_pool.h | 7 +++++++
> net/core/page_pool.c | 7 +++++++
> net/core/xdp.c | 3 +++
> 4 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 5e40db8f92e6..cb028de64a1d 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -545,10 +545,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
> }
> err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
> MEM_TYPE_PAGE_POOL, rq->page_pool);
> - if (err) {
> - page_pool_free(rq->page_pool);
> + if (err)
> goto err_free;
> - }
>
> for (i = 0; i < wq_sz; i++) {
> if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
> @@ -613,6 +611,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
> if (rq->xdp_prog)
> bpf_prog_put(rq->xdp_prog);
> xdp_rxq_info_unreg(&rq->xdp_rxq);
> + if (rq->page_pool)
> + page_pool_free(rq->page_pool);
> mlx5_wq_destroy(&rq->wq_ctrl);
>
> return err;
> @@ -643,6 +643,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
> }
>
> xdp_rxq_info_unreg(&rq->xdp_rxq);
> + if (rq->page_pool)
> + page_pool_free(rq->page_pool);

No, this is wrong. The hole point with the merged page_pool fixes
patchset was that page_pool_free() needs to be delayed until no-more
in-flight packets exist.


> mlx5_wq_destroy(&rq->wq_ctrl);
> }
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index f07c518ef8a5..1ec838e9927e 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -101,6 +101,7 @@ struct page_pool {
> struct ptr_ring ring;
>
> atomic_t pages_state_release_cnt;
> + atomic_t user_cnt;
> };
>
> struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
> @@ -183,6 +184,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
> return page->dma_addr;
> }
>
> +/* used to prevent pool from deallocation */
> +static inline void page_pool_get(struct page_pool *pool)
> +{
> + atomic_inc(&pool->user_cnt);
> +}
> +
> static inline bool is_page_pool_compiled_in(void)
> {
> #ifdef CONFIG_PAGE_POOL
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index b366f59885c1..169b0e3c870e 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -48,6 +48,7 @@ static int page_pool_init(struct page_pool *pool,
> return -ENOMEM;
>
> atomic_set(&pool->pages_state_release_cnt, 0);
> + atomic_set(&pool->user_cnt, 0);
>
> if (pool->p.flags & PP_FLAG_DMA_MAP)
> get_device(pool->p.dev);
> @@ -70,6 +71,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
> kfree(pool);
> return ERR_PTR(err);
> }
> +
> + page_pool_get(pool);
> return pool;
> }
> EXPORT_SYMBOL(page_pool_create);
> @@ -356,6 +359,10 @@ static void __warn_in_flight(struct page_pool *pool)
>
> void __page_pool_free(struct page_pool *pool)
> {
> + /* free only if no users */
> + if (!atomic_dec_and_test(&pool->user_cnt))
> + return;
> +
> WARN(pool->alloc.count, "API usage violation");
> WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 829377cc83db..04bdcd784d2e 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -372,6 +372,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
>
> mutex_unlock(&mem_id_lock);
>
> + if (type == MEM_TYPE_PAGE_POOL)
> + page_pool_get(xdp_alloc->page_pool);
> +
> trace_mem_connect(xdp_alloc, xdp_rxq);
> return 0;
> err:



--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer

2019-06-26 10:50:17

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Wed, Jun 26, 2019 at 12:42:16PM +0200, Jesper Dangaard Brouer wrote:
>On Tue, 25 Jun 2019 20:59:45 +0300
>Ivan Khoronzhuk <[email protected]> wrote:
>
>> Add user counter allowing to delete pool only when no users.
>> It doesn't prevent pool from flush, only prevents freeing the
>> pool instance. Helps when no need to delete the pool and now
>> it's user responsibility to free it by calling page_pool_free()
>> while destroying procedure. It also makes to use page_pool_free()
>> explicitly, not fully hidden in xdp unreg, which looks more
>> correct after page pool "create" routine.
>
>No, this is wrong.
below.

>
>> Signed-off-by: Ivan Khoronzhuk <[email protected]>
>> ---
>> drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 +++++---
>> include/net/page_pool.h | 7 +++++++
>> net/core/page_pool.c | 7 +++++++
>> net/core/xdp.c | 3 +++
>> 4 files changed, 22 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> index 5e40db8f92e6..cb028de64a1d 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> @@ -545,10 +545,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
>> }
>> err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
>> MEM_TYPE_PAGE_POOL, rq->page_pool);
>> - if (err) {
>> - page_pool_free(rq->page_pool);
>> + if (err)
>> goto err_free;
>> - }
>>
>> for (i = 0; i < wq_sz; i++) {
>> if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
>> @@ -613,6 +611,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
>> if (rq->xdp_prog)
>> bpf_prog_put(rq->xdp_prog);
>> xdp_rxq_info_unreg(&rq->xdp_rxq);
>> + if (rq->page_pool)
>> + page_pool_free(rq->page_pool);
>> mlx5_wq_destroy(&rq->wq_ctrl);
>>
>> return err;
>> @@ -643,6 +643,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
>> }
>>
>> xdp_rxq_info_unreg(&rq->xdp_rxq);
>> + if (rq->page_pool)
>> + page_pool_free(rq->page_pool);
>
>No, this is wrong. The hole point with the merged page_pool fixes
>patchset was that page_pool_free() needs to be delayed until no-more
>in-flight packets exist.

Probably it's not so obvious, but it's still delayed and deleted only
after no-more in-flight packets exist. Here question is only who is able
to do this first based on refcnt.

>
>
>> mlx5_wq_destroy(&rq->wq_ctrl);
>> }
>>
>> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
>> index f07c518ef8a5..1ec838e9927e 100644
>> --- a/include/net/page_pool.h
>> +++ b/include/net/page_pool.h
>> @@ -101,6 +101,7 @@ struct page_pool {
>> struct ptr_ring ring;
>>
>> atomic_t pages_state_release_cnt;
>> + atomic_t user_cnt;
>> };
>>
>> struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
>> @@ -183,6 +184,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
>> return page->dma_addr;
>> }
>>
>> +/* used to prevent pool from deallocation */
>> +static inline void page_pool_get(struct page_pool *pool)
>> +{
>> + atomic_inc(&pool->user_cnt);
>> +}
>> +
>> static inline bool is_page_pool_compiled_in(void)
>> {
>> #ifdef CONFIG_PAGE_POOL
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index b366f59885c1..169b0e3c870e 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
>> @@ -48,6 +48,7 @@ static int page_pool_init(struct page_pool *pool,
>> return -ENOMEM;
>>
>> atomic_set(&pool->pages_state_release_cnt, 0);
>> + atomic_set(&pool->user_cnt, 0);
>>
>> if (pool->p.flags & PP_FLAG_DMA_MAP)
>> get_device(pool->p.dev);
>> @@ -70,6 +71,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
>> kfree(pool);
>> return ERR_PTR(err);
>> }
>> +
>> + page_pool_get(pool);
>> return pool;
>> }
>> EXPORT_SYMBOL(page_pool_create);
>> @@ -356,6 +359,10 @@ static void __warn_in_flight(struct page_pool *pool)
>>
>> void __page_pool_free(struct page_pool *pool)
>> {
>> + /* free only if no users */
>> + if (!atomic_dec_and_test(&pool->user_cnt))
>> + return;
>> +
>> WARN(pool->alloc.count, "API usage violation");
>> WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>>
>> diff --git a/net/core/xdp.c b/net/core/xdp.c
>> index 829377cc83db..04bdcd784d2e 100644
>> --- a/net/core/xdp.c
>> +++ b/net/core/xdp.c
>> @@ -372,6 +372,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
>>
>> mutex_unlock(&mem_id_lock);
>>
>> + if (type == MEM_TYPE_PAGE_POOL)
>> + page_pool_get(xdp_alloc->page_pool);
>> +
>> trace_mem_connect(xdp_alloc, xdp_rxq);
>> return 0;
>> err:
>
>
>
>--
>Best regards,
> Jesper Dangaard Brouer
> MSc.CS, Principal Kernel Engineer at Red Hat
> LinkedIn: http://www.linkedin.com/in/brouer

--
Regards,
Ivan Khoronzhuk

2019-06-26 11:52:06

by Jesper Dangaard Brouer

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Wed, 26 Jun 2019 13:49:49 +0300
Ivan Khoronzhuk <[email protected]> wrote:

> On Wed, Jun 26, 2019 at 12:42:16PM +0200, Jesper Dangaard Brouer wrote:
> >On Tue, 25 Jun 2019 20:59:45 +0300
> >Ivan Khoronzhuk <[email protected]> wrote:
> >
> >> Add user counter allowing to delete pool only when no users.
> >> It doesn't prevent pool from flush, only prevents freeing the
> >> pool instance. Helps when no need to delete the pool and now
> >> it's user responsibility to free it by calling page_pool_free()
> >> while destroying procedure. It also makes to use page_pool_free()
> >> explicitly, not fully hidden in xdp unreg, which looks more
> >> correct after page pool "create" routine.
> >
> >No, this is wrong.
> below.
>
> >
> >> Signed-off-by: Ivan Khoronzhuk <[email protected]>
> >> ---
> >> drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 +++++---
> >> include/net/page_pool.h | 7 +++++++
> >> net/core/page_pool.c | 7 +++++++
> >> net/core/xdp.c | 3 +++
> >> 4 files changed, 22 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> >> index 5e40db8f92e6..cb028de64a1d 100644
> >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> >> @@ -545,10 +545,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
> >> }
> >> err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
> >> MEM_TYPE_PAGE_POOL, rq->page_pool);
> >> - if (err) {
> >> - page_pool_free(rq->page_pool);
> >> + if (err)
> >> goto err_free;
> >> - }
> >>
> >> for (i = 0; i < wq_sz; i++) {
> >> if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
> >> @@ -613,6 +611,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
> >> if (rq->xdp_prog)
> >> bpf_prog_put(rq->xdp_prog);
> >> xdp_rxq_info_unreg(&rq->xdp_rxq);
> >> + if (rq->page_pool)
> >> + page_pool_free(rq->page_pool);
> >> mlx5_wq_destroy(&rq->wq_ctrl);
> >>
> >> return err;
> >> @@ -643,6 +643,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
> >> }
> >>
> >> xdp_rxq_info_unreg(&rq->xdp_rxq);
> >> + if (rq->page_pool)
> >> + page_pool_free(rq->page_pool);
> >
> >No, this is wrong. The hole point with the merged page_pool fixes
> >patchset was that page_pool_free() needs to be delayed until no-more
> >in-flight packets exist.
>
> Probably it's not so obvious, but it's still delayed and deleted only
> after no-more in-flight packets exist. Here question is only who is able
> to do this first based on refcnt.

Hmm... then I find this API is rather misleading, even the function
name page_pool_free is misleading ("free"). (Now, I do see, below, that
page_pool_create() take an extra reference).

But it is still wrong / problematic. As you allow
__page_pool_request_shutdown() to be called with elevated refcnt. Your
use-case is to have more than 1 xdp_rxq_info struct using the same
page_pool. Then you have to call xdp_rxq_info_unreg_mem_model() for
each, which will call __page_pool_request_shutdown().

For this to be safe, your driver have to stop RX for all the
xdp_rxq_info structs that share the page_pool. The page_pool already
have this requirement, but it comes as natural step when shutting down
an RXQ. With your change, you have to take care of stopping the RXQs
first, and then call xdp_rxq_info_unreg_mem_model() for each
xdp_rxq_info afterwards. I assume you do this, but it is just a driver
bug waiting to happen.


> >> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> >> index b366f59885c1..169b0e3c870e 100644
> >> --- a/net/core/page_pool.c
> >> +++ b/net/core/page_pool.c
[...]
> >> @@ -70,6 +71,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
> >> kfree(pool);
> >> return ERR_PTR(err);
> >> }
> >> +
> >> + page_pool_get(pool);
> >> return pool;
> >> }
> >> EXPORT_SYMBOL(page_pool_create);

The thing (perhaps) like about your API change, is that you also allow
the driver to explicitly keep the page_pool object across/after a
xdp_rxq_info_unreg_mem_model(). And this way possibly reuse it for
another RXQ. The problem is of-cause that on driver shutdown, this
will force drivers to implement the same shutdown logic with
schedule_delayed_work as the core xdp.c code already does.

--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer

2019-06-26 12:41:00

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Wed, Jun 26, 2019 at 01:51:28PM +0200, Jesper Dangaard Brouer wrote:
>On Wed, 26 Jun 2019 13:49:49 +0300
>Ivan Khoronzhuk <[email protected]> wrote:
>
>> On Wed, Jun 26, 2019 at 12:42:16PM +0200, Jesper Dangaard Brouer wrote:
>> >On Tue, 25 Jun 2019 20:59:45 +0300
>> >Ivan Khoronzhuk <[email protected]> wrote:
>> >
>> >> Add user counter allowing to delete pool only when no users.
>> >> It doesn't prevent pool from flush, only prevents freeing the
>> >> pool instance. Helps when no need to delete the pool and now
>> >> it's user responsibility to free it by calling page_pool_free()
>> >> while destroying procedure. It also makes to use page_pool_free()
>> >> explicitly, not fully hidden in xdp unreg, which looks more
>> >> correct after page pool "create" routine.
>> >
>> >No, this is wrong.
>> below.
>>
>> >
>> >> Signed-off-by: Ivan Khoronzhuk <[email protected]>
>> >> ---
>> >> drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 +++++---
>> >> include/net/page_pool.h | 7 +++++++
>> >> net/core/page_pool.c | 7 +++++++
>> >> net/core/xdp.c | 3 +++
>> >> 4 files changed, 22 insertions(+), 3 deletions(-)
>> >>
>> >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> >> index 5e40db8f92e6..cb028de64a1d 100644
>> >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> >> @@ -545,10 +545,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
>> >> }
>> >> err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
>> >> MEM_TYPE_PAGE_POOL, rq->page_pool);
>> >> - if (err) {
>> >> - page_pool_free(rq->page_pool);
>> >> + if (err)
>> >> goto err_free;
>> >> - }
>> >>
>> >> for (i = 0; i < wq_sz; i++) {
>> >> if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
>> >> @@ -613,6 +611,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
>> >> if (rq->xdp_prog)
>> >> bpf_prog_put(rq->xdp_prog);
>> >> xdp_rxq_info_unreg(&rq->xdp_rxq);
>> >> + if (rq->page_pool)
>> >> + page_pool_free(rq->page_pool);
>> >> mlx5_wq_destroy(&rq->wq_ctrl);
>> >>
>> >> return err;
>> >> @@ -643,6 +643,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
>> >> }
>> >>
>> >> xdp_rxq_info_unreg(&rq->xdp_rxq);
>> >> + if (rq->page_pool)
>> >> + page_pool_free(rq->page_pool);
>> >
>> >No, this is wrong. The hole point with the merged page_pool fixes
>> >patchset was that page_pool_free() needs to be delayed until no-more
>> >in-flight packets exist.
>>
>> Probably it's not so obvious, but it's still delayed and deleted only
>> after no-more in-flight packets exist. Here question is only who is able
>> to do this first based on refcnt.
>
>Hmm... then I find this API is rather misleading, even the function
>name page_pool_free is misleading ("free"). (Now, I do see, below, that
>page_pool_create() take an extra reference).
In feneral "free" looks not bad after "create".
It's called after "create" if some error with registering it rxq.
and it looks logical, if it's called after no need in pool.

obj = create()
/* a lot of different stuff */
free(obj);


>
>But it is still wrong / problematic. As you allow
>__page_pool_request_shutdown() to be called with elevated refcnt. Your
>use-case is to have more than 1 xdp_rxq_info struct using the same
>page_pool. Then you have to call xdp_rxq_info_unreg_mem_model() for
>each, which will call __page_pool_request_shutdown().
>
>For this to be safe, your driver have to stop RX for all the
>xdp_rxq_info structs that share the page_pool. The page_pool already
>have this requirement, but it comes as natural step when shutting down
>an RXQ. With your change, you have to take care of stopping the RXQs
>first, and then call xdp_rxq_info_unreg_mem_model() for each
>xdp_rxq_info afterwards. I assume you do this, but it is just a driver
>bug waiting to happen.
All rxq queues are stopped before this, and only after this the pools are freed,
exactly as it required for one xdp_rxq_info_unreg_mem_model(), w/o exclusions,
as it requires the API.

>
>> >> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> >> index b366f59885c1..169b0e3c870e 100644
>> >> --- a/net/core/page_pool.c
>> >> +++ b/net/core/page_pool.c
>[...]
>> >> @@ -70,6 +71,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
>> >> kfree(pool);
>> >> return ERR_PTR(err);
>> >> }
>> >> +
>> >> + page_pool_get(pool);
>> >> return pool;
>> >> }
>> >> EXPORT_SYMBOL(page_pool_create);
>
>The thing (perhaps) like about your API change, is that you also allow
>the driver to explicitly keep the page_pool object across/after a
>xdp_rxq_info_unreg_mem_model(). And this way possibly reuse it for
>another RXQ.
>The problem is of-cause that on driver shutdown, this
>will force drivers to implement the same shutdown logic with
>schedule_delayed_work as the core xdp.c code already does.
I see.

The cpsw dosn't re-use it, so here all is fine, but if a driver needs
to re-use it again, lets suppose, as it can happen, the pool needs to
be registered with xdp_rxq_info_reg_mem_model() again, and for that
potentially can be added verification on in-flight packets
or some register state...but better mention in some place
to not do this, frankly, I don't know where it should be at this moment.

--
Regards,
Ivan Khoronzhuk

2019-06-26 14:01:57

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Tue, Jun 25, 2019 at 09:36:15PM -0400, Willem de Bruijn wrote:
>On Tue, Jun 25, 2019 at 2:00 PM Ivan Khoronzhuk
><[email protected]> wrote:
>>
>> Add user counter allowing to delete pool only when no users.
>> It doesn't prevent pool from flush, only prevents freeing the
>> pool instance. Helps when no need to delete the pool and now
>> it's user responsibility to free it by calling page_pool_free()
>> while destroying procedure. It also makes to use page_pool_free()
>> explicitly, not fully hidden in xdp unreg, which looks more
>> correct after page pool "create" routine.
>>
>> Signed-off-by: Ivan Khoronzhuk <[email protected]>
>> ---
>
>> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
>> index f07c518ef8a5..1ec838e9927e 100644
>> --- a/include/net/page_pool.h
>> +++ b/include/net/page_pool.h
>> @@ -101,6 +101,7 @@ struct page_pool {
>> struct ptr_ring ring;
>>
>> atomic_t pages_state_release_cnt;
>> + atomic_t user_cnt;
>
>refcount_t?
yes, thanks.

>
>> };
>>
>> struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
>> @@ -183,6 +184,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
>> return page->dma_addr;
>> }
>>
>> +/* used to prevent pool from deallocation */
>> +static inline void page_pool_get(struct page_pool *pool)
>> +{
>> + atomic_inc(&pool->user_cnt);
>> +}
>> +
>> static inline bool is_page_pool_compiled_in(void)
>> {
>> #ifdef CONFIG_PAGE_POOL
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index b366f59885c1..169b0e3c870e 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
>> @@ -48,6 +48,7 @@ static int page_pool_init(struct page_pool *pool,
>> return -ENOMEM;
>>
>> atomic_set(&pool->pages_state_release_cnt, 0);
>> + atomic_set(&pool->user_cnt, 0);
>>
>> if (pool->p.flags & PP_FLAG_DMA_MAP)
>> get_device(pool->p.dev);
>> @@ -70,6 +71,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
>> kfree(pool);
>> return ERR_PTR(err);
>> }
>> +
>> + page_pool_get(pool);
>> return pool;
>> }
>> EXPORT_SYMBOL(page_pool_create);
>> @@ -356,6 +359,10 @@ static void __warn_in_flight(struct page_pool *pool)
>>
>> void __page_pool_free(struct page_pool *pool)
>> {
>> + /* free only if no users */
>> + if (!atomic_dec_and_test(&pool->user_cnt))
>> + return;
>> +
>> WARN(pool->alloc.count, "API usage violation");
>> WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>>
>> diff --git a/net/core/xdp.c b/net/core/xdp.c
>> index 829377cc83db..04bdcd784d2e 100644
>> --- a/net/core/xdp.c
>> +++ b/net/core/xdp.c
>> @@ -372,6 +372,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
>>
>> mutex_unlock(&mem_id_lock);
>>
>> + if (type == MEM_TYPE_PAGE_POOL)
>> + page_pool_get(xdp_alloc->page_pool);
>> +
>
>need an analogous page_pool_put in xdp_rxq_info_unreg_mem_model? mlx5
>does not use that inverse function, but intel drivers do.
no need, it's put after call to page_pool_free() in unreg workqueue.

>
>> trace_mem_connect(xdp_alloc, xdp_rxq);
>> return 0;
>> err:
>> --
>> 2.17.1
>>

--
Regards,
Ivan Khoronzhuk

2019-06-27 19:46:43

by Jesper Dangaard Brouer

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Tue, 25 Jun 2019 20:59:45 +0300
Ivan Khoronzhuk <[email protected]> wrote:

> Add user counter allowing to delete pool only when no users.
> It doesn't prevent pool from flush, only prevents freeing the
> pool instance. Helps when no need to delete the pool and now
> it's user responsibility to free it by calling page_pool_free()
> while destroying procedure. It also makes to use page_pool_free()
> explicitly, not fully hidden in xdp unreg, which looks more
> correct after page pool "create" routine.

I don't think that "create" and "free" routines paring looks "more
correct" together.

Maybe we can scale back your solution(?), via creating a page_pool_get()
and page_pool_put() API that can be used by your driver, to keep the
page_pool object after a xdp_rxq_info_unreg() call. Then you can use
it for two xdp_rxq_info structs, and call page_pool_put() after you
have unregistered both.

The API would basically be:

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index b366f59885c1..691ddacfb5a6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -357,6 +357,10 @@ static void __warn_in_flight(struct page_pool *pool)
void __page_pool_free(struct page_pool *pool)
{
WARN(pool->alloc.count, "API usage violation");
+
+ if (atomic_read(&pool->user_cnt) != 0)
+ return;
+
WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");

/* Can happen due to forced shutdown */
@@ -372,6 +376,19 @@ void __page_pool_free(struct page_pool *pool)
}
EXPORT_SYMBOL(__page_pool_free);

+void page_pool_put(struct page_pool *pool)
+{
+ if (!atomic_dec_and_test(&pool->user_cnt))
+ __page_pool_free(pool);
+}
+EXPORT_SYMBOL(page_pool_put);
+
+void page_pool_get(struct page_pool *pool)
+{
+ atomic_inc(&pool->user_cnt);
+}
+EXPORT_SYMBOL(page_pool_get);
+


--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer

2019-06-27 22:03:26

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

Hi Jesper, thanks you remember about it.

>
>I don't think that "create" and "free" routines paring looks "more
>correct" together.
>
>Maybe we can scale back your solution(?), via creating a page_pool_get()
>and page_pool_put() API that can be used by your driver, to keep the
>page_pool object after a xdp_rxq_info_unreg() call. Then you can use
>it for two xdp_rxq_info structs, and call page_pool_put() after you
>have unregistered both.
>
>The API would basically be:
>
>diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>index b366f59885c1..691ddacfb5a6 100644
>--- a/net/core/page_pool.c
>+++ b/net/core/page_pool.c
>@@ -357,6 +357,10 @@ static void __warn_in_flight(struct page_pool *pool)
> void __page_pool_free(struct page_pool *pool)
> {
> WARN(pool->alloc.count, "API usage violation");
>+
>+ if (atomic_read(&pool->user_cnt) != 0)
>+ return;
>+
> WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>
> /* Can happen due to forced shutdown */
>@@ -372,6 +376,19 @@ void __page_pool_free(struct page_pool *pool)
> }
> EXPORT_SYMBOL(__page_pool_free);
>
>+void page_pool_put(struct page_pool *pool)
>+{
>+ if (!atomic_dec_and_test(&pool->user_cnt))
>+ __page_pool_free(pool);
>+}
>+EXPORT_SYMBOL(page_pool_put);
>+
>+void page_pool_get(struct page_pool *pool)
>+{
>+ atomic_inc(&pool->user_cnt);
>+}
>+EXPORT_SYMBOL(page_pool_get);
>+

I have another solution that doesn't touch page pool and adds modifications
to xdp allocator. As for me it looks better and work wider, I don't need to
think about this in the driver also.

It's supposed allocator works as before, no any changes to mlx5 and
page_pool API and its usage and seems like fits your requirements.
It still supposes that allocator runs under same napi softirq but allows
to reuse allocator.

I have not verified yet, but looks like:

diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
index 6a8cba6ea79a..995b21da2f27 100644
--- a/include/net/xdp_priv.h
+++ b/include/net/xdp_priv.h
@@ -18,6 +18,7 @@ struct xdp_mem_allocator {
struct rcu_head rcu;
struct delayed_work defer_wq;
unsigned long defer_warn;
+ unsigned long refcnt;
};

#endif /* __LINUX_NET_XDP_PRIV_H__ */
diff --git a/net/core/xdp.c b/net/core/xdp.c
index f98ab6b98674..6239483e3793 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -98,6 +98,12 @@ static bool __mem_id_disconnect(int id, bool force)
WARN(1, "Request remove non-existing id(%d), driver bug?", id);
return true;
}
+
+ if (--xa->refcnt) {
+ mutex_unlock(&mem_id_lock);
+ return true;
+ }
+
xa->disconnect_cnt++;

/* Detects in-flight packet-pages for page_pool */
@@ -312,6 +318,33 @@ static bool __is_supported_mem_type(enum xdp_mem_type type)
return true;
}

+static struct xdp_mem_allocator *xdp_allocator_get(void *allocator)
+{
+ struct xdp_mem_allocator *xae, *xa == NULL;
+ struct rhashtable_iter iter;
+
+ mutex_lock(&mem_id_lock);
+ rhashtable_walk_enter(mem_id_ht, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ while ((xae = rhashtable_walk_next(&iter)) && !IS_ERR(xae)) {
+ if (xae->allocator == allocator) {
+ xae->refcnt++;
+ xa = xae;
+ break;
+ }
+ }
+
+ rhashtable_walk_stop(&iter);
+
+ } while (xae == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+ mutex_unlock(&mem_id_lock);
+
+ return xa;
+}
+
int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
enum xdp_mem_type type, void *allocator)
{
@@ -347,6 +380,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
}
}

+ if (xdp_allocator_get(allocator))
+ return 0;
+
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
if (!xdp_alloc)
return -ENOMEM;
@@ -360,6 +396,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
xdp_rxq->mem.id = id;
xdp_alloc->mem = xdp_rxq->mem;
xdp_alloc->allocator = allocator;
+ xdp_alloc->refcnt = 1;

/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);


--
Regards,
Ivan Khoronzhuk

2019-06-28 06:36:01

by Jesper Dangaard Brouer

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Fri, 28 Jun 2019 01:02:47 +0300
Ivan Khoronzhuk <[email protected]> wrote:

> Hi Jesper, thanks you remember about it.
>
> >
> >I don't think that "create" and "free" routines paring looks "more
> >correct" together.
> >
> >Maybe we can scale back your solution(?), via creating a page_pool_get()
> >and page_pool_put() API that can be used by your driver, to keep the
> >page_pool object after a xdp_rxq_info_unreg() call. Then you can use
> >it for two xdp_rxq_info structs, and call page_pool_put() after you
> >have unregistered both.
> >
> >The API would basically be:
> >
> >diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> >index b366f59885c1..691ddacfb5a6 100644
> >--- a/net/core/page_pool.c
> >+++ b/net/core/page_pool.c
> >@@ -357,6 +357,10 @@ static void __warn_in_flight(struct page_pool *pool)
> > void __page_pool_free(struct page_pool *pool)
> > {
> > WARN(pool->alloc.count, "API usage violation");
> >+
> >+ if (atomic_read(&pool->user_cnt) != 0)
> >+ return;
> >+
> > WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
> >
> > /* Can happen due to forced shutdown */
> >@@ -372,6 +376,19 @@ void __page_pool_free(struct page_pool *pool)
> > }
> > EXPORT_SYMBOL(__page_pool_free);
> >
> >+void page_pool_put(struct page_pool *pool)
> >+{
> >+ if (!atomic_dec_and_test(&pool->user_cnt))
> >+ __page_pool_free(pool);
> >+}
> >+EXPORT_SYMBOL(page_pool_put);
> >+
> >+void page_pool_get(struct page_pool *pool)
> >+{
> >+ atomic_inc(&pool->user_cnt);
> >+}
> >+EXPORT_SYMBOL(page_pool_get);
> >+
>
> I have another solution that doesn't touch page pool and adds modifications
> to xdp allocator. As for me it looks better and work wider, I don't need to
> think about this in the driver also.
>
> It's supposed allocator works as before, no any changes to mlx5 and
> page_pool API and its usage and seems like fits your requirements.
> It still supposes that allocator runs under same napi softirq but allows
> to reuse allocator.
>
> I have not verified yet, but looks like:
>
> diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
> index 6a8cba6ea79a..995b21da2f27 100644
> --- a/include/net/xdp_priv.h
> +++ b/include/net/xdp_priv.h
> @@ -18,6 +18,7 @@ struct xdp_mem_allocator {
> struct rcu_head rcu;
> struct delayed_work defer_wq;
> unsigned long defer_warn;
> + unsigned long refcnt;
> };
>
> #endif /* __LINUX_NET_XDP_PRIV_H__ */
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index f98ab6b98674..6239483e3793 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -98,6 +98,12 @@ static bool __mem_id_disconnect(int id, bool force)
> WARN(1, "Request remove non-existing id(%d), driver bug?", id);
> return true;
> }
> +
> + if (--xa->refcnt) {
> + mutex_unlock(&mem_id_lock);
> + return true;

This doesn't work. This function __mem_id_disconnect() can be called
multiple times. E.g. if there are in-flight packets.


> + }
> +
> xa->disconnect_cnt++;
>
> /* Detects in-flight packet-pages for page_pool */
> @@ -312,6 +318,33 @@ static bool __is_supported_mem_type(enum xdp_mem_type type)
> return true;
> }
>
> +static struct xdp_mem_allocator *xdp_allocator_get(void *allocator)
> +{
> + struct xdp_mem_allocator *xae, *xa == NULL;
> + struct rhashtable_iter iter;
> +
> + mutex_lock(&mem_id_lock);
> + rhashtable_walk_enter(mem_id_ht, &iter);
> + do {
> + rhashtable_walk_start(&iter);
> +
> + while ((xae = rhashtable_walk_next(&iter)) && !IS_ERR(xae)) {
> + if (xae->allocator == allocator) {
> + xae->refcnt++;
> + xa = xae;
> + break;
> + }
> + }
> +
> + rhashtable_walk_stop(&iter);
> +
> + } while (xae == ERR_PTR(-EAGAIN));
> + rhashtable_walk_exit(&iter);
> + mutex_unlock(&mem_id_lock);
> +
> + return xa;
> +}
> +
> int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
> enum xdp_mem_type type, void *allocator)
> {
> @@ -347,6 +380,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
> }
> }
>
> + if (xdp_allocator_get(allocator))
> + return 0;
> +
> xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
> if (!xdp_alloc)
> return -ENOMEM;
> @@ -360,6 +396,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
> xdp_rxq->mem.id = id;
> xdp_alloc->mem = xdp_rxq->mem;
> xdp_alloc->allocator = allocator;
> + xdp_alloc->refcnt = 1;
>
> /* Insert allocator into ID lookup table */
> ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
>
>



--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer

2019-06-28 08:54:50

by Ivan Khoronzhuk

[permalink] [raw]
Subject: Re: [PATCH v4 net-next 1/4] net: core: page_pool: add user cnt preventing pool deletion

On Fri, Jun 28, 2019 at 08:35:20AM +0200, Jesper Dangaard Brouer wrote:
>On Fri, 28 Jun 2019 01:02:47 +0300
>Ivan Khoronzhuk <[email protected]> wrote:
>
>> Hi Jesper, thanks you remember about it.
>>
>> >
>> >I don't think that "create" and "free" routines paring looks "more
>> >correct" together.
>> >
>> >Maybe we can scale back your solution(?), via creating a page_pool_get()
>> >and page_pool_put() API that can be used by your driver, to keep the
>> >page_pool object after a xdp_rxq_info_unreg() call. Then you can use
>> >it for two xdp_rxq_info structs, and call page_pool_put() after you
>> >have unregistered both.
>> >
>> >The API would basically be:
>> >
>> >diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> >index b366f59885c1..691ddacfb5a6 100644
>> >--- a/net/core/page_pool.c
>> >+++ b/net/core/page_pool.c
>> >@@ -357,6 +357,10 @@ static void __warn_in_flight(struct page_pool *pool)
>> > void __page_pool_free(struct page_pool *pool)
>> > {
>> > WARN(pool->alloc.count, "API usage violation");
>> >+
>> >+ if (atomic_read(&pool->user_cnt) != 0)
>> >+ return;
>> >+
>> > WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>> >
>> > /* Can happen due to forced shutdown */
>> >@@ -372,6 +376,19 @@ void __page_pool_free(struct page_pool *pool)
>> > }
>> > EXPORT_SYMBOL(__page_pool_free);
>> >
>> >+void page_pool_put(struct page_pool *pool)
>> >+{
>> >+ if (!atomic_dec_and_test(&pool->user_cnt))
>> >+ __page_pool_free(pool);
>> >+}
>> >+EXPORT_SYMBOL(page_pool_put);
>> >+
>> >+void page_pool_get(struct page_pool *pool)
>> >+{
>> >+ atomic_inc(&pool->user_cnt);
>> >+}
>> >+EXPORT_SYMBOL(page_pool_get);
>> >+
>>
>> I have another solution that doesn't touch page pool and adds modifications
>> to xdp allocator. As for me it looks better and work wider, I don't need to
>> think about this in the driver also.
>>
>> It's supposed allocator works as before, no any changes to mlx5 and
>> page_pool API and its usage and seems like fits your requirements.
>> It still supposes that allocator runs under same napi softirq but allows
>> to reuse allocator.
>>
>> I have not verified yet, but looks like:
>>
>> diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
>> index 6a8cba6ea79a..995b21da2f27 100644
>> --- a/include/net/xdp_priv.h
>> +++ b/include/net/xdp_priv.h
>> @@ -18,6 +18,7 @@ struct xdp_mem_allocator {
>> struct rcu_head rcu;
>> struct delayed_work defer_wq;
>> unsigned long defer_warn;
>> + unsigned long refcnt;
>> };
>>
>> #endif /* __LINUX_NET_XDP_PRIV_H__ */
>> diff --git a/net/core/xdp.c b/net/core/xdp.c
>> index f98ab6b98674..6239483e3793 100644
>> --- a/net/core/xdp.c
>> +++ b/net/core/xdp.c
>> @@ -98,6 +98,12 @@ static bool __mem_id_disconnect(int id, bool force)
>> WARN(1, "Request remove non-existing id(%d), driver bug?", id);
>> return true;
>> }
>> +
>> + if (--xa->refcnt) {
>> + mutex_unlock(&mem_id_lock);
>> + return true;
>
>This doesn't work. This function __mem_id_disconnect() can be called
>multiple times. E.g. if there are in-flight packets.
Yes, it was draft. I still have not completely verify it due
to several changes in cpsw and holiday in my side, second draft
looks like:

Subject: [PATCH] net: core: xdp: allow same allocator for rxq

XDP rxq can be same for ndevs running under same rx napi softirq.
But there is no ability to register same allocator for both rxqs,
by fact it's same rxq but has different ndev as a reference.

Due to last changes allocator destroy can be defered till the moment
all packets are recycled by destination interface, afterwards it's
freed. In order to schedule allocator destroy only after all users are
unregistered, add refcnt to allocator object and start to destroy
only it reaches 0.

Signed-off-by: Ivan Khoronzhuk <[email protected]>
---
include/net/xdp_priv.h | 1 +
net/core/xdp.c | 46 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+)

diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
index 6a8cba6ea79a..995b21da2f27 100644
--- a/include/net/xdp_priv.h
+++ b/include/net/xdp_priv.h
@@ -18,6 +18,7 @@ struct xdp_mem_allocator {
struct rcu_head rcu;
struct delayed_work defer_wq;
unsigned long defer_warn;
+ unsigned long refcnt;
};

#endif /* __LINUX_NET_XDP_PRIV_H__ */
diff --git a/net/core/xdp.c b/net/core/xdp.c
index f98ab6b98674..7b0185eec124 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -98,6 +98,18 @@ static bool __mem_id_disconnect(int id, bool force)
WARN(1, "Request remove non-existing id(%d), driver bug?", id);
return true;
}
+
+ /* to avoid hash lookup twice do refcnt dec here, but not when
+ * it's 0 as it can be called from workqueue aftewards
+ */
+ if (xa->refcnt)
+ xa->refcnt--;
+
+ if (xa->refcnt) {
+ mutex_unlock(&mem_id_lock);
+ return true;
+ }
+
xa->disconnect_cnt++;

/* Detects in-flight packet-pages for page_pool */
@@ -312,6 +324,33 @@ static bool __is_supported_mem_type(enum xdp_mem_type type)
return true;
}

+static struct xdp_mem_allocator *xdp_allocator_get(void *allocator)
+{
+ struct xdp_mem_allocator *xae, *xa = NULL;
+ struct rhashtable_iter iter;
+
+ mutex_lock(&mem_id_lock);
+ rhashtable_walk_enter(mem_id_ht, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ while ((xae = rhashtable_walk_next(&iter)) && !IS_ERR(xae)) {
+ if (xae->allocator == allocator) {
+ xae->refcnt++;
+ xa = xae;
+ break;
+ }
+ }
+
+ rhashtable_walk_stop(&iter);
+
+ } while (xae == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+ mutex_unlock(&mem_id_lock);
+
+ return xa;
+}
+
int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
enum xdp_mem_type type, void *allocator)
{
@@ -347,6 +386,12 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
}
}

+ xdp_alloc = xdp_allocator_get(allocator);
+ if (xdp_alloc) {
+ xdp_rxq->mem.id = xdp_alloc->mem.id;
+ return 0;
+ }
+
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
if (!xdp_alloc)
return -ENOMEM;
@@ -360,6 +405,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
xdp_rxq->mem.id = id;
xdp_alloc->mem = xdp_rxq->mem;
xdp_alloc->allocator = allocator;
+ xdp_alloc->refcnt = 1;

/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);

--
Regards,
Ivan Khoronzhuk