This reverts commit f83331bab149e29fa2c49cf102c0cd8c3f1ce9f9.
As tests on PPC64 (powernv platform) show, IOMMU pages leak
when transferring a large number of small packets (<=64 bytes);
running "ping -f" and waiting for 15 seconds is the simplest way to confirm the bug.
Cc: Linus Torvalds <[email protected]>
Cc: Santosh Rastapur <[email protected]>
Cc: Jay Fenlason <[email protected]>
Cc: David S. Miller <[email protected]>
Cc: Divy Le ray <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
---
drivers/net/ethernet/chelsio/cxgb3/sge.c | 107 +++++++------------------------
1 file changed, 24 insertions(+), 83 deletions(-)
diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c
index 687ec4a..9c89dc8 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
@@ -455,11 +455,6 @@ static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
q->pg_chunk.offset = 0;
mapping = pci_map_page(adapter->pdev, q->pg_chunk.page,
0, q->alloc_size, PCI_DMA_FROMDEVICE);
- if (unlikely(pci_dma_mapping_error(adapter->pdev, mapping))) {
- __free_pages(q->pg_chunk.page, order);
- q->pg_chunk.page = NULL;
- return -EIO;
- }
q->pg_chunk.mapping = mapping;
}
sd->pg_chunk = q->pg_chunk;
@@ -954,75 +949,40 @@ static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
return flits_to_desc(flits);
}
-
-/* map_skb - map a packet main body and its page fragments
- * @pdev: the PCI device
- * @skb: the packet
- * @addr: placeholder to save the mapped addresses
- *
- * map the main body of an sk_buff and its page fragments, if any.
- */
-static int map_skb(struct pci_dev *pdev, const struct sk_buff *skb,
- dma_addr_t *addr)
-{
- const skb_frag_t *fp, *end;
- const struct skb_shared_info *si;
-
- *addr = pci_map_single(pdev, skb->data, skb_headlen(skb),
- PCI_DMA_TODEVICE);
- if (pci_dma_mapping_error(pdev, *addr))
- goto out_err;
-
- si = skb_shinfo(skb);
- end = &si->frags[si->nr_frags];
-
- for (fp = si->frags; fp < end; fp++) {
- *++addr = skb_frag_dma_map(&pdev->dev, fp, 0, skb_frag_size(fp),
- DMA_TO_DEVICE);
- if (pci_dma_mapping_error(pdev, *addr))
- goto unwind;
- }
- return 0;
-
-unwind:
- while (fp-- > si->frags)
- dma_unmap_page(&pdev->dev, *--addr, skb_frag_size(fp),
- DMA_TO_DEVICE);
-
- pci_unmap_single(pdev, addr[-1], skb_headlen(skb), PCI_DMA_TODEVICE);
-out_err:
- return -ENOMEM;
-}
-
/**
- * write_sgl - populate a scatter/gather list for a packet
+ * make_sgl - populate a scatter/gather list for a packet
* @skb: the packet
* @sgp: the SGL to populate
* @start: start address of skb main body data to include in the SGL
* @len: length of skb main body data to include in the SGL
- * @addr: the list of the mapped addresses
+ * @pdev: the PCI device
*
- * Copies the scatter/gather list for the buffers that make up a packet
+ * Generates a scatter/gather list for the buffers that make up a packet
* and returns the SGL size in 8-byte words. The caller must size the SGL
* appropriately.
*/
-static inline unsigned int write_sgl(const struct sk_buff *skb,
+static inline unsigned int make_sgl(const struct sk_buff *skb,
struct sg_ent *sgp, unsigned char *start,
- unsigned int len, const dma_addr_t *addr)
+ unsigned int len, struct pci_dev *pdev)
{
- unsigned int i, j = 0, k = 0, nfrags;
+ dma_addr_t mapping;
+ unsigned int i, j = 0, nfrags;
if (len) {
+ mapping = pci_map_single(pdev, start, len, PCI_DMA_TODEVICE);
sgp->len[0] = cpu_to_be32(len);
- sgp->addr[j++] = cpu_to_be64(addr[k++]);
+ sgp->addr[0] = cpu_to_be64(mapping);
+ j = 1;
}
nfrags = skb_shinfo(skb)->nr_frags;
for (i = 0; i < nfrags; i++) {
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ mapping = skb_frag_dma_map(&pdev->dev, frag, 0, skb_frag_size(frag),
+ DMA_TO_DEVICE);
sgp->len[j] = cpu_to_be32(skb_frag_size(frag));
- sgp->addr[j] = cpu_to_be64(addr[k++]);
+ sgp->addr[j] = cpu_to_be64(mapping);
j ^= 1;
if (j == 0)
++sgp;
@@ -1178,7 +1138,7 @@ static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
const struct port_info *pi,
unsigned int pidx, unsigned int gen,
struct sge_txq *q, unsigned int ndesc,
- unsigned int compl, const dma_addr_t *addr)
+ unsigned int compl)
{
unsigned int flits, sgl_flits, cntrl, tso_info;
struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
@@ -1236,7 +1196,7 @@ static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
}
sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
- sgl_flits = write_sgl(skb, sgp, skb->data, skb_headlen(skb), addr);
+ sgl_flits = make_sgl(skb, sgp, skb->data, skb_headlen(skb), adap->pdev);
write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
@@ -1267,7 +1227,6 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
struct netdev_queue *txq;
struct sge_qset *qs;
struct sge_txq *q;
- dma_addr_t addr[MAX_SKB_FRAGS + 1];
/*
* The chip min packet length is 9 octets but play safe and reject
@@ -1296,11 +1255,6 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_BUSY;
}
- if (unlikely(map_skb(adap->pdev, skb, addr) < 0)) {
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
- }
-
q->in_use += ndesc;
if (unlikely(credits - ndesc < q->stop_thres)) {
t3_stop_tx_queue(txq, qs, q);
@@ -1358,7 +1312,7 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
if (likely(!skb_shared(skb)))
skb_orphan(skb);
- write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr);
+ write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl);
check_ring_tx_db(adap, q);
return NETDEV_TX_OK;
}
@@ -1623,8 +1577,7 @@ static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
*/
static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
struct sge_txq *q, unsigned int pidx,
- unsigned int gen, unsigned int ndesc,
- const dma_addr_t *addr)
+ unsigned int gen, unsigned int ndesc)
{
unsigned int sgl_flits, flits;
struct work_request_hdr *from;
@@ -1645,9 +1598,9 @@ static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
flits = skb_transport_offset(skb) / 8;
sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
- sgl_flits = write_sgl(skb, sgp, skb_transport_header(skb),
- skb_tail_pointer(skb) -
- skb_transport_header(skb), addr);
+ sgl_flits = make_sgl(skb, sgp, skb_transport_header(skb),
+ skb->tail - skb->transport_header,
+ adap->pdev);
if (need_skb_unmap()) {
setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
skb->destructor = deferred_unmap_destructor;
@@ -1705,11 +1658,6 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
goto again;
}
- if (map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) {
- spin_unlock(&q->lock);
- return NET_XMIT_SUCCESS;
- }
-
gen = q->gen;
q->in_use += ndesc;
pidx = q->pidx;
@@ -1720,7 +1668,7 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
}
spin_unlock(&q->lock);
- write_ofld_wr(adap, skb, q, pidx, gen, ndesc, (dma_addr_t *)skb->head);
+ write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
check_ring_tx_db(adap, q);
return NET_XMIT_SUCCESS;
}
@@ -1738,7 +1686,6 @@ static void restart_offloadq(unsigned long data)
struct sge_txq *q = &qs->txq[TXQ_OFLD];
const struct port_info *pi = netdev_priv(qs->netdev);
struct adapter *adap = pi->adapter;
- unsigned int written = 0;
spin_lock(&q->lock);
again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
@@ -1758,14 +1705,10 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
break;
}
- if (map_skb(adap->pdev, skb, (dma_addr_t *)skb->head))
- break;
-
gen = q->gen;
q->in_use += ndesc;
pidx = q->pidx;
q->pidx += ndesc;
- written += ndesc;
if (q->pidx >= q->size) {
q->pidx -= q->size;
q->gen ^= 1;
@@ -1773,8 +1716,7 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
__skb_unlink(skb, &q->sendq);
spin_unlock(&q->lock);
- write_ofld_wr(adap, skb, q, pidx, gen, ndesc,
- (dma_addr_t *)skb->head);
+ write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
spin_lock(&q->lock);
}
spin_unlock(&q->lock);
@@ -1784,9 +1726,8 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
set_bit(TXQ_LAST_PKT_DB, &q->flags);
#endif
wmb();
- if (likely(written))
- t3_write_reg(adap, A_SG_KDOORBELL,
- F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+ t3_write_reg(adap, A_SG_KDOORBELL,
+ F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
}
/**
--
1.8.3.2
On 08/14/2013 02:19 AM, Alexey Kardashevskiy wrote:
> This reverts commit f83331bab149e29fa2c49cf102c0cd8c3f1ce9f9.
>
> As the tests PPC64 (powernv platform) show, IOMMU pages are leaking
> when transferring big amount of small packets (<=64 bytes),
> "ping -f" and waiting for 15 seconds is the simplest way to confirm the bug.
>
> Cc: Linus Torvalds<[email protected]>
> Cc: Santosh Rastapur<[email protected]>
> Cc: Jay Fenlason<[email protected]>
> Cc: David S. Miller<[email protected]>
> Cc: Divy Le ray<[email protected]>
> Signed-off-by: Alexey Kardashevskiy<[email protected]>
Acked-by: Divy Le Ray <[email protected]>
We are revisiting this patch in the light of the leak, and will repost
once fixed.
Cheers,
Divy
> ---
> drivers/net/ethernet/chelsio/cxgb3/sge.c | 107 +++++++------------------------
> 1 file changed, 24 insertions(+), 83 deletions(-)
>
> diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c
> index 687ec4a..9c89dc8 100644
> --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c
> +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
> @@ -455,11 +455,6 @@ static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
> q->pg_chunk.offset = 0;
> mapping = pci_map_page(adapter->pdev, q->pg_chunk.page,
> 0, q->alloc_size, PCI_DMA_FROMDEVICE);
> - if (unlikely(pci_dma_mapping_error(adapter->pdev, mapping))) {
> - __free_pages(q->pg_chunk.page, order);
> - q->pg_chunk.page = NULL;
> - return -EIO;
> - }
> q->pg_chunk.mapping = mapping;
> }
> sd->pg_chunk = q->pg_chunk;
> @@ -954,75 +949,40 @@ static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
> return flits_to_desc(flits);
> }
>
> -
> -/* map_skb - map a packet main body and its page fragments
> - * @pdev: the PCI device
> - * @skb: the packet
> - * @addr: placeholder to save the mapped addresses
> - *
> - * map the main body of an sk_buff and its page fragments, if any.
> - */
> -static int map_skb(struct pci_dev *pdev, const struct sk_buff *skb,
> - dma_addr_t *addr)
> -{
> - const skb_frag_t *fp, *end;
> - const struct skb_shared_info *si;
> -
> - *addr = pci_map_single(pdev, skb->data, skb_headlen(skb),
> - PCI_DMA_TODEVICE);
> - if (pci_dma_mapping_error(pdev, *addr))
> - goto out_err;
> -
> - si = skb_shinfo(skb);
> - end = &si->frags[si->nr_frags];
> -
> - for (fp = si->frags; fp < end; fp++) {
> - *++addr = skb_frag_dma_map(&pdev->dev, fp, 0, skb_frag_size(fp),
> - DMA_TO_DEVICE);
> - if (pci_dma_mapping_error(pdev, *addr))
> - goto unwind;
> - }
> - return 0;
> -
> -unwind:
> - while (fp-- > si->frags)
> - dma_unmap_page(&pdev->dev, *--addr, skb_frag_size(fp),
> - DMA_TO_DEVICE);
> -
> - pci_unmap_single(pdev, addr[-1], skb_headlen(skb), PCI_DMA_TODEVICE);
> -out_err:
> - return -ENOMEM;
> -}
> -
> /**
> - * write_sgl - populate a scatter/gather list for a packet
> + * make_sgl - populate a scatter/gather list for a packet
> * @skb: the packet
> * @sgp: the SGL to populate
> * @start: start address of skb main body data to include in the SGL
> * @len: length of skb main body data to include in the SGL
> - * @addr: the list of the mapped addresses
> + * @pdev: the PCI device
> *
> - * Copies the scatter/gather list for the buffers that make up a packet
> + * Generates a scatter/gather list for the buffers that make up a packet
> * and returns the SGL size in 8-byte words. The caller must size the SGL
> * appropriately.
> */
> -static inline unsigned int write_sgl(const struct sk_buff *skb,
> +static inline unsigned int make_sgl(const struct sk_buff *skb,
> struct sg_ent *sgp, unsigned char *start,
> - unsigned int len, const dma_addr_t *addr)
> + unsigned int len, struct pci_dev *pdev)
> {
> - unsigned int i, j = 0, k = 0, nfrags;
> + dma_addr_t mapping;
> + unsigned int i, j = 0, nfrags;
>
> if (len) {
> + mapping = pci_map_single(pdev, start, len, PCI_DMA_TODEVICE);
> sgp->len[0] = cpu_to_be32(len);
> - sgp->addr[j++] = cpu_to_be64(addr[k++]);
> + sgp->addr[0] = cpu_to_be64(mapping);
> + j = 1;
> }
>
> nfrags = skb_shinfo(skb)->nr_frags;
> for (i = 0; i < nfrags; i++) {
> const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>
> + mapping = skb_frag_dma_map(&pdev->dev, frag, 0, skb_frag_size(frag),
> + DMA_TO_DEVICE);
> sgp->len[j] = cpu_to_be32(skb_frag_size(frag));
> - sgp->addr[j] = cpu_to_be64(addr[k++]);
> + sgp->addr[j] = cpu_to_be64(mapping);
> j ^= 1;
> if (j == 0)
> ++sgp;
> @@ -1178,7 +1138,7 @@ static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
> const struct port_info *pi,
> unsigned int pidx, unsigned int gen,
> struct sge_txq *q, unsigned int ndesc,
> - unsigned int compl, const dma_addr_t *addr)
> + unsigned int compl)
> {
> unsigned int flits, sgl_flits, cntrl, tso_info;
> struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
> @@ -1236,7 +1196,7 @@ static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
> }
>
> sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
> - sgl_flits = write_sgl(skb, sgp, skb->data, skb_headlen(skb), addr);
> + sgl_flits = make_sgl(skb, sgp, skb->data, skb_headlen(skb), adap->pdev);
>
> write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
> htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
> @@ -1267,7 +1227,6 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
> struct netdev_queue *txq;
> struct sge_qset *qs;
> struct sge_txq *q;
> - dma_addr_t addr[MAX_SKB_FRAGS + 1];
>
> /*
> * The chip min packet length is 9 octets but play safe and reject
> @@ -1296,11 +1255,6 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
> return NETDEV_TX_BUSY;
> }
>
> - if (unlikely(map_skb(adap->pdev, skb, addr) < 0)) {
> - dev_kfree_skb(skb);
> - return NETDEV_TX_OK;
> - }
> -
> q->in_use += ndesc;
> if (unlikely(credits - ndesc < q->stop_thres)) {
> t3_stop_tx_queue(txq, qs, q);
> @@ -1358,7 +1312,7 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
> if (likely(!skb_shared(skb)))
> skb_orphan(skb);
>
> - write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr);
> + write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl);
> check_ring_tx_db(adap, q);
> return NETDEV_TX_OK;
> }
> @@ -1623,8 +1577,7 @@ static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
> */
> static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
> struct sge_txq *q, unsigned int pidx,
> - unsigned int gen, unsigned int ndesc,
> - const dma_addr_t *addr)
> + unsigned int gen, unsigned int ndesc)
> {
> unsigned int sgl_flits, flits;
> struct work_request_hdr *from;
> @@ -1645,9 +1598,9 @@ static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
>
> flits = skb_transport_offset(skb) / 8;
> sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
> - sgl_flits = write_sgl(skb, sgp, skb_transport_header(skb),
> - skb_tail_pointer(skb) -
> - skb_transport_header(skb), addr);
> + sgl_flits = make_sgl(skb, sgp, skb_transport_header(skb),
> + skb->tail - skb->transport_header,
> + adap->pdev);
> if (need_skb_unmap()) {
> setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
> skb->destructor = deferred_unmap_destructor;
> @@ -1705,11 +1658,6 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
> goto again;
> }
>
> - if (map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) {
> - spin_unlock(&q->lock);
> - return NET_XMIT_SUCCESS;
> - }
> -
> gen = q->gen;
> q->in_use += ndesc;
> pidx = q->pidx;
> @@ -1720,7 +1668,7 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
> }
> spin_unlock(&q->lock);
>
> - write_ofld_wr(adap, skb, q, pidx, gen, ndesc, (dma_addr_t *)skb->head);
> + write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
> check_ring_tx_db(adap, q);
> return NET_XMIT_SUCCESS;
> }
> @@ -1738,7 +1686,6 @@ static void restart_offloadq(unsigned long data)
> struct sge_txq *q = &qs->txq[TXQ_OFLD];
> const struct port_info *pi = netdev_priv(qs->netdev);
> struct adapter *adap = pi->adapter;
> - unsigned int written = 0;
>
> spin_lock(&q->lock);
> again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
> @@ -1758,14 +1705,10 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
> break;
> }
>
> - if (map_skb(adap->pdev, skb, (dma_addr_t *)skb->head))
> - break;
> -
> gen = q->gen;
> q->in_use += ndesc;
> pidx = q->pidx;
> q->pidx += ndesc;
> - written += ndesc;
> if (q->pidx >= q->size) {
> q->pidx -= q->size;
> q->gen ^= 1;
> @@ -1773,8 +1716,7 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
> __skb_unlink(skb, &q->sendq);
> spin_unlock(&q->lock);
>
> - write_ofld_wr(adap, skb, q, pidx, gen, ndesc,
> - (dma_addr_t *)skb->head);
> + write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
> spin_lock(&q->lock);
> }
> spin_unlock(&q->lock);
> @@ -1784,9 +1726,8 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
> set_bit(TXQ_LAST_PKT_DB, &q->flags);
> #endif
> wmb();
> - if (likely(written))
> - t3_write_reg(adap, A_SG_KDOORBELL,
> - F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
> + t3_write_reg(adap, A_SG_KDOORBELL,
> + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
> }
>
> /**
From: Divy Le ray <[email protected]>
Date: Wed, 14 Aug 2013 08:57:24 -0700
> On 08/14/2013 02:19 AM, Alexey Kardashevskiy wrote:
>> This reverts commit f83331bab149e29fa2c49cf102c0cd8c3f1ce9f9.
>>
>> As the tests PPC64 (powernv platform) show, IOMMU pages are leaking
>> when transferring big amount of small packets (<=64 bytes),
>> "ping -f" and waiting for 15 seconds is the simplest way to confirm
>> the bug.
>>
>> Cc: Linus Torvalds<[email protected]>
>> Cc: Santosh Rastapur<[email protected]>
>> Cc: Jay Fenlason<[email protected]>
>> Cc: David S. Miller<[email protected]>
>> Cc: Divy Le ray<[email protected]>
>> Signed-off-by: Alexey Kardashevskiy<[email protected]>
>
> Acked-by: Divy Le Ray <[email protected]>
Applied, thanks.