2005-05-18 01:56:23

by Christoph Lameter

[permalink] [raw]
Subject: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

NUMA awareness for the e1000 driver. Allocate transmit and receive buffers
on the node of the device.

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Justin M. Forbes <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>

Index: linux-2.6.11/drivers/net/e1000/e1000_main.c
===================================================================
--- linux-2.6.11.orig/drivers/net/e1000/e1000_main.c 2005-05-17 16:44:20.000000000 -0700
+++ linux-2.6.11/drivers/net/e1000/e1000_main.c 2005-05-17 17:47:43.000000000 -0700
@@ -120,8 +120,8 @@ int e1000_up(struct e1000_adapter *adapt
void e1000_down(struct e1000_adapter *adapter);
void e1000_reset(struct e1000_adapter *adapter);
int e1000_set_spd_dplx(struct e1000_adapter *adapter, uint16_t spddplx);
-int e1000_setup_tx_resources(struct e1000_adapter *adapter);
-int e1000_setup_rx_resources(struct e1000_adapter *adapter);
+int e1000_setup_tx_resources(struct e1000_adapter *adapter, int node);
+int e1000_setup_rx_resources(struct e1000_adapter *adapter, int node);
void e1000_free_tx_resources(struct e1000_adapter *adapter);
void e1000_free_rx_resources(struct e1000_adapter *adapter);
void e1000_update_stats(struct e1000_adapter *adapter);
@@ -513,6 +513,7 @@ e1000_probe(struct pci_dev *pdev,
netdev->mem_start = mmio_start;
netdev->mem_end = mmio_start + mmio_len;
netdev->base_addr = adapter->hw.io_base;
+ netdev->node = pcibus_to_node(pdev->bus);

adapter->bd_number = cards_found;

@@ -785,12 +786,12 @@ e1000_open(struct net_device *netdev)

/* allocate transmit descriptors */

- if((err = e1000_setup_tx_resources(adapter)))
+ if((err = e1000_setup_tx_resources(adapter, netdev->node)))
goto err_setup_tx;

/* allocate receive descriptors */

- if((err = e1000_setup_rx_resources(adapter)))
+ if((err = e1000_setup_rx_resources(adapter, netdev->node)))
goto err_setup_rx;

if((err = e1000_up(adapter)))
@@ -866,14 +867,14 @@ e1000_check_64k_bound(struct e1000_adapt
**/

int
-e1000_setup_tx_resources(struct e1000_adapter *adapter)
+e1000_setup_tx_resources(struct e1000_adapter *adapter, int node)
{
struct e1000_desc_ring *txdr = &adapter->tx_ring;
struct pci_dev *pdev = adapter->pdev;
int size;

size = sizeof(struct e1000_buffer) * txdr->count;
- txdr->buffer_info = vmalloc(size);
+ txdr->buffer_info = kmalloc_node(size, GFP_KERNEL, node);
if(!txdr->buffer_info) {
DPRINTK(PROBE, ERR,
"Unable to Allocate Memory for the Transmit descriptor ring\n");
@@ -891,7 +892,7 @@ e1000_setup_tx_resources(struct e1000_ad
setup_tx_desc_die:
DPRINTK(PROBE, ERR,
"Unable to Allocate Memory for the Transmit descriptor ring\n");
- vfree(txdr->buffer_info);
+ kfree(txdr->buffer_info);
return -ENOMEM;
}

@@ -1018,14 +1019,14 @@ e1000_configure_tx(struct e1000_adapter
**/

int
-e1000_setup_rx_resources(struct e1000_adapter *adapter)
+e1000_setup_rx_resources(struct e1000_adapter *adapter, int node)
{
struct e1000_desc_ring *rxdr = &adapter->rx_ring;
struct pci_dev *pdev = adapter->pdev;
int size;

size = sizeof(struct e1000_buffer) * rxdr->count;
- rxdr->buffer_info = vmalloc(size);
+ rxdr->buffer_info = kmalloc_node(size, GFP_KERNEL, node);
if(!rxdr->buffer_info) {
DPRINTK(PROBE, ERR,
"Unable to Allocate Memory for the Recieve descriptor ring\n");
@@ -1044,7 +1045,7 @@ e1000_setup_rx_resources(struct e1000_ad
setup_rx_desc_die:
DPRINTK(PROBE, ERR,
"Unble to Allocate Memory for the Recieve descriptor ring\n");
- vfree(rxdr->buffer_info);
+ kfree(rxdr->buffer_info);
return -ENOMEM;
}

@@ -1197,7 +1198,7 @@ e1000_free_tx_resources(struct e1000_ada

e1000_clean_tx_ring(adapter);

- vfree(adapter->tx_ring.buffer_info);
+ kfree(adapter->tx_ring.buffer_info);
adapter->tx_ring.buffer_info = NULL;

pci_free_consistent(pdev, adapter->tx_ring.size,
@@ -1279,7 +1280,7 @@ e1000_free_rx_resources(struct e1000_ada

e1000_clean_rx_ring(adapter);

- vfree(rx_ring->buffer_info);
+ kfree(rx_ring->buffer_info);
rx_ring->buffer_info = NULL;

pci_free_consistent(pdev, rx_ring->size, rx_ring->desc, rx_ring->dma);
Index: linux-2.6.11/include/linux/netdevice.h
===================================================================
--- linux-2.6.11.orig/include/linux/netdevice.h 2005-05-17 16:44:20.000000000 -0700
+++ linux-2.6.11/include/linux/netdevice.h 2005-05-17 17:03:01.000000000 -0700
@@ -279,6 +279,7 @@ struct net_device
unsigned long mem_start; /* shared mem start */
unsigned long base_addr; /* device I/O address */
unsigned int irq; /* device IRQ number */
+ unsigned int node; /* device node number */

/*
* Some hardware also needs these fields, but they are not


2005-05-18 02:04:44

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

Christoph Lameter <[email protected]> wrote:
>
> NUMA awareness for the e1000 driver. Allocate transmit and receive buffers
> on the node of the device.

Hast thou any benchmarking results?

> - txdr->buffer_info = vmalloc(size);
> + txdr->buffer_info = kmalloc_node(size, GFP_KERNEL, node);

How come this is safe to do?

2005-05-18 02:53:01

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

On Tue, 17 May 2005, Andrew Morton wrote:

> Christoph Lameter <[email protected]> wrote:
> >
> > NUMA awareness for the e1000 driver. Allocate transmit and receive buffers
> > on the node of the device.
>
> Hast thou any benchmarking results?

Yes, your honor. Just a second .... The patch has been around for a long
time.

No benchmarks results in my email archive. Would need to talk to some
folks tomorrow and maybe we would have to run some new
benchmarks.

> > - txdr->buffer_info = vmalloc(size);
> > + txdr->buffer_info = kmalloc_node(size, GFP_KERNEL, node);
>
> How come that this is safe to do

Because physically contiguous memory is usually better than virtually
contiguous memory? Any reason that physically contiguous memory will
break the driver?

2005-05-18 04:30:49

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

On Tue, 17 May 2005, David S. Miller wrote:

> > Because physically contiguous memory is usually better than virtually
> > contiguous memory? Any reason that physically contiguous memory will
> > break the driver?
>
> The issue is whether size can end up being too large for
> kmalloc() to satisfy, whereas vmalloc() would be able to
> handle it.

Oww.. We need a NUMA aware vmalloc for this?

2005-05-18 04:59:37

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

Christoph Lameter <[email protected]> wrote:
>
> On Tue, 17 May 2005, David S. Miller wrote:
>
> > > Because physically contiguous memory is usually better than virtually
> > > contiguous memory? Any reason that physically contiguous memory will
> > > break the driver?
> >
> > The issue is whether size can end up being too large for
> > kmalloc() to satisfy, whereas vmalloc() would be able to
> > handle it.
>
> Oww.. We need a NUMA aware vmalloc for this?

I think the e1000 driver is being a bit insane there. I figure that
sizeof(struct e1000_buffer) is 28 on 64-bit, so even with 4k pagesize we'll
always succeed in being able to support a 32k/32 = 1024-entry Tx ring.

Is there any real-world reason for wanting larger ring sizes than that?

2005-05-18 13:19:55

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

Christoph Lameter <[email protected]> writes:

> On Tue, 17 May 2005, David S. Miller wrote:
>
>> > Because physically contiguous memory is usually better than virtually
>> > contiguous memory? Any reason that physically contiguous memory will
>> > break the driver?
>>
>> The issue is whether size can end up being too large for
>> kmalloc() to satisfy, whereas vmalloc() would be able to
>> handle it.
>
> Oww.. We need a NUMA aware vmalloc for this?

You can do that already by just changing process NUMA policy temporarily
while calling vmalloc.

-Andi

2005-05-18 16:13:56

by Ganesh Venkatesan

[permalink] [raw]
Subject: Re: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

On 5/17/05, Andrew Morton <[email protected]> wrote:
> I think the e1000 driver is being a bit insane there. I figure that
Do you mean insane to use vmalloc?

> sizeof(struct e1000_buffer) is 28 on 64-bit, so even with 4k pagesize we'll
> always succeed in being able to support a 32k/32 = 1024-entry Tx ring.
>
> Is there any real-world reason for wanting larger ring sizes than that?
>
>
We have had cases where allocation of 32K of memory (via kmalloc) fails.

ganesh.

2005-05-18 20:43:20

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] NUMA aware allocation of transmit and receive buffers for e1000

Ganesh Venkatesan <[email protected]> wrote:
>
> On 5/17/05, Andrew Morton <[email protected]> wrote:
> > I think the e1000 driver is being a bit insane there. I figure that
> Do you mean insane to use vmalloc?
>
> > sizeof(struct e1000_buffer) is 28 on 64-bit, so even with 4k pagesize we'll
> > always succeed in being able to support a 32k/32 = 1024-entry Tx ring.
> >
> > Is there any real-world reason for wanting larger ring sizes than that?
> >
> >
> We have had cases where allocation of 32K of memory (via kmalloc) fails.
>

Are you sure? The current page allocator will infinitely loop until
success for <=32k GFP_KERNEL allocations - the only way it can fail is if
the calling process gets oom-killed.

2005-05-27 21:53:24

by Christoph Lameter

[permalink] [raw]
Subject: [PATCH] e1000: NUMA aware allocation of descriptors V2

NUMA awareness for the e1000 driver. Allocate tx and rx descriptors
on the node of the device.

It is safe to replace vmalloc by kmalloc node since only the descriptors
are allocated in a NUMA aware way. These will not be so large that the
use of vmalloc becomes necesssary.

The patch includes a modification to slab.h to revert from inline functions
for kmalloc_node/kmem_cache_alloc_node to a macro so that an undefined variable
may be specified. Is that ok? If so then I probably need to spin a separate
patch just for slab.h.

V1-V2:
- Patch against 2.6.12-rc5-mm1
- Do not defined netdev->node for non NUMA case
- Change kmem_cache_alloc_node and kmalloc_node to fall back to macro
definitions for the non numa case so that an undefined variable can be
specified.

References to earlier discussions:
http://marc.theaimsgroup.com/?t=111638151000001&r=1&w=2

Note that i386 pci_alloc_coherent also needs to be made NUMA aware.

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Justin M. Forbes <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>

Index: linux-2.6.12-rc5/drivers/net/e1000/e1000_main.c
===================================================================
--- linux-2.6.12-rc5.orig/drivers/net/e1000/e1000_main.c 2005-05-27 21:22:39.000000000 +0000
+++ linux-2.6.12-rc5/drivers/net/e1000/e1000_main.c 2005-05-27 21:25:28.000000000 +0000
@@ -567,7 +567,9 @@
netdev->mem_start = mmio_start;
netdev->mem_end = mmio_start + mmio_len;
netdev->base_addr = adapter->hw.io_base;
-
+#ifdef CONFIG_NUMA
+ netdev->node = pcibus_to_node(pdev->bus);
+#endif
adapter->bd_number = cards_found;

/* setup the private structure */
@@ -971,7 +973,9 @@
int size;

size = sizeof(struct e1000_buffer) * txdr->count;
- txdr->buffer_info = vmalloc(size);
+
+ txdr->buffer_info = kmalloc_node(size, GFP_KERNEL, adapter->netdev->node );
+
if(!txdr->buffer_info) {
DPRINTK(PROBE, ERR,
"Unable to allocate memory for the transmit descriptor ring\n");
@@ -987,7 +991,7 @@
txdr->desc = pci_alloc_consistent(pdev, txdr->size, &txdr->dma);
if(!txdr->desc) {
setup_tx_desc_die:
- vfree(txdr->buffer_info);
+ kfree(txdr->buffer_info);
DPRINTK(PROBE, ERR,
"Unable to allocate memory for the transmit descriptor ring\n");
return -ENOMEM;
@@ -1015,7 +1019,7 @@
DPRINTK(PROBE, ERR,
"Unable to allocate aligned memory "
"for the transmit descriptor ring\n");
- vfree(txdr->buffer_info);
+ kfree(txdr->buffer_info);
return -ENOMEM;
} else {
/* Free old allocation, new allocation was successful */
@@ -1123,7 +1127,8 @@
int size, desc_len;

size = sizeof(struct e1000_buffer) * rxdr->count;
- rxdr->buffer_info = vmalloc(size);
+ rxdr->buffer_info = kmalloc_node(size, GFP_KERNEL, adapter->netdev->node);
+
if(!rxdr->buffer_info) {
DPRINTK(PROBE, ERR,
"Unable to allocate memory for the receive descriptor ring\n");
@@ -1134,7 +1139,7 @@
size = sizeof(struct e1000_ps_page) * rxdr->count;
rxdr->ps_page = kmalloc(size, GFP_KERNEL);
if(!rxdr->ps_page) {
- vfree(rxdr->buffer_info);
+ kfree(rxdr->buffer_info);
DPRINTK(PROBE, ERR,
"Unable to allocate memory for the receive descriptor ring\n");
return -ENOMEM;
@@ -1144,7 +1149,7 @@
size = sizeof(struct e1000_ps_page_dma) * rxdr->count;
rxdr->ps_page_dma = kmalloc(size, GFP_KERNEL);
if(!rxdr->ps_page_dma) {
- vfree(rxdr->buffer_info);
+ kfree(rxdr->buffer_info);
kfree(rxdr->ps_page);
DPRINTK(PROBE, ERR,
"Unable to allocate memory for the receive descriptor ring\n");
@@ -1166,7 +1171,7 @@

if(!rxdr->desc) {
setup_rx_desc_die:
- vfree(rxdr->buffer_info);
+ kfree(rxdr->buffer_info);
kfree(rxdr->ps_page);
kfree(rxdr->ps_page_dma);
DPRINTK(PROBE, ERR,
@@ -1196,7 +1201,7 @@
DPRINTK(PROBE, ERR,
"Unable to allocate aligned memory "
"for the receive descriptor ring\n");
- vfree(rxdr->buffer_info);
+ kfree(rxdr->buffer_info);
kfree(rxdr->ps_page);
kfree(rxdr->ps_page_dma);
return -ENOMEM;
@@ -1393,7 +1398,7 @@

e1000_clean_tx_ring(adapter);

- vfree(adapter->tx_ring.buffer_info);
+ kfree(adapter->tx_ring.buffer_info);
adapter->tx_ring.buffer_info = NULL;

pci_free_consistent(pdev, adapter->tx_ring.size,
@@ -1473,7 +1478,7 @@

e1000_clean_rx_ring(adapter);

- vfree(rx_ring->buffer_info);
+ kfree(rx_ring->buffer_info);
rx_ring->buffer_info = NULL;
kfree(rx_ring->ps_page);
rx_ring->ps_page = NULL;
Index: linux-2.6.12-rc5/include/linux/netdevice.h
===================================================================
--- linux-2.6.12-rc5.orig/include/linux/netdevice.h 2005-05-27 21:22:39.000000000 +0000
+++ linux-2.6.12-rc5/include/linux/netdevice.h 2005-05-27 21:22:43.000000000 +0000
@@ -279,7 +279,9 @@
unsigned long mem_start; /* shared mem start */
unsigned long base_addr; /* device I/O address */
unsigned int irq; /* device IRQ number */
-
+#ifdef CONFIG_NUMA
+ unsigned int node; /* device node number */
+#endif
/*
* Some hardware also needs these fields, but they are not
* part of the usual set specified in Space.c.
Index: linux-2.6.12-rc5/include/linux/slab.h
===================================================================
--- linux-2.6.12-rc5.orig/include/linux/slab.h 2005-05-25 03:31:20.000000000 +0000
+++ linux-2.6.12-rc5/include/linux/slab.h 2005-05-27 21:30:42.000000000 +0000
@@ -106,14 +106,12 @@
extern void *kmem_cache_alloc_node(kmem_cache_t *, int flags, int node);
extern void *kmalloc_node(size_t size, int flags, int node);
#else
-static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int node)
-{
- return kmem_cache_alloc(cachep, flags);
-}
-static inline void *kmalloc_node(size_t size, int flags, int node)
-{
- return kmalloc(size, flags);
-}
+/*
+ * The definitions are macros here to allow the use of an undefined variable
+ * for the node. The variable may only be defined if CONFIG_NUMA is set.
+ */
+#define kmem_cache_alloc_node(__cachep, __flags, __node) kmem_cache_alloc(__cachep, __flags)
+#define kmalloc_node(__size, __flags, __node) kmalloc(__size, __flags)
#endif

extern int FASTCALL(kmem_cache_reap(int));

2005-05-27 23:24:53

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] e1000: NUMA aware allocation of descriptors V2

Christoph Lameter <[email protected]> wrote:
>
> NUMA awareness for the e1000 driver. Allocate tx and rx descriptors
> on the node of the device.
>
> It is safe to replace vmalloc by kmalloc node since only the descriptors
> are allocated in a NUMA aware way. These will not be so large that the
> use of vmalloc becomes necesssary.

Really? That's probably OK with the default number of tx descriptors, but
that number can be made arbitrarily large with a module parameter.

Could you please work this with the e1000 developers?