LinuxLists.cc - [net-next rfc 0/3] increase the limit of tuntap queues

[permalink] [raw]

Subject: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

Currently, we use kcalloc to allocate rx/tx queues for a net device which could
be easily lead to a high order memory allocation request when initializing a
multiqueue net device. We can simply avoid this by switching to use flex array
which always allocate at order zero.

Signed-off-by: Jason Wang <[email protected]>
---
include/linux/netdevice.h | 13 ++++++----
net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
net/core/net-sysfs.c | 15 +++++++----
3 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 09b4188..c0b5d04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -32,6 +32,7 @@
#include <linux/atomic.h>
#include <asm/cache.h>
#include <asm/byteorder.h>
+#include <linux/flex_array.h>

#include <linux/percpu.h>
#include <linux/rculist.h>
@@ -1230,7 +1231,7 @@ struct net_device {

#ifdef CONFIG_RPS
- struct netdev_rx_queue *_rx;
+ struct flex_array *_rx;

/* Number of RX queues allocated at register_netdev() time */
unsigned int num_rx_queues;
@@ -1250,7 +1251,7 @@ struct net_device {
/*
* Cache lines mostly used on transmit path
*/
- struct netdev_queue *_tx ____cacheline_aligned_in_smp;
+ struct flex_array *_tx ____cacheline_aligned_in_smp;

/* Number of TX queues allocated at alloc_netdev_mq() time */
unsigned int num_tx_queues;
@@ -1434,7 +1435,7 @@ static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
unsigned int index)
{
- return &dev->_tx[index];
+ return flex_array_get(dev->_tx, index);
}

static inline void netdev_for_each_tx_queue(struct net_device *dev,
@@ -1445,8 +1446,10 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
{
unsigned int i;

- for (i = 0; i < dev->num_tx_queues; i++)
- f(dev, &dev->_tx[i], arg);
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = flex_array_get(dev->_tx, i);
+ f(dev, txq, arg);
+ }
}

extern struct netdev_queue *netdev_pick_tx(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index fa007db..3a4ecb1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
+#include <linux/flex_array.h>

#include "net-sysfs.h"

@@ -2902,7 +2903,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (rxq_index == skb_get_rx_queue(skb))
goto out;

- rxqueue = dev->_rx + rxq_index;
+ rxqueue = flex_array_get(dev->_rx, rxq_index);
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
@@ -2950,9 +2951,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
dev->name, index, dev->real_num_rx_queues);
goto done;
}
- rxqueue = dev->_rx + index;
+ rxqueue = flex_array_get(dev->_rx, index);
} else
- rxqueue = dev->_rx;
+ rxqueue = flex_array_get(dev->_rx, 0);

map = rcu_dereference(rxqueue->rps_map);
if (map) {
@@ -3038,7 +3039,7 @@ done:
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
u32 flow_id, u16 filter_id)
{
- struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct netdev_rx_queue *rxqueue = flex_array_get(dev->_rx, rxq_index);
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
@@ -5223,18 +5224,31 @@ EXPORT_SYMBOL(netif_stacked_transfer_operstate);
static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
- struct netdev_rx_queue *rx;
+ struct flex_array *rx;
+ int err;

BUG_ON(count < 1);

- rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
- if (!rx)
+ rx = flex_array_alloc(sizeof(struct netdev_rx_queue), count,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!rx) {
+ pr_err("netdev: Unable to allocate flex array for rx queues\n");
return -ENOMEM;
+ }
+
+ err = flex_array_prealloc(rx, 0, count, GFP_KERNEL | __GFP_ZERO);
+ if (err) {
+ pr_err("netdev, Unable to prealloc %u rx qeueus\n", count);
+ flex_array_free(rx);
+ return err;
+ }

dev->_rx = rx;

- for (i = 0; i < count; i++)
- rx[i].dev = dev;
+ for (i = 0; i < count; i++) {
+ struct netdev_rx_queue *rxq = flex_array_get(rx, i);
+ rxq->dev = dev;
+ }
return 0;
}
#endif
@@ -5256,13 +5270,24 @@ static void netdev_init_one_queue(struct net_device *dev,
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
- struct netdev_queue *tx;
+ struct flex_array *tx;
+ int err;

BUG_ON(count < 1);

- tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx)
+ tx = flex_array_alloc(sizeof(struct netdev_queue), count,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!tx) {
+ pr_err("netdev: Unable to allocate flex array for tx queues\n");
return -ENOMEM;
+ }
+
+ err = flex_array_prealloc(tx, 0, count, GFP_KERNEL | __GFP_ZERO);
+ if (err) {
+ pr_err("netdev, Unable to prealloc %u tx qeueus\n", count);
+ flex_array_free(tx);
+ return err;
+ }

dev->_tx = tx;

@@ -5811,9 +5836,9 @@ free_all:

free_pcpu:
free_percpu(dev->pcpu_refcnt);
- kfree(dev->_tx);
+ flex_array_free(dev->_tx);
#ifdef CONFIG_RPS
- kfree(dev->_rx);
+ flex_array_free(dev->_rx);
#endif

free_p:
@@ -5836,9 +5861,9 @@ void free_netdev(struct net_device *dev)

release_net(dev_net(dev));

- kfree(dev->_tx);
+ flex_array_free(dev->_tx);
#ifdef CONFIG_RPS
- kfree(dev->_rx);
+ flex_array_free(dev->_rx);
#endif

kfree(rcu_dereference_protected(dev->ingress_queue, 1));
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 981fed3..4328c3a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -22,6 +22,7 @@
#include <linux/export.h>
#include <linux/jiffies.h>
#include <linux/pm_runtime.h>
+#include <linux/flex_array.h>

#include "net-sysfs.h"

@@ -717,7 +718,7 @@ static struct kobj_type rx_queue_ktype = {

static int rx_queue_add_kobject(struct net_device *net, int index)
{
- struct netdev_rx_queue *queue = net->_rx + index;
+ struct netdev_rx_queue *queue = flex_array_get(net->_rx, index);
struct kobject *kobj = &queue->kobj;
int error = 0;

@@ -751,8 +752,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
}

- while (--i >= new_num)
- kobject_put(&net->_rx[i].kobj);
+ while (--i >= new_num) {
+ struct netdev_rx_queue *rxq = flex_array_get(net->_rx, i);
+ kobject_put(&rxq->kobj);
+ }

return error;
#else
@@ -939,7 +942,7 @@ static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
int i;

for (i = 0; i < dev->num_tx_queues; i++)
- if (queue == &dev->_tx[i])
+ if (queue == (struct netdev_queue *)flex_array_get(dev->_tx, i))
break;

BUG_ON(i >= dev->num_tx_queues);
@@ -1051,7 +1054,7 @@ static struct kobj_type netdev_queue_ktype = {

static int netdev_queue_add_kobject(struct net_device *net, int index)
{
- struct netdev_queue *queue = net->_tx + index;
+ struct netdev_queue *queue = flex_array_get(net->_tx, index);
struct kobject *kobj = &queue->kobj;
int error = 0;

@@ -1093,7 +1096,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}

while (--i >= new_num) {
- struct netdev_queue *queue = net->_tx + i;
+ struct netdev_queue *queue = flex_array_get(net->_tx, i);

#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
--
1.7.1

2013-06-19 05:51:49

[permalink] [raw]

Subject: [net-next rfc 3/3] tuntap: increase the max queues to 16

Since we've reduce the size of tun_struct and use flex array to allocate netdev
queues, it's safe for us to increase the limit of queues in tuntap.

Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8c5c124..205f6aa 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -110,10 +110,7 @@ struct tap_filter {
unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
};

-/* DEFAULT_MAX_NUM_RSS_QUEUES were choosed to let the rx/tx queues allocated for
- * the netdevice to be fit in one page. So we can make sure the success of
- * memory allocation. TODO: increase the limit. */
-#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES
+#define MAX_TAP_QUEUES 16
#define MAX_TAP_FLOWS 4096

#define TUN_FLOW_EXPIRE (3 * HZ)
--
1.7.1

2013-06-19 05:51:46

[permalink] [raw]

Subject: [net-next rfc 2/3] tuntap: reduce the size of tun_struct by using flex array

This patch switches to use flex array to implement the flow caches, it can
brings several advantages:

- save the size of the tun_struct structure, which can allows us to increase the
upper limit of queues in the future.
- avoid higher order memory allocation which could be used when switching to use
pure hashing in flow cache who may demand a larger size array in the future.

After this patch, the size of tun_struct on x86_64 were reduced from 8512 to
328.

Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 54 +++++++++++++++++++++++++++++++++++++----------
net/openvswitch/flow.c | 2 +-
2 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a344270..8c5c124 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -64,6 +64,7 @@
#include <linux/nsproxy.h>
#include <linux/virtio_net.h>
#include <linux/rcupdate.h>
+#include <linux/flex_array.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
@@ -180,7 +181,7 @@ struct tun_struct {
int debug;
#endif
spinlock_t lock;
- struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
+ struct flex_array *flows;
struct timer_list flow_gc_timer;
unsigned long ageing_time;
unsigned int numdisabled;
@@ -239,10 +240,11 @@ static void tun_flow_flush(struct tun_struct *tun)

spin_lock_bh(&tun->lock);
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+ struct hlist_head *h = flex_array_get(tun->flows, i);
struct tun_flow_entry *e;
struct hlist_node *n;

- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
+ hlist_for_each_entry_safe(e, n, h, hash_link)
tun_flow_delete(tun, e);
}
spin_unlock_bh(&tun->lock);
@@ -254,10 +256,11 @@ static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)

spin_lock_bh(&tun->lock);
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+ struct hlist_head *h = flex_array_get(tun->flows, i);
struct tun_flow_entry *e;
struct hlist_node *n;

- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
+ hlist_for_each_entry_safe(e, n, h, hash_link) {
if (e->queue_index == queue_index)
tun_flow_delete(tun, e);
}
@@ -277,10 +280,11 @@ static void tun_flow_cleanup(unsigned long data)

spin_lock_bh(&tun->lock);
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+ struct hlist_head *h = flex_array_get(tun->flows, i);
struct tun_flow_entry *e;
struct hlist_node *n;

- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
+ hlist_for_each_entry_safe(e, n, h, hash_link) {
unsigned long this_timer;
count++;
this_timer = e->updated + delay;
@@ -307,7 +311,7 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
if (!rxhash)
return;
else
- head = &tun->flows[tun_hashfn(rxhash)];
+ head = flex_array_get(tun->flows, tun_hashfn(rxhash));

rcu_read_lock();

@@ -356,7 +360,8 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)

txq = skb_get_rxhash(skb);
if (txq) {
- e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+ e = tun_flow_find(flex_array_get(tun->flows, tun_hashfn(txq)),
+ txq);
if (e)
txq = e->queue_index;
else
@@ -841,23 +846,45 @@ static const struct net_device_ops tap_netdev_ops = {
#endif
};

-static void tun_flow_init(struct tun_struct *tun)
+static int tun_flow_init(struct tun_struct *tun, bool mq)
{
- int i;
+ struct flex_array *buckets;
+ int i, err;
+
+ if (!mq)
+ return 0;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
+ TUN_NUM_FLOW_ENTRIES, GFP_KERNEL);
+ if (!buckets)
+ return -ENOMEM;

+ err = flex_array_prealloc(buckets, 0, TUN_NUM_FLOW_ENTRIES, GFP_KERNEL);
+ if (err) {
+ flex_array_free(buckets);
+ return -ENOMEM;
+ }
+
+ tun->flows = buckets;
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
- INIT_HLIST_HEAD(&tun->flows[i]);
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));

tun->ageing_time = TUN_FLOW_EXPIRE;
setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
mod_timer(&tun->flow_gc_timer,
round_jiffies_up(jiffies + tun->ageing_time));
+
+ return 0;
}

static void tun_flow_uninit(struct tun_struct *tun)
{
- del_timer_sync(&tun->flow_gc_timer);
- tun_flow_flush(tun);
+ if (tun->flags & TUN_TAP_MQ) {
+ del_timer_sync(&tun->flow_gc_timer);
+ tun_flow_flush(tun);
+ flex_array_free(tun->flows);
+ }
}

/* Initialize net device. */
@@ -1660,7 +1687,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
goto err_free_dev;

tun_net_init(dev);
- tun_flow_init(tun);
+
+ err = tun_flow_init(tun, queues > 1);
+ if (err < 0)
+ goto err_free_dev;

dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
TUN_USER_FEATURES;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 093c191..5787acc 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -241,7 +241,7 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
struct flex_array *buckets;
int i, err;

- buckets = flex_array_alloc(sizeof(struct hlist_head *),
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
n_buckets, GFP_KERNEL);
if (!buckets)
return NULL;
--
1.7.1

2013-06-19 06:32:07

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> Currently, we use kcalloc to allocate rx/tx queues for a net device which could
> be easily lead to a high order memory allocation request when initializing a
> multiqueue net device. We can simply avoid this by switching to use flex array
> which always allocate at order zero.
>
> Signed-off-by: Jason Wang <[email protected]>
> ---
> include/linux/netdevice.h | 13 ++++++----
> net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
> net/core/net-sysfs.c | 15 +++++++----
> 3 files changed, 58 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 09b4188..c0b5d04 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -32,6 +32,7 @@
> #include <linux/atomic.h>
> #include <asm/cache.h>
> #include <asm/byteorder.h>
> +#include <linux/flex_array.h>
>
> #include <linux/percpu.h>
> #include <linux/rculist.h>
> @@ -1230,7 +1231,7 @@ struct net_device {
>
>
> #ifdef CONFIG_RPS
> - struct netdev_rx_queue *_rx;
> + struct flex_array *_rx;
>
> /* Number of RX queues allocated at register_netdev() time */
> unsigned int num_rx_queues;
> @@ -1250,7 +1251,7 @@ struct net_device {
> /*
> * Cache lines mostly used on transmit path
> */
> - struct netdev_queue *_tx ____cacheline_aligned_in_smp;
> + struct flex_array *_tx ____cacheline_aligned_in_smp;
>

Using flex_array and adding overhead in this super critical part of
network stack, only to avoid order-1 allocations done in GFP_KERNEL
context is simply insane.

We can revisit this in 2050 if we ever need order-4 allocations or so,
and still use 4K pages.

2013-06-19 06:34:20

[permalink] [raw]

Subject: Re: [net-next rfc 3/3] tuntap: increase the max queues to 16

On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> Since we've reduce the size of tun_struct and use flex array to allocate netdev
> queues, it's safe for us to increase the limit of queues in tuntap.

Its already safe to increase max queues to 16, without your patches 1 &
2

2013-06-19 07:15:18

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On 06/19/2013 02:31 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Currently, we use kcalloc to allocate rx/tx queues for a net device which could
>> be easily lead to a high order memory allocation request when initializing a
>> multiqueue net device. We can simply avoid this by switching to use flex array
>> which always allocate at order zero.
>>
>> Signed-off-by: Jason Wang <[email protected]>
>> ---
>> include/linux/netdevice.h | 13 ++++++----
>> net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
>> net/core/net-sysfs.c | 15 +++++++----
>> 3 files changed, 58 insertions(+), 27 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 09b4188..c0b5d04 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -32,6 +32,7 @@
>> #include <linux/atomic.h>
>> #include <asm/cache.h>
>> #include <asm/byteorder.h>
>> +#include <linux/flex_array.h>
>>
>> #include <linux/percpu.h>
>> #include <linux/rculist.h>
>> @@ -1230,7 +1231,7 @@ struct net_device {
>>
>>
>> #ifdef CONFIG_RPS
>> - struct netdev_rx_queue *_rx;
>> + struct flex_array *_rx;
>>
>> /* Number of RX queues allocated at register_netdev() time */
>> unsigned int num_rx_queues;
>> @@ -1250,7 +1251,7 @@ struct net_device {
>> /*
>> * Cache lines mostly used on transmit path
>> */
>> - struct netdev_queue *_tx ____cacheline_aligned_in_smp;
>> + struct flex_array *_tx ____cacheline_aligned_in_smp;
>>
> Using flex_array and adding overhead in this super critical part of
> network stack, only to avoid order-1 allocations done in GFP_KERNEL
> context is simply insane.

Yes, and I also miss the fact of GFP_KERNEL allocation.
> We can revisit this in 2050 if we ever need order-4 allocations or so,
> and still use 4K pages.
>
>

Will drop this patch, thanks.

2013-06-19 07:16:01

[permalink] [raw]

Subject: Re: [net-next rfc 3/3] tuntap: increase the max queues to 16

On 06/19/2013 02:34 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Since we've reduce the size of tun_struct and use flex array to allocate netdev
>> queues, it's safe for us to increase the limit of queues in tuntap.
> Its already safe to increase max queues to 16, without your patches 1 &
> 2
>
>
>

Will send a single patch to increase it to 16.

Thanks

2013-06-19 09:10:57

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Tue, Jun 18, 2013 at 11:31:58PM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> > Currently, we use kcalloc to allocate rx/tx queues for a net device which could
> > be easily lead to a high order memory allocation request when initializing a
> > multiqueue net device. We can simply avoid this by switching to use flex array
> > which always allocate at order zero.
> >
> > Signed-off-by: Jason Wang <[email protected]>
> > ---
> > include/linux/netdevice.h | 13 ++++++----
> > net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
> > net/core/net-sysfs.c | 15 +++++++----
> > 3 files changed, 58 insertions(+), 27 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 09b4188..c0b5d04 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -32,6 +32,7 @@
> > #include <linux/atomic.h>
> > #include <asm/cache.h>
> > #include <asm/byteorder.h>
> > +#include <linux/flex_array.h>
> >
> > #include <linux/percpu.h>
> > #include <linux/rculist.h>
> > @@ -1230,7 +1231,7 @@ struct net_device {
> >
> >
> > #ifdef CONFIG_RPS
> > - struct netdev_rx_queue *_rx;
> > + struct flex_array *_rx;
> >
> > /* Number of RX queues allocated at register_netdev() time */
> > unsigned int num_rx_queues;
> > @@ -1250,7 +1251,7 @@ struct net_device {
> > /*
> > * Cache lines mostly used on transmit path
> > */
> > - struct netdev_queue *_tx ____cacheline_aligned_in_smp;
> > + struct flex_array *_tx ____cacheline_aligned_in_smp;
> >
>
> Using flex_array and adding overhead in this super critical part of
> network stack, only to avoid order-1 allocations done in GFP_KERNEL
> context is simply insane.
>
> We can revisit this in 2050 if we ever need order-4 allocations or so,
> and still use 4K pages.
>
>

Well KVM supports up to 160 VCPUs on x86.

Creating a queue per CPU is very reasonable, and
assuming cache line size of 64 bytes, netdev_queue seems to be 320
bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
I agree most people don't have such systems yet, but
they do exist.

We can cut the size of netdev_queue, moving out kobj - which
does not seem to be used on data path to a separate structure.
It's 64 byte in size so exactly 256 bytes.
That will get us an order-3 allocation, and there's
some padding there so we won't immediately increase it
the moment we add some fields.

Comments on this idea?

Instead of always using a flex array, we could have
+ struct netdev_queue *_tx; /* Used with small # of queues */
+#ifdef CONFIG_NETDEV_HUGE_NUMBER_OR_QUEUES
+ struct flex_array *_tx_large; /* Used with large # of queues */
+#endif

And fix wrappers to use _tx if not NULL, otherwise _tx_large.

If configured in, it's an extra branch on data path but probably less
costly than the extra indirection.

--
MST

2013-06-19 09:56:18

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:

> Well KVM supports up to 160 VCPUs on x86.
>
> Creating a queue per CPU is very reasonable, and
> assuming cache line size of 64 bytes, netdev_queue seems to be 320
> bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> I agree most people don't have such systems yet, but
> they do exist.

Even so, it will just work, like a fork() is likely to work, even if a
process needs order-1 allocation for kernel stack.

Some drivers still use order-10 allocations with kmalloc(), and nobody
complained yet.

We had complains with mlx4 driver lately only bcause kmalloc() now gives
a warning if allocations above MAX_ORDER are attempted.

Having a single pointer means that we can :

- Attempts a regular kmalloc() call, it will work most of the time.
- fallback to vmalloc() _if_ kmalloc() failed.

Frankly, if you want one tx queue per cpu, I would rather use
NETIF_F_LLTX, like some other virtual devices.

This way, you can have real per cpu memory, with proper NUMA affinity.

2013-06-19 12:21:38

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
>
> > Well KVM supports up to 160 VCPUs on x86.
> >
> > Creating a queue per CPU is very reasonable, and
> > assuming cache line size of 64 bytes, netdev_queue seems to be 320
> > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> > I agree most people don't have such systems yet, but
> > they do exist.
>
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
>
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
>
> Having a single pointer means that we can :
>
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.

Most drivers create devices at boot time, when this is more likely to work.
What makes tun (and macvlan) a bit special is that the device is created
from userspace. Virt setups create/destroy them all the
time.

>
> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.
>
> This way, you can have real per cpu memory, with proper NUMA affinity.
>

Hmm good point, worth looking at.

Thanks,
--
MST

2013-06-19 15:40:22

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
>
> > Well KVM supports up to 160 VCPUs on x86.
> >
> > Creating a queue per CPU is very reasonable, and
> > assuming cache line size of 64 bytes, netdev_queue seems to be 320
> > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> > I agree most people don't have such systems yet, but
> > they do exist.
>
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
>
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
>
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
>
> Having a single pointer means that we can :
>
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.

That's a good trick too - vmalloc memory is a bit slower
on x86 since it's not using a huge page, but that's only
when we have lots of CPUs/queues...

Short term - how about switching to vmalloc if > 32 queues?

> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.
>
> This way, you can have real per cpu memory, with proper NUMA affinity.
>
>

2013-06-19 15:58:38

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Wed, 2013-06-19 at 18:40 +0300, Michael S. Tsirkin wrote:

> That's a good trick too - vmalloc memory is a bit slower
> on x86 since it's not using a huge page, but that's only
> when we have lots of CPUs/queues...
>
> Short term - how about switching to vmalloc if > 32 queues?

You do not have to switch "if > 32 queues"

diff --git a/net/core/dev.c b/net/core/dev.c
index fa007db..473cc9e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
+#include <linux/vmalloc.h>

#include "net-sysfs.h"

@@ -5253,6 +5254,14 @@ static void netdev_init_one_queue(struct net_device *dev,
#endif
}

+static void netif_free_tx_queues(struct net_device *dev)
+{
+ if (is_vmalloc_addr(dev->_tx))
+ vfree(dev->_tx);
+ else
+ kfree(dev->_tx);
+}
+
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
@@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
BUG_ON(count < 1);

tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx)
- return -ENOMEM;
-
+ if (!tx) {
+ tx = vzalloc(count * sizeof(struct netdev_queue));
+ if (!tx)
+ return -ENOMEM;
+ }
dev->_tx = tx;

netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5811,7 +5822,7 @@ free_all:

free_pcpu:
free_percpu(dev->pcpu_refcnt);
- kfree(dev->_tx);
+ netif_free_tx_queues(dev);
#ifdef CONFIG_RPS
kfree(dev->_rx);
#endif
@@ -5836,7 +5847,7 @@ void free_netdev(struct net_device *dev)

release_net(dev_net(dev));

- kfree(dev->_tx);
+ netif_free_tx_queues(dev);
#ifdef CONFIG_RPS
kfree(dev->_rx);
#endif

2013-06-19 16:16:11

by David Laight

[permalink] [raw]

Subject: RE: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> + if (is_vmalloc_addr(dev->_tx))
> + vfree(dev->_tx);
> + else
> + kfree(dev->_tx);
> +}
> +
> static int netif_alloc_netdev_queues(struct net_device *dev)
> {
> unsigned int count = dev->num_tx_queues;
> @@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
> BUG_ON(count < 1);
>
> tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
> - if (!tx)
> - return -ENOMEM;
> -
> + if (!tx) {
> + tx = vzalloc(count * sizeof(struct netdev_queue));
> + if (!tx)
> + return -ENOMEM;
> + }
> dev->_tx = tx;

Given the number of places I've seen this code added, why
not put it in a general helper?

I also thought that malloc() with GFP_KERNEL would sleep.
Under what conditions does it fail instead?

David

????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?

2013-06-19 16:29:00

[permalink] [raw]

Subject: RE: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Wed, 2013-06-19 at 17:06 +0100, David Laight wrote:

> Given the number of places I've seen this code added, why
> not put it in a general helper?

Good luck with that. We had many attempts in the past.

> I also thought that malloc() with GFP_KERNEL would sleep.
> Under what conditions does it fail instead?

For example on 32 bit kernel, LOW memory exhaustion/fragmentation.

vmalloc() has an extra 128MB virtual space (on i386 at least with
standard 3G/1G split)

2013-06-19 18:06:29

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On Wed, Jun 19, 2013 at 08:58:31AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 18:40 +0300, Michael S. Tsirkin wrote:
>
> > That's a good trick too - vmalloc memory is a bit slower
> > on x86 since it's not using a huge page, but that's only
> > when we have lots of CPUs/queues...
> >
> > Short term - how about switching to vmalloc if > 32 queues?
>
> You do not have to switch "if > 32 queues"
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index fa007db..473cc9e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -130,6 +130,7 @@
> #include <linux/cpu_rmap.h>
> #include <linux/static_key.h>
> #include <linux/hashtable.h>
> +#include <linux/vmalloc.h>
>
> #include "net-sysfs.h"
>
> @@ -5253,6 +5254,14 @@ static void netdev_init_one_queue(struct net_device *dev,
> #endif
> }
>
> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> + if (is_vmalloc_addr(dev->_tx))
> + vfree(dev->_tx);
> + else
> + kfree(dev->_tx);
> +}
> +
> static int netif_alloc_netdev_queues(struct net_device *dev)
> {
> unsigned int count = dev->num_tx_queues;
> @@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
> BUG_ON(count < 1);
>
> tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);

As we have explicit code to recover, maybe set __GFP_NOWARN?
If we are out of memory, vzalloc will warn too, we don't
need two warnings.

> - if (!tx)
> - return -ENOMEM;
> -
> + if (!tx) {
> + tx = vzalloc(count * sizeof(struct netdev_queue));
> + if (!tx)
> + return -ENOMEM;
> + }
> dev->_tx = tx;
>
> netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
> @@ -5811,7 +5822,7 @@ free_all:
>
> free_pcpu:
> free_percpu(dev->pcpu_refcnt);
> - kfree(dev->_tx);
> + netif_free_tx_queues(dev);
> #ifdef CONFIG_RPS
> kfree(dev->_rx);
> #endif
> @@ -5836,7 +5847,7 @@ void free_netdev(struct net_device *dev)
>
> release_net(dev_net(dev));
>
> - kfree(dev->_tx);
> + netif_free_tx_queues(dev);
> #ifdef CONFIG_RPS
> kfree(dev->_rx);
> #endif
>

2013-06-19 19:23:55

by Jerry Chu

[permalink] [raw]

Subject: Re: [net-next rfc 3/3] tuntap: increase the max queues to 16

On Tue, Jun 18, 2013 at 11:34 PM, Eric Dumazet <[email protected]> wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Since we've reduce the size of tun_struct and use flex array to allocate netdev
>> queues, it's safe for us to increase the limit of queues in tuntap.
>
> Its already safe to increase max queues to 16, without your patches 1 &
> 2

How about 32? Will kmem size be an issue?

As others have pointed out it's best to allocate one queue per CPU
(virtual or real)
and > 16 CPU machines are very common. (I only asked for 16 initially
out of concern
for kmem size.)

Thanks,

Jerry

>
>
>

2013-06-20 05:14:35

[permalink] [raw]

Subject: Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array

On 06/19/2013 05:56 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
>
>> Well KVM supports up to 160 VCPUs on x86.
>>
>> Creating a queue per CPU is very reasonable, and
>> assuming cache line size of 64 bytes, netdev_queue seems to be 320
>> bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
>> I agree most people don't have such systems yet, but
>> they do exist.
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
>
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
>
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
>
> Having a single pointer means that we can :
>
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.
>
> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.

A drawback of NETIF_F_LLTX is that we may contend on qdisc lock
especially when we have a huge number of tx queues.
>
> This way, you can have real per cpu memory, with proper NUMA affinity.
>
>
>

2013-06-20 06:05:20