Hi all:
This series tries to increase the limit of tuntap queues. Historically there
have been two reasons which prevented us from doing this:
- We store the hash buckets in tun_struct, which makes tun_struct very large;
this high order memory allocation fails easily when memory is fragmented.
- The netdev_queue and netdev_rx_queue arrays of the netdevice are allocated
through kmalloc, which may also lead to a high order memory allocation when we
have several queues. E.g. sizeof(struct netdev_queue) is 320 bytes, which means
a high order allocation happens as soon as the device has more than 12 queues.
So this series tries to address those issues by switching to flex arrays. All
entries are preallocated, and since a flex array always does order-0
allocations, we can safely increase the limit afterwards.
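For reference, the arithmetic behind the second point above (using the sizes
quoted there; exact values depend on the configuration):

	13 queues * 320 bytes = 4160 bytes > 4096 bytes (one 4K page)

so kcalloc() already needs an order-1 block at 13 queues, and the order keeps
growing with the queue count.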
Compile tested only; comments and review are more than welcome.
Jason Wang (3):
net: avoid high order memory allocation for queues by using flex
array
tuntap: reduce the size of tun_struct by using flex array
tuntap: increase the max queues to 16
drivers/net/tun.c | 59 ++++++++++++++++++++++++++++++++------------
include/linux/netdevice.h | 13 ++++++----
net/core/dev.c | 57 +++++++++++++++++++++++++++++++------------
net/core/net-sysfs.c | 15 +++++++----
net/openvswitch/flow.c | 2 +-
5 files changed, 102 insertions(+), 44 deletions(-)
Currently, we use kcalloc to allocate the rx/tx queues for a net device, which
can easily lead to a high order memory allocation request when initializing a
multiqueue net device. We can simply avoid this by switching to a flex array,
which always allocates at order zero.
Signed-off-by: Jason Wang <[email protected]>
---
include/linux/netdevice.h | 13 ++++++----
net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
net/core/net-sysfs.c | 15 +++++++----
3 files changed, 58 insertions(+), 27 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 09b4188..c0b5d04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -32,6 +32,7 @@
#include <linux/atomic.h>
#include <asm/cache.h>
#include <asm/byteorder.h>
+#include <linux/flex_array.h>
#include <linux/percpu.h>
#include <linux/rculist.h>
@@ -1230,7 +1231,7 @@ struct net_device {
#ifdef CONFIG_RPS
- struct netdev_rx_queue *_rx;
+ struct flex_array *_rx;
/* Number of RX queues allocated at register_netdev() time */
unsigned int num_rx_queues;
@@ -1250,7 +1251,7 @@ struct net_device {
/*
* Cache lines mostly used on transmit path
*/
- struct netdev_queue *_tx ____cacheline_aligned_in_smp;
+ struct flex_array *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
unsigned int num_tx_queues;
@@ -1434,7 +1435,7 @@ static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
unsigned int index)
{
- return &dev->_tx[index];
+ return flex_array_get(dev->_tx, index);
}
static inline void netdev_for_each_tx_queue(struct net_device *dev,
@@ -1445,8 +1446,10 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
{
unsigned int i;
- for (i = 0; i < dev->num_tx_queues; i++)
- f(dev, &dev->_tx[i], arg);
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = flex_array_get(dev->_tx, i);
+ f(dev, txq, arg);
+ }
}
extern struct netdev_queue *netdev_pick_tx(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index fa007db..3a4ecb1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
+#include <linux/flex_array.h>
#include "net-sysfs.h"
@@ -2902,7 +2903,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (rxq_index == skb_get_rx_queue(skb))
goto out;
- rxqueue = dev->_rx + rxq_index;
+ rxqueue = flex_array_get(dev->_rx, rxq_index);
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
@@ -2950,9 +2951,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
dev->name, index, dev->real_num_rx_queues);
goto done;
}
- rxqueue = dev->_rx + index;
+ rxqueue = flex_array_get(dev->_rx, index);
} else
- rxqueue = dev->_rx;
+ rxqueue = flex_array_get(dev->_rx, 0);
map = rcu_dereference(rxqueue->rps_map);
if (map) {
@@ -3038,7 +3039,7 @@ done:
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
u32 flow_id, u16 filter_id)
{
- struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct netdev_rx_queue *rxqueue = flex_array_get(dev->_rx, rxq_index);
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
@@ -5223,18 +5224,31 @@ EXPORT_SYMBOL(netif_stacked_transfer_operstate);
static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
- struct netdev_rx_queue *rx;
+ struct flex_array *rx;
+ int err;
BUG_ON(count < 1);
- rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
- if (!rx)
+ rx = flex_array_alloc(sizeof(struct netdev_rx_queue), count,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!rx) {
+ pr_err("netdev: Unable to allocate flex array for rx queues\n");
return -ENOMEM;
+ }
+
+ err = flex_array_prealloc(rx, 0, count, GFP_KERNEL | __GFP_ZERO);
+ if (err) {
+ pr_err("netdev, Unable to prealloc %u rx qeueus\n", count);
+ flex_array_free(rx);
+ return err;
+ }
dev->_rx = rx;
- for (i = 0; i < count; i++)
- rx[i].dev = dev;
+ for (i = 0; i < count; i++) {
+ struct netdev_rx_queue *rxq = flex_array_get(rx, i);
+ rxq->dev = dev;
+ }
return 0;
}
#endif
@@ -5256,13 +5270,24 @@ static void netdev_init_one_queue(struct net_device *dev,
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
- struct netdev_queue *tx;
+ struct flex_array *tx;
+ int err;
BUG_ON(count < 1);
- tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx)
+ tx = flex_array_alloc(sizeof(struct netdev_queue), count,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!tx) {
+ pr_err("netdev: Unable to allocate flex array for tx queues\n");
return -ENOMEM;
+ }
+
+ err = flex_array_prealloc(tx, 0, count, GFP_KERNEL | __GFP_ZERO);
+ if (err) {
+ pr_err("netdev, Unable to prealloc %u tx qeueus\n", count);
+ flex_array_free(tx);
+ return err;
+ }
dev->_tx = tx;
@@ -5811,9 +5836,9 @@ free_all:
free_pcpu:
free_percpu(dev->pcpu_refcnt);
- kfree(dev->_tx);
+ flex_array_free(dev->_tx);
#ifdef CONFIG_RPS
- kfree(dev->_rx);
+ flex_array_free(dev->_rx);
#endif
free_p:
@@ -5836,9 +5861,9 @@ void free_netdev(struct net_device *dev)
release_net(dev_net(dev));
- kfree(dev->_tx);
+ flex_array_free(dev->_tx);
#ifdef CONFIG_RPS
- kfree(dev->_rx);
+ flex_array_free(dev->_rx);
#endif
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 981fed3..4328c3a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -22,6 +22,7 @@
#include <linux/export.h>
#include <linux/jiffies.h>
#include <linux/pm_runtime.h>
+#include <linux/flex_array.h>
#include "net-sysfs.h"
@@ -717,7 +718,7 @@ static struct kobj_type rx_queue_ktype = {
static int rx_queue_add_kobject(struct net_device *net, int index)
{
- struct netdev_rx_queue *queue = net->_rx + index;
+ struct netdev_rx_queue *queue = flex_array_get(net->_rx, index);
struct kobject *kobj = &queue->kobj;
int error = 0;
@@ -751,8 +752,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
}
- while (--i >= new_num)
- kobject_put(&net->_rx[i].kobj);
+ while (--i >= new_num) {
+ struct netdev_rx_queue *rxq = flex_array_get(net->_rx, i);
+ kobject_put(&rxq->kobj);
+ }
return error;
#else
@@ -939,7 +942,7 @@ static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
int i;
for (i = 0; i < dev->num_tx_queues; i++)
- if (queue == &dev->_tx[i])
+ if (queue == (struct netdev_queue *)flex_array_get(dev->_tx, i))
break;
BUG_ON(i >= dev->num_tx_queues);
@@ -1051,7 +1054,7 @@ static struct kobj_type netdev_queue_ktype = {
static int netdev_queue_add_kobject(struct net_device *net, int index)
{
- struct netdev_queue *queue = net->_tx + index;
+ struct netdev_queue *queue = flex_array_get(net->_tx, index);
struct kobject *kobj = &queue->kobj;
int error = 0;
@@ -1093,7 +1096,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
while (--i >= new_num) {
- struct netdev_queue *queue = net->_tx + i;
+ struct netdev_queue *queue = flex_array_get(net->_tx, i);
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
--
1.7.1
Since we've reduced the size of tun_struct and used a flex array to allocate
the netdev queues, it's safe for us to increase the limit of queues in tuntap.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8c5c124..205f6aa 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -110,10 +110,7 @@ struct tap_filter {
unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
};
-/* DEFAULT_MAX_NUM_RSS_QUEUES were choosed to let the rx/tx queues allocated for
- * the netdevice to be fit in one page. So we can make sure the success of
- * memory allocation. TODO: increase the limit. */
-#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES
+#define MAX_TAP_QUEUES 16
#define MAX_TAP_FLOWS 4096
#define TUN_FLOW_EXPIRE (3 * HZ)
--
1.7.1
This patch switches to a flex array to implement the flow caches, which brings
several advantages:
- it reduces the size of the tun_struct structure, which allows us to increase
the upper limit of queues in the future.
- it avoids the higher order memory allocation that would be needed when
switching to pure hashing in the flow cache, which may demand a larger array
in the future.
After this patch, the size of tun_struct on x86_64 is reduced from 8512 to 328
bytes.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 54 +++++++++++++++++++++++++++++++++++++----------
net/openvswitch/flow.c | 2 +-
2 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a344270..8c5c124 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -64,6 +64,7 @@
#include <linux/nsproxy.h>
#include <linux/virtio_net.h>
#include <linux/rcupdate.h>
+#include <linux/flex_array.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
@@ -180,7 +181,7 @@ struct tun_struct {
int debug;
#endif
spinlock_t lock;
- struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
+ struct flex_array *flows;
struct timer_list flow_gc_timer;
unsigned long ageing_time;
unsigned int numdisabled;
@@ -239,10 +240,11 @@ static void tun_flow_flush(struct tun_struct *tun)
spin_lock_bh(&tun->lock);
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+ struct hlist_head *h = flex_array_get(tun->flows, i);
struct tun_flow_entry *e;
struct hlist_node *n;
- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
+ hlist_for_each_entry_safe(e, n, h, hash_link)
tun_flow_delete(tun, e);
}
spin_unlock_bh(&tun->lock);
@@ -254,10 +256,11 @@ static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
spin_lock_bh(&tun->lock);
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+ struct hlist_head *h = flex_array_get(tun->flows, i);
struct tun_flow_entry *e;
struct hlist_node *n;
- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
+ hlist_for_each_entry_safe(e, n, h, hash_link) {
if (e->queue_index == queue_index)
tun_flow_delete(tun, e);
}
@@ -277,10 +280,11 @@ static void tun_flow_cleanup(unsigned long data)
spin_lock_bh(&tun->lock);
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+ struct hlist_head *h = flex_array_get(tun->flows, i);
struct tun_flow_entry *e;
struct hlist_node *n;
- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
+ hlist_for_each_entry_safe(e, n, h, hash_link) {
unsigned long this_timer;
count++;
this_timer = e->updated + delay;
@@ -307,7 +311,7 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
if (!rxhash)
return;
else
- head = &tun->flows[tun_hashfn(rxhash)];
+ head = flex_array_get(tun->flows, tun_hashfn(rxhash));
rcu_read_lock();
@@ -356,7 +360,8 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
txq = skb_get_rxhash(skb);
if (txq) {
- e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+ e = tun_flow_find(flex_array_get(tun->flows, tun_hashfn(txq)),
+ txq);
if (e)
txq = e->queue_index;
else
@@ -841,23 +846,45 @@ static const struct net_device_ops tap_netdev_ops = {
#endif
};
-static void tun_flow_init(struct tun_struct *tun)
+static int tun_flow_init(struct tun_struct *tun, bool mq)
{
- int i;
+ struct flex_array *buckets;
+ int i, err;
+
+ if (!mq)
+ return 0;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
+ TUN_NUM_FLOW_ENTRIES, GFP_KERNEL);
+ if (!buckets)
+ return -ENOMEM;
+ err = flex_array_prealloc(buckets, 0, TUN_NUM_FLOW_ENTRIES, GFP_KERNEL);
+ if (err) {
+ flex_array_free(buckets);
+ return -ENOMEM;
+ }
+
+ tun->flows = buckets;
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
- INIT_HLIST_HEAD(&tun->flows[i]);
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));
tun->ageing_time = TUN_FLOW_EXPIRE;
setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
mod_timer(&tun->flow_gc_timer,
round_jiffies_up(jiffies + tun->ageing_time));
+
+ return 0;
}
static void tun_flow_uninit(struct tun_struct *tun)
{
- del_timer_sync(&tun->flow_gc_timer);
- tun_flow_flush(tun);
+ if (tun->flags & TUN_TAP_MQ) {
+ del_timer_sync(&tun->flow_gc_timer);
+ tun_flow_flush(tun);
+ flex_array_free(tun->flows);
+ }
}
/* Initialize net device. */
@@ -1660,7 +1687,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
goto err_free_dev;
tun_net_init(dev);
- tun_flow_init(tun);
+
+ err = tun_flow_init(tun, queues > 1);
+ if (err < 0)
+ goto err_free_dev;
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
TUN_USER_FEATURES;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 093c191..5787acc 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -241,7 +241,7 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
struct flex_array *buckets;
int i, err;
- buckets = flex_array_alloc(sizeof(struct hlist_head *),
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
n_buckets, GFP_KERNEL);
if (!buckets)
return NULL;
--
1.7.1
On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> Currently, we use kcalloc to allocate rx/tx queues for a net device which could
> be easily lead to a high order memory allocation request when initializing a
> multiqueue net device. We can simply avoid this by switching to use flex array
> which always allocate at order zero.
>
> Signed-off-by: Jason Wang <[email protected]>
> ---
> include/linux/netdevice.h | 13 ++++++----
> net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
> net/core/net-sysfs.c | 15 +++++++----
> 3 files changed, 58 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 09b4188..c0b5d04 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -32,6 +32,7 @@
> #include <linux/atomic.h>
> #include <asm/cache.h>
> #include <asm/byteorder.h>
> +#include <linux/flex_array.h>
>
> #include <linux/percpu.h>
> #include <linux/rculist.h>
> @@ -1230,7 +1231,7 @@ struct net_device {
>
>
> #ifdef CONFIG_RPS
> - struct netdev_rx_queue *_rx;
> + struct flex_array *_rx;
>
> /* Number of RX queues allocated at register_netdev() time */
> unsigned int num_rx_queues;
> @@ -1250,7 +1251,7 @@ struct net_device {
> /*
> * Cache lines mostly used on transmit path
> */
> - struct netdev_queue *_tx ____cacheline_aligned_in_smp;
> + struct flex_array *_tx ____cacheline_aligned_in_smp;
>
Using flex_array and adding overhead in this super critical part of the
network stack, only to avoid order-1 allocations done in GFP_KERNEL context,
is simply insane.
We can revisit this in 2050 if we ever need order-4 allocations or so,
and still use 4K pages.
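For context on the overhead: flex_array_get() is an out-of-line call that has
to locate the backing part page on every access. Heavily simplified, it does
roughly this (paraphrased from lib/flex_array.c of that era, helper details
elided):

	void *flex_array_get(struct flex_array *fa, unsigned int element_nr)
	{
		struct flex_array_part *part;

		if (element_nr >= fa->total_nr_elements)
			return NULL;			/* bounds check on every access */
		if (elements_fit_in_base(fa))		/* tiny arrays live in the base struct */
			part = (struct flex_array_part *)&fa->parts[0];
		else {
			part = fa->parts[fa_element_to_part_nr(fa, element_nr)];
			if (!part)
				return NULL;
		}
		/* byte offset of the element inside its part page */
		return &part->elements[index_inside_part(fa, element_nr)];
	}

Compared with the single add behind &dev->_tx[index], every tx queue lookup
pays a function call, a bounds check and an extra pointer chase, which is the
overhead being objected to here.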
On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> Since we've reduce the size of tun_struct and use flex array to allocate netdev
> queues, it's safe for us to increase the limit of queues in tuntap.
It's already safe to increase max queues to 16, without your patches 1 &
2
On 06/19/2013 02:31 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Currently, we use kcalloc to allocate rx/tx queues for a net device which could
>> be easily lead to a high order memory allocation request when initializing a
>> multiqueue net device. We can simply avoid this by switching to use flex array
>> which always allocate at order zero.
>>
>> Signed-off-by: Jason Wang <[email protected]>
>> ---
>> include/linux/netdevice.h | 13 ++++++----
>> net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
>> net/core/net-sysfs.c | 15 +++++++----
>> 3 files changed, 58 insertions(+), 27 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 09b4188..c0b5d04 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -32,6 +32,7 @@
>> #include <linux/atomic.h>
>> #include <asm/cache.h>
>> #include <asm/byteorder.h>
>> +#include <linux/flex_array.h>
>>
>> #include <linux/percpu.h>
>> #include <linux/rculist.h>
>> @@ -1230,7 +1231,7 @@ struct net_device {
>>
>>
>> #ifdef CONFIG_RPS
>> - struct netdev_rx_queue *_rx;
>> + struct flex_array *_rx;
>>
>> /* Number of RX queues allocated at register_netdev() time */
>> unsigned int num_rx_queues;
>> @@ -1250,7 +1251,7 @@ struct net_device {
>> /*
>> * Cache lines mostly used on transmit path
>> */
>> - struct netdev_queue *_tx ____cacheline_aligned_in_smp;
>> + struct flex_array *_tx ____cacheline_aligned_in_smp;
>>
> Using flex_array and adding overhead in this super critical part of
> network stack, only to avoid order-1 allocations done in GFP_KERNEL
> context is simply insane.
Yes, and I also missed the fact that this is a GFP_KERNEL allocation.
> We can revisit this in 2050 if we ever need order-4 allocations or so,
> and still use 4K pages.
>
>
Will drop this patch, thanks.
On 06/19/2013 02:34 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Since we've reduce the size of tun_struct and use flex array to allocate netdev
>> queues, it's safe for us to increase the limit of queues in tuntap.
> Its already safe to increase max queues to 16, without your patches 1 &
> 2
>
>
>
Will send a single patch to increase it to 16.
Thanks
On Tue, Jun 18, 2013 at 11:31:58PM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> > Currently, we use kcalloc to allocate rx/tx queues for a net device which could
> > be easily lead to a high order memory allocation request when initializing a
> > multiqueue net device. We can simply avoid this by switching to use flex array
> > which always allocate at order zero.
> >
> > Signed-off-by: Jason Wang <[email protected]>
> > ---
> > include/linux/netdevice.h | 13 ++++++----
> > net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------
> > net/core/net-sysfs.c | 15 +++++++----
> > 3 files changed, 58 insertions(+), 27 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 09b4188..c0b5d04 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -32,6 +32,7 @@
> > #include <linux/atomic.h>
> > #include <asm/cache.h>
> > #include <asm/byteorder.h>
> > +#include <linux/flex_array.h>
> >
> > #include <linux/percpu.h>
> > #include <linux/rculist.h>
> > @@ -1230,7 +1231,7 @@ struct net_device {
> >
> >
> > #ifdef CONFIG_RPS
> > - struct netdev_rx_queue *_rx;
> > + struct flex_array *_rx;
> >
> > /* Number of RX queues allocated at register_netdev() time */
> > unsigned int num_rx_queues;
> > @@ -1250,7 +1251,7 @@ struct net_device {
> > /*
> > * Cache lines mostly used on transmit path
> > */
> > - struct netdev_queue *_tx ____cacheline_aligned_in_smp;
> > + struct flex_array *_tx ____cacheline_aligned_in_smp;
> >
>
> Using flex_array and adding overhead in this super critical part of
> network stack, only to avoid order-1 allocations done in GFP_KERNEL
> context is simply insane.
>
> We can revisit this in 2050 if we ever need order-4 allocations or so,
> and still use 4K pages.
>
>
Well KVM supports up to 160 VCPUs on x86.
Creating a queue per CPU is very reasonable, and
assuming cache line size of 64 bytes, netdev_queue seems to be 320
bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
I agree most people don't have such systems yet, but
they do exist.
We could cut the size of netdev_queue by moving kobj, which does not seem to
be used on the data path, out to a separate structure. kobj is 64 bytes in
size, so that leaves exactly 256 bytes.
That will get us an order-3 allocation, and there's
some padding there so we won't immediately increase it
the moment we add some fields.
Comments on this idea?
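A rough sketch of that layout (the netdev_queue_sysfs name and the sysfs
back-pointer below are hypothetical, just to illustrate where the 64 bytes
would go):

	/* cold, sysfs-only state moved out of the hot structure */
	struct netdev_queue_sysfs {
		struct kobject		kobj;
		struct netdev_queue	*queue;
	};

	struct netdev_queue {
		struct net_device	*dev;
		struct Qdisc		*qdisc;
		/* ... other data-path fields stay here ... */
		struct netdev_queue_sysfs *sysfs;	/* 8-byte pointer instead of a 64-byte kobject */
	} ____cacheline_aligned_in_smp;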
Instead of always using a flex array, we could have
+ struct netdev_queue *_tx; /* Used with small # of queues */
+#ifdef CONFIG_NETDEV_HUGE_NUMBER_OR_QUEUES
+ struct flex_array *_tx_large; /* Used with large # of queues */
+#endif
And fix the wrappers to use _tx if it is not NULL, otherwise _tx_large.
If configured in, it's an extra branch on the data path, but probably less
costly than the extra indirection.
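A sketch of what the accessor would become under that scheme (the _tx_large
field and CONFIG_NETDEV_HUGE_NUMBER_OR_QUEUES are the hypothetical names from
above):

	static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
								unsigned int index)
	{
	#ifdef CONFIG_NETDEV_HUGE_NUMBER_OR_QUEUES
		if (!dev->_tx)					/* large devices use the flex array */
			return flex_array_get(dev->_tx_large, index);
	#endif
		return &dev->_tx[index];			/* common case: plain array */
	}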
--
MST
On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
> Well KVM supports up to 160 VCPUs on x86.
>
> Creating a queue per CPU is very reasonable, and
> assuming cache line size of 64 bytes, netdev_queue seems to be 320
> bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> I agree most people don't have such systems yet, but
> they do exist.
Even so, it will just work, like a fork() is likely to work even if a
process needs an order-1 allocation for its kernel stack.
Some drivers still use order-10 allocations with kmalloc(), and nobody has
complained yet.
We had complaints with the mlx4 driver lately only because kmalloc() now
gives a warning if allocations above MAX_ORDER are attempted.
Having a single pointer means that we can:
- Attempt a regular kmalloc() call; it will work most of the time.
- Fall back to vmalloc() _if_ kmalloc() failed.
Frankly, if you want one tx queue per cpu, I would rather use
NETIF_F_LLTX, like some other virtual devices.
This way, you can have real per cpu memory, with proper NUMA affinity.
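A very rough sketch of that direction for tun; every name below (the
tun_pcpu_tx struct and the pcpu_tx field) is hypothetical, nothing like it
exists in the driver today:

	struct tun_pcpu_tx {				/* hypothetical per-cpu tx state */
		u64			tx_packets;
		u64			tx_bytes;
		struct u64_stats_sync	syncp;
	};

	/* at device creation: one copy per CPU, placed NUMA-locally by the
	 * per-cpu allocator */
	tun->pcpu_tx = alloc_percpu(struct tun_pcpu_tx);	/* hypothetical field */
	if (!tun->pcpu_tx)
		return -ENOMEM;

	/* on the lockless (LLTX) transmit path: touch only this CPU's copy */
	struct tun_pcpu_tx *stats = this_cpu_ptr(tun->pcpu_tx);
	u64_stats_update_begin(&stats->syncp);
	stats->tx_packets++;
	stats->tx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);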
On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
>
> > Well KVM supports up to 160 VCPUs on x86.
> >
> > Creating a queue per CPU is very reasonable, and
> > assuming cache line size of 64 bytes, netdev_queue seems to be 320
> > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> > I agree most people don't have such systems yet, but
> > they do exist.
>
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
>
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
>
> Having a single pointer means that we can :
>
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.
Most drivers create devices at boot time, when this is more likely to work.
What makes tun (and macvlan) a bit special is that the device is created
from userspace. Virt setups create/destroy them all the
time.
>
> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.
>
> This way, you can have real per cpu memory, with proper NUMA affinity.
>
Hmm good point, worth looking at.
Thanks,
--
MST
On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
>
> > Well KVM supports up to 160 VCPUs on x86.
> >
> > Creating a queue per CPU is very reasonable, and
> > assuming cache line size of 64 bytes, netdev_queue seems to be 320
> > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> > I agree most people don't have such systems yet, but
> > they do exist.
>
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
>
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
>
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
>
> Having a single pointer means that we can :
>
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.
That's a good trick too - vmalloc memory is a bit slower
on x86 since it's not using a huge page, but that's only
when we have lots of CPUs/queues...
Short term - how about switching to vmalloc if > 32 queues?
> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.
>
> This way, you can have real per cpu memory, with proper NUMA affinity.
>
>
On Wed, 2013-06-19 at 18:40 +0300, Michael S. Tsirkin wrote:
> That's a good trick too - vmalloc memory is a bit slower
> on x86 since it's not using a huge page, but that's only
> when we have lots of CPUs/queues...
>
> Short term - how about switching to vmalloc if > 32 queues?
You do not have to switch "if > 32 queues"
diff --git a/net/core/dev.c b/net/core/dev.c
index fa007db..473cc9e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
#include "net-sysfs.h"
@@ -5253,6 +5254,14 @@ static void netdev_init_one_queue(struct net_device *dev,
#endif
}
+static void netif_free_tx_queues(struct net_device *dev)
+{
+ if (is_vmalloc_addr(dev->_tx))
+ vfree(dev->_tx);
+ else
+ kfree(dev->_tx);
+}
+
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
@@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
BUG_ON(count < 1);
tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx)
- return -ENOMEM;
-
+ if (!tx) {
+ tx = vzalloc(count * sizeof(struct netdev_queue));
+ if (!tx)
+ return -ENOMEM;
+ }
dev->_tx = tx;
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5811,7 +5822,7 @@ free_all:
free_pcpu:
free_percpu(dev->pcpu_refcnt);
- kfree(dev->_tx);
+ netif_free_tx_queues(dev);
#ifdef CONFIG_RPS
kfree(dev->_rx);
#endif
@@ -5836,7 +5847,7 @@ void free_netdev(struct net_device *dev)
release_net(dev_net(dev));
- kfree(dev->_tx);
+ netif_free_tx_queues(dev);
#ifdef CONFIG_RPS
kfree(dev->_rx);
#endif
> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> + if (is_vmalloc_addr(dev->_tx))
> + vfree(dev->_tx);
> + else
> + kfree(dev->_tx);
> +}
> +
> static int netif_alloc_netdev_queues(struct net_device *dev)
> {
> unsigned int count = dev->num_tx_queues;
> @@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
> BUG_ON(count < 1);
>
> tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
> - if (!tx)
> - return -ENOMEM;
> -
> + if (!tx) {
> + tx = vzalloc(count * sizeof(struct netdev_queue));
> + if (!tx)
> + return -ENOMEM;
> + }
> dev->_tx = tx;
Given the number of places I've seen this code added, why
not put it in a general helper?
I also thought that kmalloc() with GFP_KERNEL would sleep.
Under what conditions does it fail instead?
David
On Wed, 2013-06-19 at 17:06 +0100, David Laight wrote:
> Given the number of places I've seen this code added, why
> not put it in a general helper?
Good luck with that. We had many attempts in the past.
> I also thought that malloc() with GFP_KERNEL would sleep.
> Under what conditions does it fail instead?
For example on 32 bit kernel, LOW memory exhaustion/fragmentation.
vmalloc() has an extra 128MB virtual space (on i386 at least with
standard 3G/1G split)
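For what it's worth, the generic helper David asks about could look roughly
like this (the names are made up; no such helper existed in the tree at the
time):

	#include <linux/slab.h>
	#include <linux/vmalloc.h>
	#include <linux/mm.h>		/* is_vmalloc_addr() */

	static void *net_zalloc_maybe_vmalloc(size_t size)
	{
		/* try the physically contiguous path first, quietly */
		void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

		if (!p)
			p = vzalloc(size);
		return p;
	}

	static void net_free_maybe_vmalloc(void *p)
	{
		if (is_vmalloc_addr(p))
			vfree(p);
		else
			kfree(p);
	}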
On Wed, Jun 19, 2013 at 08:58:31AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 18:40 +0300, Michael S. Tsirkin wrote:
>
> > That's a good trick too - vmalloc memory is a bit slower
> > on x86 since it's not using a huge page, but that's only
> > when we have lots of CPUs/queues...
> >
> > Short term - how about switching to vmalloc if > 32 queues?
>
> You do not have to switch "if > 32 queues"
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index fa007db..473cc9e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -130,6 +130,7 @@
> #include <linux/cpu_rmap.h>
> #include <linux/static_key.h>
> #include <linux/hashtable.h>
> +#include <linux/vmalloc.h>
>
> #include "net-sysfs.h"
>
> @@ -5253,6 +5254,14 @@ static void netdev_init_one_queue(struct net_device *dev,
> #endif
> }
>
> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> + if (is_vmalloc_addr(dev->_tx))
> + vfree(dev->_tx);
> + else
> + kfree(dev->_tx);
> +}
> +
> static int netif_alloc_netdev_queues(struct net_device *dev)
> {
> unsigned int count = dev->num_tx_queues;
> @@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
> BUG_ON(count < 1);
>
> tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
As we have explicit code to recover, maybe set __GFP_NOWARN?
If we are out of memory, vzalloc() will warn too, so we don't need two
warnings.
> - if (!tx)
> - return -ENOMEM;
> -
> + if (!tx) {
> + tx = vzalloc(count * sizeof(struct netdev_queue));
> + if (!tx)
> + return -ENOMEM;
> + }
> dev->_tx = tx;
>
> netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
> @@ -5811,7 +5822,7 @@ free_all:
>
> free_pcpu:
> free_percpu(dev->pcpu_refcnt);
> - kfree(dev->_tx);
> + netif_free_tx_queues(dev);
> #ifdef CONFIG_RPS
> kfree(dev->_rx);
> #endif
> @@ -5836,7 +5847,7 @@ void free_netdev(struct net_device *dev)
>
> release_net(dev_net(dev));
>
> - kfree(dev->_tx);
> + netif_free_tx_queues(dev);
> #ifdef CONFIG_RPS
> kfree(dev->_rx);
> #endif
>
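Applied to the hunk quoted above, the __GFP_NOWARN idea would look roughly
like this (an untested sketch):

	tx = kcalloc(count, sizeof(struct netdev_queue),
		     GFP_KERNEL | __GFP_NOWARN);
	if (!tx) {
		tx = vzalloc(count * sizeof(struct netdev_queue));
		if (!tx)
			return -ENOMEM;
	}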
On Tue, Jun 18, 2013 at 11:34 PM, Eric Dumazet <[email protected]> wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Since we've reduce the size of tun_struct and use flex array to allocate netdev
>> queues, it's safe for us to increase the limit of queues in tuntap.
>
> Its already safe to increase max queues to 16, without your patches 1 &
> 2
How about 32? Will kmem size be an issue?
As others have pointed out it's best to allocate one queue per CPU
(virtual or real)
and > 16 CPU machines are very common. (I only asked for 16 initially
out of concern
for kmem size.)
Thanks,
Jerry
>
>
>
On 06/19/2013 05:56 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
>
>> Well KVM supports up to 160 VCPUs on x86.
>>
>> Creating a queue per CPU is very reasonable, and
>> assuming cache line size of 64 bytes, netdev_queue seems to be 320
>> bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
>> I agree most people don't have such systems yet, but
>> they do exist.
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
>
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
>
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
>
> Having a single pointer means that we can :
>
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.
>
> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.
A drawback of NETIF_F_LLTX is that we may contend on the qdisc lock,
especially when we have a huge number of tx queues.
>
> This way, you can have real per cpu memory, with proper NUMA affinity.
>
>
>
On Thu, 2013-06-20 at 13:14 +0800, Jason Wang wrote:
> A drawback of NETIF_F_LLTX is that we may contend on qdisc lock
> especially when we have a huge number of tx queues.
For your information loopback driver is LLTX, and there is no qdisc on
it, unless you specifically add one.
Once you add a qdisc on a device, you hit the typical contention on a
spinlock. But its hard to design a parallel qdisc. So far nobody did
that.