G'day,
This patch attempts to implement a suggestion made by Neil Brown in
offline discussions, to extend the Linux NFS server to use multiple
UDP sockets for load sharing. It turns out that the single UDP socket
is a throughput limiter at surprisingly small loads. With this patch
my test machine goes from being limited at about 146 MB/s in cached
read loads over UDP to about 210-230 MB/s, roughly what the same test
achieves over TCP.
The good thing about this patch is that it uses a couple of handy
network stack features to create multiple UDP sockets automatically
and almost completely transparently to userspace. Neither the userspace
pieces (nfs-utils, the portmapper, the client mount) nor the rest of the
kernel network stack needs any modification. The patch creates one UDP socket per
network interface, bound to that network interface; all sockets have
the same port.
With the patch, userspace sees
# ifconfig
eth0 Link encap:Ethernet [...] # used for management only
eth1 Link encap:Ethernet [...]
eth2 Link encap:Ethernet [...]
eth3 Link encap:Ethernet [...]
lo Link encap:Local Loopback [...]
# netstat -anu
Active Internet connections (servers and established)
Proto Recv-Q Send-Q Local Address Foreign Address State
[...]
udp 0 0 0.0.0.0:2049 0.0.0.0:*
udp 0 0 0.0.0.0:2049 0.0.0.0:*
udp 0 0 0.0.0.0:2049 0.0.0.0:*
udp 0 0 0.0.0.0:2049 0.0.0.0:*
udp 0 0 0.0.0.0:2049 0.0.0.0:*
[...]
# rpcinfo -p
program vers proto port
[...]
100003 2 udp 2049 nfs
100003 3 udp 2049 nfs
100003 2 tcp 2049 nfs
100003 3 tcp 2049 nfs
[...]
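As an aside, the kernel feature being relied on here is that several UDP
sockets may share a port as long as each is bound to a different device:
the bind conflict check compares sk_bound_dev_if, the same field that
SO_BINDTODEVICE sets from userspace. Here's a minimal userspace sketch of
that rule (illustrative only, not part of the patch; the interface names
are examples, and SO_BINDTODEVICE needs root):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Bind a UDP socket to the given port, tied to one network device. */
static int bind_udp_to_dev(const char *ifname, unsigned short port)
{
        struct sockaddr_in sin;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return -1;
        if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
                       ifname, strlen(ifname) + 1) < 0)
                goto fail;
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_ANY);
        sin.sin_port = htons(port);
        if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
                goto fail;
        return fd;
fail:
        close(fd);
        return -1;
}

int main(void)
{
        const char *ifs[] = { "eth1", "eth2", "eth3" };  /* example NICs */
        unsigned int i;

        /* all three binds to the same port succeed, one per device */
        for (i = 0; i < sizeof(ifs) / sizeof(ifs[0]); i++)
                printf("%s: fd %d\n", ifs[i], bind_udp_to_dev(ifs[i], 12049));
        return 0;
}

(The port is arbitrary; a wildcard socket already bound to the same port,
as a stock nfsd has on 2049, would normally conflict with the device-bound
ones.)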
The downside with multiple sockets (both variants that I've tried)
is that while the server supports greater loads, some combinations of
irq-to-cpu mappings and loads have resulted in dramatically unbalanced
service compared to a single-socket server. For example, a single-socket
server spreads 146 MB/s among three NICs at roughly 48 MB/s per NIC,
but the worst-case test showed a multiple-socket server spreading a
larger load as 90.6, 7.3, and 107.0 MB/s. Other tests show
less dramatic imbalances such as 43.6, 72.5, and 97.5 MB/s, and yet
others show balanced loads.
I'm not sure if this is an Altix platform issue or a problem in the
network stack, so I'm pushing this patch out for comments and testing.
Another unknown is the behaviour in the presence of virtual interfaces
(tunnels, bonding, etc.).
Anyway, here it is:
Index: linux/fs/nfsd/nfssvc.c
===================================================================
--- linux.orig/fs/nfsd/nfssvc.c Wed May 12 16:27:02 2004
+++ linux/fs/nfsd/nfssvc.c Fri May 14 11:18:44 2004
@@ -31,6 +31,8 @@
#include <linux/nfsd/stats.h>
#include <linux/nfsd/cache.h>
#include <linux/lockd/bind.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
#define NFSDDBG_FACILITY NFSDDBG_SVC
@@ -52,6 +54,8 @@
static void nfsd(struct svc_rqst *rqstp);
struct timeval nfssvc_boot;
static struct svc_serv *nfsd_serv;
+static unsigned short nfsd_port;
+static int nfsd_num_udp_socks;
static atomic_t nfsd_busy;
static unsigned long nfsd_last_call;
static spinlock_t nfsd_call_lock = SPIN_LOCK_UNLOCKED;
@@ -75,6 +79,44 @@
return nfsd_serv->sv_nrthreads;
}
+static int
+nfsd_netdev_notifier(struct notifier_block *self, unsigned long code, void *data)
+{
+ struct net_device *dev = (struct net_device *)data;
+ int err;
+
+ switch (code)
+ {
+ case NETDEV_UP: /* device coming up */
+ dprintk("nfsd: interface %s coming up, creating socket\n",
+ dev->name);
+ lock_kernel();
+ err = svc_makesock_dev(nfsd_serv, IPPROTO_UDP, nfsd_port,
+ dev->ifindex);
+ if (err < 0)
+ printk(KERN_ERR "nfsd: failed to create socket for interface %s\n",
+ dev->name);
+ else
+ nfsd_num_udp_socks++;
+ unlock_kernel();
+ break;
+
+ case NETDEV_GOING_DOWN: /* device going down */
+ dprintk("nfsd: interface %s going down, removing socket\n",
+ dev->name);
+ lock_kernel();
+ if (svc_delete_socket_dev(nfsd_serv, dev->ifindex) >= 0)
+ nfsd_num_udp_socks--;
+ unlock_kernel();
+ break;
+ }
+ return 0;
+}
+
+static struct notifier_block nfsd_netdev_nb = {
+ .notifier_call = nfsd_netdev_notifier
+};
+
int
nfsd_svc(unsigned short port, int nrservs)
{
@@ -101,9 +143,24 @@
nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE);
if (nfsd_serv == NULL)
goto out;
- error = svc_makesock(nfsd_serv, IPPROTO_UDP, port);
- if (error < 0)
- goto failure;
+
+ /*
+ * Register a notifier to be called when net device
+ * state changes; as a side effect the callback is
+ * immediately called for all current devices.
+ */
+ nfsd_num_udp_socks = 0;
+ nfsd_port = port;
+ register_netdevice_notifier(&nfsd_netdev_nb);
+ if (nfsd_num_udp_socks == 0) {
+ /* a socket is bound to the port, or no up devices */
+ unregister_netdevice_notifier(&nfsd_netdev_nb);
+
+ dprintk("nfsd: falling back to global socket\n");
+ error = svc_makesock(nfsd_serv, IPPROTO_UDP, port);
+ if (error < 0)
+ goto failure;
+ }
#ifdef CONFIG_NFSD_TCP
error = svc_makesock(nfsd_serv, IPPROTO_TCP, port);
@@ -267,6 +324,7 @@
if (serv->sv_nrthreads==1) {
printk(KERN_WARNING "nfsd: last server has exited\n");
+ unregister_netdevice_notifier(&nfsd_netdev_nb);
if (err != SIG_NOCLEAN) {
printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
nfsd_export_flush();
Index: linux/include/linux/sunrpc/svcsock.h
===================================================================
--- linux.orig/include/linux/sunrpc/svcsock.h Wed May 12 16:27:02 2004
+++ linux/include/linux/sunrpc/svcsock.h Wed May 12 16:46:43 2004
@@ -56,7 +56,10 @@
* Function prototypes.
*/
int svc_makesock(struct svc_serv *, int, unsigned short);
+int svc_makesock_dev(struct svc_serv *, int, unsigned short,
+ int bind_dev);
void svc_delete_socket(struct svc_sock *);
+int svc_delete_socket_dev(struct svc_serv *serv, int bind_dev);
int svc_recv(struct svc_serv *, struct svc_rqst *, long);
int svc_send(struct svc_rqst *);
void svc_drop(struct svc_rqst *);
Index: linux/net/sunrpc/svcsock.c
===================================================================
--- linux.orig/net/sunrpc/svcsock.c Wed May 12 16:27:02 2004
+++ linux/net/sunrpc/svcsock.c Fri May 14 11:14:08 2004
@@ -1460,7 +1460,7 @@
svc_tcp_init(svsk);
spin_lock_bh(&serv->sv_lock);
- if (!pmap_register) {
+ if (!pmap_register && sock->type == SOCK_STREAM) {
set_bit(SK_TEMP, &svsk->sk_flags);
list_add(&svsk->sk_list, &serv->sv_tempsocks);
serv->sv_tmpcnt++;
@@ -1482,17 +1482,18 @@
* Create socket for RPC service.
*/
static int
-svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
+svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin,
+ int bind_dev)
{
struct svc_sock *svsk;
struct socket *sock;
int error;
int type;
- dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n",
+ dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d, %d)\n",
serv->sv_program->pg_name, protocol,
NIPQUAD(sin->sin_addr.s_addr),
- ntohs(sin->sin_port));
+ ntohs(sin->sin_port), bind_dev);
if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
printk(KERN_WARNING "svc: only UDP and TCP "
@@ -1507,11 +1508,14 @@
if (sin != NULL) {
if (type == SOCK_STREAM)
sock->sk->sk_reuse = 1; /* allow address reuse */
+ if (bind_dev)
+ sock->sk->sk_bound_dev_if = bind_dev;
error = sock->ops->bind(sock, (struct sockaddr *) sin,
sizeof(*sin));
if (error < 0)
goto bummer;
}
+
if (protocol == IPPROTO_TCP) {
if ((error = sock->ops->listen(sock, 64)) < 0)
@@ -1528,15 +1532,15 @@
}
/*
- * Remove a dead socket
+ * Common code to remove a dead socket. Should be called with
+ * the svc_serv's spinlock held.
*/
-void
-svc_delete_socket(struct svc_sock *svsk)
+static void
+__svc_delete_socket(struct svc_sock *svsk)
{
struct svc_serv *serv;
struct sock *sk;
- dprintk("svc: svc_delete_socket(%p)\n", svsk);
serv = svsk->sk_server;
sk = svsk->sk_sk;
@@ -1545,8 +1549,6 @@
sk->sk_data_ready = svsk->sk_odata;
sk->sk_write_space = svsk->sk_owspace;
- spin_lock_bh(&serv->sv_lock);
-
list_del_init(&svsk->sk_list);
list_del_init(&svsk->sk_ready);
if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
@@ -1554,21 +1556,61 @@
serv->sv_tmpcnt--;
if (!svsk->sk_inuse) {
- spin_unlock_bh(&serv->sv_lock);
sock_release(svsk->sk_sock);
kfree(svsk);
} else {
- spin_unlock_bh(&serv->sv_lock);
dprintk(KERN_NOTICE "svc: server socket destroy delayed\n");
/* svsk->sk_server = NULL; */
}
}
/*
+ * Remove a dead socket
+ */
+void
+svc_delete_socket(struct svc_sock *svsk)
+{
+ struct svc_serv *serv = svsk->sk_server;
+
+ dprintk("svc: svc_delete_socket(%p)\n", svsk);
+
+ spin_lock_bh(&serv->sv_lock);
+ __svc_delete_socket(svsk);
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Remove any socket attached to the service which is bound to
+ * the given interface index. Used when an interface goes down.
+ * Returns 0 if successful or a negative error code.
+ */
+int
+svc_delete_socket_dev(struct svc_serv *serv, int bind_dev)
+{
+ struct list_head *p;
+ int ret = -ENODEV;
+
+ dprintk("svc: svc_delete_socket_dev(%p, %d)\n", serv, bind_dev);
+
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each(p, &serv->sv_permsocks) {
+ struct svc_sock *svsk = list_entry(p, struct svc_sock, sk_list);
+ if (svsk->sk_sk->sk_bound_dev_if == bind_dev) {
+ __svc_delete_socket(svsk);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_bh(&serv->sv_lock);
+ return ret;
+}
+
+/*
* Make a socket for nfsd and lockd
*/
int
-svc_makesock(struct svc_serv *serv, int protocol, unsigned short port)
+svc_makesock_dev(struct svc_serv *serv, int protocol, unsigned short port,
+ int bind_dev)
{
struct sockaddr_in sin;
@@ -1576,7 +1618,13 @@
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = INADDR_ANY;
sin.sin_port = htons(port);
- return svc_create_socket(serv, protocol, &sin);
+ return svc_create_socket(serv, protocol, &sin, bind_dev);
+}
+
+int
+svc_makesock(struct svc_serv *serv, int protocol, unsigned short port)
+{
+ return svc_makesock_dev(serv, protocol, port, 0);
}
/*
Greg.
--
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.
On Thu, 2004-05-13 at 22:23, Greg Banks wrote:
> G'day,
>
> This patch attempts to implement a suggestion made by Neil Brown in
> offline discussions, to extend the Linux NFS server to use multiple
> UDP sockets for load sharing. It turns out that the single UDP socket
> is a throughput limiter at surprisingly small loads. With this patch
> my test machine goes from being limited at about 146 MB/s in cached
> read loads over UDP to about 210~230 MB/s, which is about the same
> number as the same test over TCP.
Greg,
I'm confused here. Are you actually implying that socket contention is
responsible for a full *40%* decrease in throughput? If not, please
could you explain which bottleneck this patch is removing?
Also, isn't that call to sock_release() while holding serv->sv_lock a
tad deadlock-prone? Note that sock_release() will call bh_lock_sock()...
Cheers,
Trond
On Fri, May 14, 2004 at 04:21:50PM -0400, Trond Myklebust wrote:
>
> I'm confused here. Are you actually implying that socket contention is
> responsible for a full *40%* decrease in throughput?
At that level of throughput, yes.
> Also, isn't that call to sock_release() while holding serv->sv_lock a
> tad deadlock-prone? Note that sock_release() will call bh_lock_sock()...
Ok, I'll fix that by leaving the unlocks alone and changing the
semantics of __svc_delete_socket() to "call with lock, drops the lock".
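Something along these lines (a sketch of the intent only, not the actual
respin; the body is the one from the patch above with the unlocks put
back in):

static void
__svc_delete_socket(struct svc_sock *svsk)  /* call with sv_lock held; drops it */
{
        struct svc_serv *serv = svsk->sk_server;
        struct sock *sk = svsk->sk_sk;

        /* restore the saved socket callbacks, as before */
        sk->sk_data_ready = svsk->sk_odata;
        sk->sk_write_space = svsk->sk_owspace;

        list_del_init(&svsk->sk_list);
        list_del_init(&svsk->sk_ready);
        if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
                if (test_bit(SK_TEMP, &svsk->sk_flags))
                        serv->sv_tmpcnt--;

        if (!svsk->sk_inuse) {
                /* drop sv_lock before sock_release() takes socket locks */
                spin_unlock_bh(&serv->sv_lock);
                sock_release(svsk->sk_sock);
                kfree(svsk);
        } else {
                spin_unlock_bh(&serv->sv_lock);
                dprintk("svc: server socket destroy delayed\n");
        }
}

Callers (svc_delete_socket() and the loop in svc_delete_socket_dev())
would then take sv_lock as before but skip their own unlock on the path
where __svc_delete_socket() has run.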
Greg.
--
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.
On Sat, 2004-05-15 at 02:38, Greg Banks wrote:
> On Fri, May 14, 2004 at 04:21:50PM -0400, Trond Myklebust wrote:
> >
> > I'm confused here. Are you actually implying that socket contention is
> > responsible for a full *40%* decrease in throughput?
>
> At that level of throughput, yes.
So what exactly *is* this bottleneck? As far as we're concerned, a
socket is basically just a buffer and a couple of locks.
IOW, are you seeing actual lock contention here, or could we just obtain
the same effect by bumping the send or receive buffer size on the
existing UDP socket?
If you are indeed seeing lock contention, then exactly which lock are we
talking about? Is it spinning on sending the reply, is it the receive
softirq, or is it one of those RPC svcsock-specific locks?
Cheers,
Trond
On Saturday May 15, [email protected] wrote:
> On Sat, 2004-05-15 at 02:38, Greg Banks wrote:
> > On Fri, May 14, 2004 at 04:21:50PM -0400, Trond Myklebust wrote:
> > >
> > > I'm confused here. Are you actually implying that socket contention is
> > > responsible for a full *40%* decrease in throughput?
> >
> > At that level of throughput, yes.
>
> So what exactly *is* this bottleneck? As far as we're concerned, a
> socket is basically just a buffer and a couple of locks.
>
It is almost certainly svsk->sk_sem. We hold this while calling
sock->ops->sendpage on each individual page of the reply - we don't
want them mixed up with some other reply.
Remember that the test here involves lots of large READ requests.
Each will have a few pages (this is an ia64 with large pages, so there
will probably be only one or two pages or page-fragments of data) which
will need to be fragmented into multiple UDP packets.
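(For reference, the rough shape of that path, as a simplified sketch
rather than the exact 2.6 source; the whole per-page loop runs with
svsk->sk_sem already held by svc_send(), so one reply at a time per
socket:)

static int svc_sendto_sketch(struct svc_rqst *rqstp, struct xdr_buf *xdr)
{
        struct socket *sock = rqstp->rq_sock->sk_sock;
        struct page **ppage = xdr->pages;
        size_t base = xdr->page_base;
        unsigned int pglen = xdr->page_len;
        int sent = 0, result;

        while (pglen > 0) {
                size_t size = min_t(size_t, PAGE_SIZE - base, pglen);

                /* one sendpage() per page (or page fragment) of the reply */
                result = sock->ops->sendpage(sock, *ppage, base, size,
                                             pglen > size ? MSG_MORE : 0);
                if (result < 0)
                        return result;
                sent += result;
                pglen -= size;
                base = 0;
                ppage++;
        }
        return sent;
}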
NeilBrown
On Sun, May 16, 2004 at 08:24:31AM +1000, Neil Brown wrote:
> On Saturday May 15, [email protected] wrote:
> > So what exactly *is* this bottleneck? As far as we're concerned, a
> > socket is basically just a buffer and a couple of locks.
>
> It is almost certainly svsk->sk_sem. We hold this while calling
> sock->ops->sendpage on each individual page of the reply - we don't
> want them mixed up with some other reply.
Yes, svsk->sk_sem is the culprit.
The test is not CPU limited, so the limit is probably a sleeping lock.
Profiling by context switch shows (butterfly edited to reduce clutter):
-----------------------------------------------
...
0.00 15369.48 15362/55422 schedule_timeout [14]
0.00 16902.23 16894/55422 _down [12]
0.00 22457.94 22447/55422 cpu_idle [9]
[5] 100.0 0.00 55449.00 55422 schedule [5]
...
-----------------------------------------------
...
0.00 16901.23 16893/16894 svc_send [13]
[12] 30.5 0.00 16902.23 16894 _down [12] (actually __down)
0.00 16902.23 16894/55422 schedule [5]
...
-----------------------------------------------
...
0.00 15204.40 15197/15362 svc_recv [15]
[14] 27.7 0.00 15369.48 15362 schedule_timeout [14]
0.00 15369.48 15362/55422 schedule [5]
...
-----------------------------------------------
0.00 16901.23 16891/16891 svc_process [11]
[13] 30.5 0.00 16901.23 16891 svc_send [13]
0.00 16901.23 16893/16894 _down [12]
-----------------------------------------------
0.00 16983.27 16972/16972 nfsd [8]
[11] 30.6 0.00 16983.27 16972 svc_process [11]
0.00 16901.23 16891/16891 svc_send [13]
0.00 82.04 81/81 nfsd_dispatch [35]
-----------------------------------------------
This shows that
* 40.5% of context switches are from CPUs going out of the idle
loop (presumably because an nfsd has become runnable).
* 27.4% of context switches are from nfsd's going idle in svc_recv()
* 30.5% of context switches are from nfsd's calling _down() in
svc_send() while sending replies. There's only one call to
__down() in svc_send, from the down() inline:
int
svc_send(struct svc_rqst *rqstp)
{
struct svc_sock *svsk;
int len;
struct xdr_buf *xb;
[...]
/* Grab svsk->sk_sem to serialize outgoing data. */
down(&svsk->sk_sem); <----------
if (test_bit(SK_DEAD, &svsk->sk_flags))
len = -ENOTCONN;
else
len = svsk->sk_sendto(rqstp);
up(&svsk->sk_sem);
svc_sock_release(rqstp);
[...]
}
> Remember that the test here involves lots of large READ requests.
> Each will have a few pages (this is an ia64 with large pages, so there
> will probably be only one or two pages or page-fragments of data) which
> will need to be fragmented into multiple UDP packets.
Other profiling and tracing code has shown the following behaviour.
The READs are 32K, with IRIX clients which align their requests to
32K so these comprise exactly 2 x 16K pages. MTU is standard 1500
so each READ reply is fragmented into 23 frames, which is seen by
the driver as non-linear skbs with headers and 1 or 2 page-frags.
In total each READ reply takes 21*2+2*3 = 48 send ring descriptors,
of the 512 available in the NIC. The send rate is sufficiently
low that the send ring never gets close to filling, so sent packets
are never queued in software and each call to svc_udp_sendto() runs
through the IP fragmentation code, allocates 23 small skbs, and drops
straight down into the driver to fill in send ring descriptors.
So almost the entire send path is serialised on svsk->sk_sem, which
is why we need more than one UDP socket.
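A quick back-of-the-envelope check of those numbers (a standalone
sketch; the ~140 byte RPC/NFS reply header is an assumption, the real
header length varies a little):

#include <stdio.h>

int main(void)
{
        const int frag_payload = 1500 - 20;     /* IP payload per fragment at MTU 1500 */
        const int reply = 8 + 140 + 32 * 1024;  /* UDP header + ~RPC header + 32K data */
        int frags = (reply + frag_payload - 1) / frag_payload;

        printf("fragments per 32K READ reply: %d\n", frags);          /* 23 */
        printf("send ring descriptors used:   %d\n", 21 * 2 + 2 * 3); /* 48 */
        return 0;
}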
> On Saturday May 15, [email protected] wrote:
> IOW, are you seeing actual lock contention here, or could we just obtain
> the same effect by bumping the send or receive buffer size on the
> existing UDP socket?
Bumping up the socket send and receive spaces has negligible effect.
I tested this by tweaking svcsock.c to add a rational multiplier into
the code which sets the spaces, and ran a 32K UDP READ test (3 gige
NICs, 6 clients, 128 nfsd's, single UDP socket). These numbers are
the average of 5 runs:
mult      space   throughput
        (bytes)       (MB/s)
----  ---------   ----------
 1/8     553344        147.2
 1/4    1106688        145.0
 1/2    2213376        145.0
   1    4426752        140.1   <---- default case
   2    8853504        141.1
   4   17707008        140.7
   8   35414016        141.4
Note that by using 128 threads I've already pushed the socket spaces
beyond the point of diminishing returns with a single UDP socket.
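(The default row can be cross-checked against the sizing heuristic in
the UDP receive path; the sketch below assumes the reported space is the
(sv_nrthreads + 3) * sv_bufsz figure handed to svc_sock_setbufsize(),
with sv_bufsz = NFSD_BUFSIZE = 1024 + 32768 bytes in this tree.)

#include <stdio.h>

int main(void)
{
        const int nrthreads = 128;              /* nfsd threads in the test */
        const int bufsz = 1024 + 32 * 1024;     /* assumed NFSD_BUFSIZE */

        /* prints 4426752 bytes, matching the "mult 1" row above */
        printf("default socket space: %d bytes\n", (nrthreads + 3) * bufsz);
        return 0;
}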
Greg.
--
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.