LinuxLists.cc - [RFC,PATCH 11/38] svc: Add xpo

2007-11-29 23:20:24

Subject: [RFC,PATCH 11/38] svc: Add xpo_accept transport function

Previously, the accept logic looked into the socket state to determine
whether to call accept or recv when data-ready was indicated on an endpoint.
Since some transports don't use sockets, this logic was changed to use a flag
bit (SK_LISTENER) to identify listening endpoints. A transport function
(xpo_accept) was added to allow each transport to define its own accept
processing. A transport's initialization logic is reponsible for setting the
SK_LISTENER bit. I didn't see any way to do this in transport independent
logic since the passive side of a UDP connection doesn't listen and
always recv's.

In the svc_recv function, if the SK_LISTENER bit is set, the transport
xpo_accept function is called to handle accept processing.

Note that all functions are defined even if they don't make sense
for a given transport. For example, accept doesn't mean anything for
UDP. The fuction is defined anyway and bug checks if called. The
UDP transport should never set the SK_LISTENER bit.

The code that poaches connections when the connection
limit is hit was moved to a subroutine to make the accept logic path
easier to follow. Since this is in the new connection path, it should
not be a performance issue.

Signed-off-by: Tom Tucker <[email protected]>
---

include/linux/sunrpc/svc_xprt.h | 1
include/linux/sunrpc/svcsock.h | 1
net/sunrpc/svcsock.c | 127 +++++++++++++++++++++------------------
3 files changed, 72 insertions(+), 57 deletions(-)

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 3adc8f3..1527ff1 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -10,6 +10,7 @@
#include <linux/sunrpc/svc.h>

struct svc_xprt_ops {
+ struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
int (*xpo_has_wspace)(struct svc_xprt *);
int (*xpo_recvfrom)(struct svc_rqst *);
void (*xpo_prep_reply_hdr)(struct svc_rqst *);
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 08e78d0..9882ce0 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -36,6 +36,7 @@ struct svc_sock {
#define SK_DEFERRED 8 /* request on sk_deferred */
#define SK_OLD 9 /* used for temp socket aging mark+sweep */
#define SK_DETACHED 10 /* detached from tempsocks list */
+#define SK_LISTENER 11 /* listening endpoint */

atomic_t sk_reserved; /* space on outq that is reserved */

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 38ecdd1..661162b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -912,6 +912,12 @@ static int svc_udp_has_wspace(struct svc_xprt *xprt)
return 1;
}

+static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
+{
+ BUG();
+ return NULL;
+}
+
static struct svc_xprt_ops svc_udp_ops = {
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
@@ -920,6 +926,7 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_free = svc_sock_free,
.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
.xpo_has_wspace = svc_udp_has_wspace,
+ .xpo_accept = svc_udp_accept,
};

static struct svc_xprt_class svc_udp_class = {
@@ -1044,9 +1051,9 @@ static inline int svc_port_is_privileged(struct sockaddr *sin)
/*
* Accept a TCP connection
*/
-static void
-svc_tcp_accept(struct svc_sock *svsk)
+static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sockaddr_storage addr;
struct sockaddr *sin = (struct sockaddr *) &addr;
struct svc_serv *serv = svsk->sk_server;
@@ -1058,7 +1065,7 @@ svc_tcp_accept(struct svc_sock *svsk)

dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
if (!sock)
- return;
+ return NULL;

clear_bit(SK_CONN, &svsk->sk_flags);
err = kernel_accept(sock, &newsock, O_NONBLOCK);
@@ -1069,7 +1076,7 @@ svc_tcp_accept(struct svc_sock *svsk)
else if (err != -EAGAIN && net_ratelimit())
printk(KERN_WARNING "%s: accept failed (err %d)!\n",
serv->sv_name, -err);
- return;
+ return NULL;
}

set_bit(SK_CONN, &svsk->sk_flags);
@@ -1115,59 +1122,14 @@ svc_tcp_accept(struct svc_sock *svsk)

svc_sock_received(newsvsk);

- /* make sure that we don't have too many active connections.
- * If we have, something must be dropped.
- *
- * There's no point in trying to do random drop here for
- * DoS prevention. The NFS clients does 1 reconnect in 15
- * seconds. An attacker can easily beat that.
- *
- * The only somewhat efficient mechanism would be if drop
- * old connections from the same IP first. But right now
- * we don't even record the client IP in svc_sock.
- */
- if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
- struct svc_sock *svsk = NULL;
- spin_lock_bh(&serv->sv_lock);
- if (!list_empty(&serv->sv_tempsocks)) {
- if (net_ratelimit()) {
- /* Try to help the admin */
- printk(KERN_NOTICE "%s: too many open TCP "
- "sockets, consider increasing the "
- "number of nfsd threads\n",
- serv->sv_name);
- printk(KERN_NOTICE
- "%s: last TCP connect from %s\n",
- serv->sv_name, __svc_print_addr(sin,
- buf, sizeof(buf)));
- }
- /*
- * Always select the oldest socket. It's not fair,
- * but so is life
- */
- svsk = list_entry(serv->sv_tempsocks.prev,
- struct svc_sock,
- sk_list);
- set_bit(SK_CLOSE, &svsk->sk_flags);
- atomic_inc(&svsk->sk_inuse);
- }
- spin_unlock_bh(&serv->sv_lock);
-
- if (svsk) {
- svc_sock_enqueue(svsk);
- svc_sock_put(svsk);
- }
-
- }
-
if (serv->sv_stats)
serv->sv_stats->nettcpconn++;

- return;
+ return &newsvsk->sk_xprt;

failed:
sock_release(newsock);
- return;
+ return NULL;
}

/*
@@ -1192,12 +1154,6 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
return svc_deferred_recv(rqstp);
}

- if (svsk->sk_sk->sk_state == TCP_LISTEN) {
- svc_tcp_accept(svsk);
- svc_sock_received(svsk);
- return 0;
- }
-
if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
/* sndbuf needs to have room for one request
* per thread, otherwise we can stall even when the
@@ -1403,6 +1359,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_free = svc_sock_free,
.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
.xpo_has_wspace = svc_tcp_has_wspace,
+ .xpo_accept = svc_tcp_accept,
};

static struct svc_xprt_class svc_tcp_class = {
@@ -1433,6 +1390,7 @@ svc_tcp_init(struct svc_sock *svsk)

if (sk->sk_state == TCP_LISTEN) {
dprintk("setting up TCP socket for listening\n");
+ set_bit(SK_LISTENER, &svsk->sk_flags);
sk->sk_data_ready = svc_tcp_listen_data_ready;
set_bit(SK_CONN, &svsk->sk_flags);
} else {
@@ -1484,6 +1442,55 @@ svc_sock_update_bufs(struct svc_serv *serv)
spin_unlock_bh(&serv->sv_lock);
}

+static void
+svc_check_conn_limits(struct svc_serv *serv)
+{
+ char buf[RPC_MAX_ADDRBUFLEN];
+
+ /* make sure that we don't have too many active connections.
+ * If we have, something must be dropped.
+ *
+ * There's no point in trying to do random drop here for
+ * DoS prevention. The NFS clients does 1 reconnect in 15
+ * seconds. An attacker can easily beat that.
+ *
+ * The only somewhat efficient mechanism would be if drop
+ * old connections from the same IP first. But right now
+ * we don't even record the client IP in svc_sock.
+ */
+ if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
+ struct svc_sock *svsk = NULL;
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_tempsocks)) {
+ if (net_ratelimit()) {
+ /* Try to help the admin */
+ printk(KERN_NOTICE "%s: too many open TCP "
+ "sockets, consider increasing the "
+ "number of nfsd threads\n",
+ serv->sv_name);
+ printk(KERN_NOTICE
+ "%s: last TCP connect from %s\n",
+ serv->sv_name, buf);
+ }
+ /*
+ * Always select the oldest socket. It's not fair,
+ * but so is life
+ */
+ svsk = list_entry(serv->sv_tempsocks.prev,
+ struct svc_sock,
+ sk_list);
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ atomic_inc(&svsk->sk_inuse);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ if (svsk) {
+ svc_sock_enqueue(svsk);
+ svc_sock_put(svsk);
+ }
+ }
+}
+
/*
* Receive the next request on any socket. This code is carefully
* organised not to touch any cachelines in the shared svc_serv
@@ -1579,6 +1586,12 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
dprintk("svc_recv: found SK_CLOSE\n");
svc_delete_socket(svsk);
+ } else if (test_bit(SK_LISTENER, &svsk->sk_flags)) {
+ struct svc_xprt *newxpt;
+ newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt);
+ if (newxpt)
+ svc_check_conn_limits(svsk->sk_server);
+ svc_sock_received(svsk);
} else {
dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));

2007-11-30 21:03:14

by Chuck Lever

[permalink] [raw]

Subject: Re: [RFC,PATCH 11/38] svc: Add xpo_accept transport function

The refactoring here helps clarity. More below.

On Nov 29, 2007, at 5:40 PM, Tom Tucker wrote:
> Previously, the accept logic looked into the socket state to determine
> whether to call accept or recv when data-ready was indicated on an
> endpoint.
> Since some transports don't use sockets, this logic was changed to
> use a flag
> bit (SK_LISTENER) to identify listening endpoints. A transport
> function
> (xpo_accept) was added to allow each transport to define its own
> accept
> processing. A transport's initialization logic is reponsible for
> setting the
> SK_LISTENER bit. I didn't see any way to do this in transport
> independent
> logic since the passive side of a UDP connection doesn't listen and
> always recv's.
>
> In the svc_recv function, if the SK_LISTENER bit is set, the transport
> xpo_accept function is called to handle accept processing.
>
> Note that all functions are defined even if they don't make sense
> for a given transport. For example, accept doesn't mean anything for
> UDP. The fuction is defined anyway and bug checks if called. The

s/fuction/function/

:-O

> UDP transport should never set the SK_LISTENER bit.
>
> The code that poaches connections when the connection
> limit is hit was moved to a subroutine to make the accept logic path
> easier to follow. Since this is in the new connection path, it should
> not be a performance issue.
>
> Signed-off-by: Tom Tucker <[email protected]>
> ---
>
> include/linux/sunrpc/svc_xprt.h | 1
> include/linux/sunrpc/svcsock.h | 1
> net/sunrpc/svcsock.c | 127 ++++++++++++++++++++
> +------------------
> 3 files changed, 72 insertions(+), 57 deletions(-)
>
> diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/
> svc_xprt.h
> index 3adc8f3..1527ff1 100644
> --- a/include/linux/sunrpc/svc_xprt.h
> +++ b/include/linux/sunrpc/svc_xprt.h
> @@ -10,6 +10,7 @@
> #include <linux/sunrpc/svc.h>
>
> struct svc_xprt_ops {
> + struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
> int (*xpo_has_wspace)(struct svc_xprt *);
> int (*xpo_recvfrom)(struct svc_rqst *);
> void (*xpo_prep_reply_hdr)(struct svc_rqst *);
> diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/
> svcsock.h
> index 08e78d0..9882ce0 100644
> --- a/include/linux/sunrpc/svcsock.h
> +++ b/include/linux/sunrpc/svcsock.h
> @@ -36,6 +36,7 @@ struct svc_sock {
> #define SK_DEFERRED 8 /* request on sk_deferred */
> #define SK_OLD 9 /* used for temp socket aging mark+sweep */
> #define SK_DETACHED 10 /* detached from tempsocks list */
> +#define SK_LISTENER 11 /* listening endpoint */
>
> atomic_t sk_reserved; /* space on outq that is reserved */
>
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 38ecdd1..661162b 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -912,6 +912,12 @@ static int svc_udp_has_wspace(struct svc_xprt
> *xprt)
> return 1;
> }
>
> +static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
> +{
> + BUG();
> + return NULL;
> +}
> +
> static struct svc_xprt_ops svc_udp_ops = {
> .xpo_recvfrom = svc_udp_recvfrom,
> .xpo_sendto = svc_udp_sendto,
> @@ -920,6 +926,7 @@ static struct svc_xprt_ops svc_udp_ops = {
> .xpo_free = svc_sock_free,
> .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
> .xpo_has_wspace = svc_udp_has_wspace,
> + .xpo_accept = svc_udp_accept,
> };
>
> static struct svc_xprt_class svc_udp_class = {
> @@ -1044,9 +1051,9 @@ static inline int svc_port_is_privileged
> (struct sockaddr *sin)
> /*
> * Accept a TCP connection
> */
> -static void
> -svc_tcp_accept(struct svc_sock *svsk)
> +static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
> {
> + struct svc_sock *svsk = container_of(xprt, struct svc_sock,
> sk_xprt);
> struct sockaddr_storage addr;
> struct sockaddr *sin = (struct sockaddr *) &addr;
> struct svc_serv *serv = svsk->sk_server;
> @@ -1058,7 +1065,7 @@ svc_tcp_accept(struct svc_sock *svsk)
>
> dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
> if (!sock)
> - return;
> + return NULL;
>
> clear_bit(SK_CONN, &svsk->sk_flags);
> err = kernel_accept(sock, &newsock, O_NONBLOCK);
> @@ -1069,7 +1076,7 @@ svc_tcp_accept(struct svc_sock *svsk)
> else if (err != -EAGAIN && net_ratelimit())
> printk(KERN_WARNING "%s: accept failed (err %d)!\n",
> serv->sv_name, -err);
> - return;
> + return NULL;
> }
>
> set_bit(SK_CONN, &svsk->sk_flags);
> @@ -1115,59 +1122,14 @@ svc_tcp_accept(struct svc_sock *svsk)
>
> svc_sock_received(newsvsk);
>
> - /* make sure that we don't have too many active connections.
> - * If we have, something must be dropped.
> - *
> - * There's no point in trying to do random drop here for
> - * DoS prevention. The NFS clients does 1 reconnect in 15
> - * seconds. An attacker can easily beat that.
> - *
> - * The only somewhat efficient mechanism would be if drop
> - * old connections from the same IP first. But right now
> - * we don't even record the client IP in svc_sock.
> - */

> - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
> - struct svc_sock *svsk = NULL;
> - spin_lock_bh(&serv->sv_lock);
> - if (!list_empty(&serv->sv_tempsocks)) {
> - if (net_ratelimit()) {
> - /* Try to help the admin */
> - printk(KERN_NOTICE "%s: too many open TCP "
> - "sockets, consider increasing the "
> - "number of nfsd threads\n",
> - serv->sv_name);
> - printk(KERN_NOTICE
> - "%s: last TCP connect from %s\n",
> - serv->sv_name, __svc_print_addr(sin,
> - buf, sizeof(buf)));
> - }
> - /*
> - * Always select the oldest socket. It's not fair,
> - * but so is life
> - */
> - svsk = list_entry(serv->sv_tempsocks.prev,
> - struct svc_sock,
> - sk_list);
> - set_bit(SK_CLOSE, &svsk->sk_flags);
> - atomic_inc(&svsk->sk_inuse);
> - }
> - spin_unlock_bh(&serv->sv_lock);
> -
> - if (svsk) {
> - svc_sock_enqueue(svsk);
> - svc_sock_put(svsk);
> - }
> -
> - }
> -
> if (serv->sv_stats)
> serv->sv_stats->nettcpconn++;
>
> - return;
> + return &newsvsk->sk_xprt;
>
> failed:
> sock_release(newsock);
> - return;
> + return NULL;
> }
>
> /*
> @@ -1192,12 +1154,6 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
> return svc_deferred_recv(rqstp);
> }
>
> - if (svsk->sk_sk->sk_state == TCP_LISTEN) {
> - svc_tcp_accept(svsk);
> - svc_sock_received(svsk);
> - return 0;
> - }
> -
> if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
> /* sndbuf needs to have room for one request
> * per thread, otherwise we can stall even when the
> @@ -1403,6 +1359,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
> .xpo_free = svc_sock_free,
> .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
> .xpo_has_wspace = svc_tcp_has_wspace,
> + .xpo_accept = svc_tcp_accept,
> };
>
> static struct svc_xprt_class svc_tcp_class = {
> @@ -1433,6 +1390,7 @@ svc_tcp_init(struct svc_sock *svsk)
>
> if (sk->sk_state == TCP_LISTEN) {
> dprintk("setting up TCP socket for listening\n");
> + set_bit(SK_LISTENER, &svsk->sk_flags);
> sk->sk_data_ready = svc_tcp_listen_data_ready;
> set_bit(SK_CONN, &svsk->sk_flags);
> } else {
> @@ -1484,6 +1442,55 @@ svc_sock_update_bufs(struct svc_serv *serv)
> spin_unlock_bh(&serv->sv_lock);
> }
>
> +static void
> +svc_check_conn_limits(struct svc_serv *serv)

Style police want the return value type and function declaration on
the same line.

> +{
> + char buf[RPC_MAX_ADDRBUFLEN];
> +
> + /* make sure that we don't have too many active connections.
> + * If we have, something must be dropped.
> + *
> + * There's no point in trying to do random drop here for
> + * DoS prevention. The NFS clients does 1 reconnect in 15
> + * seconds. An attacker can easily beat that.
> + *
> + * The only somewhat efficient mechanism would be if drop
> + * old connections from the same IP first. But right now
> + * we don't even record the client IP in svc_sock.
> + */

Just a personal preference: I think this would better serve as a
block comment placed just before the function declaration above.

> + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {

Would be nice if the naked constants were defined as macros
somewhere. Some rationale for these values would be nice (does
anyone (ie, Greg!) remember why these values were chosen?).

> + struct svc_sock *svsk = NULL;
> + spin_lock_bh(&serv->sv_lock);
> + if (!list_empty(&serv->sv_tempsocks)) {
> + if (net_ratelimit()) {
> + /* Try to help the admin */
> + printk(KERN_NOTICE "%s: too many open TCP "
> + "sockets, consider increasing the "
> + "number of nfsd threads\n",
> + serv->sv_name);
> + printk(KERN_NOTICE
> + "%s: last TCP connect from %s\n",
> + serv->sv_name, buf);
> + }
> + /*
> + * Always select the oldest socket. It's not fair,
> + * but so is life
> + */
> + svsk = list_entry(serv->sv_tempsocks.prev,
> + struct svc_sock,
> + sk_list);
> + set_bit(SK_CLOSE, &svsk->sk_flags);
> + atomic_inc(&svsk->sk_inuse);
> + }
> + spin_unlock_bh(&serv->sv_lock);
> +
> + if (svsk) {
> + svc_sock_enqueue(svsk);
> + svc_sock_put(svsk);
> + }
> + }
> +}
> +
> /*
> * Receive the next request on any socket. This code is carefully
> * organised not to touch any cachelines in the shared svc_serv
> @@ -1579,6 +1586,12 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
> if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
> dprintk("svc_recv: found SK_CLOSE\n");
> svc_delete_socket(svsk);
> + } else if (test_bit(SK_LISTENER, &svsk->sk_flags)) {
> + struct svc_xprt *newxpt;
> + newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt);
> + if (newxpt)
> + svc_check_conn_limits(svsk->sk_server);
> + svc_sock_received(svsk);
> } else {
> dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
> rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com

2007-11-30 21:43:15

by Tom Tucker

[permalink] [raw]

Subject: Re: [RFC,PATCH 11/38] svc: Add xpo_accept transport function

On Fri, 2007-11-30 at 16:01 -0500, Chuck Lever wrote:
> The refactoring here helps clarity. More below.
>
> On Nov 29, 2007, at 5:40 PM, Tom Tucker wrote:
> > Previously, the accept logic looked into the socket state to determine
> > whether to call accept or recv when data-ready was indicated on an
> > endpoint.
> > Since some transports don't use sockets, this logic was changed to
> > use a flag
> > bit (SK_LISTENER) to identify listening endpoints. A transport
> > function
> > (xpo_accept) was added to allow each transport to define its own
> > accept
> > processing. A transport's initialization logic is reponsible for
> > setting the
> > SK_LISTENER bit. I didn't see any way to do this in transport
> > independent
> > logic since the passive side of a UDP connection doesn't listen and
> > always recv's.
> >
> > In the svc_recv function, if the SK_LISTENER bit is set, the transport
> > xpo_accept function is called to handle accept processing.
> >
> > Note that all functions are defined even if they don't make sense
> > for a given transport. For example, accept doesn't mean anything for
> > UDP. The fuction is defined anyway and bug checks if called. The
>
> s/fuction/function/
>
> :-O

ok.

>
> > UDP transport should never set the SK_LISTENER bit.
> >
> > The code that poaches connections when the connection
> > limit is hit was moved to a subroutine to make the accept logic path
> > easier to follow. Since this is in the new connection path, it should
> > not be a performance issue.
> >
> > Signed-off-by: Tom Tucker <[email protected]>
> > ---
> >
> > include/linux/sunrpc/svc_xprt.h | 1
> > include/linux/sunrpc/svcsock.h | 1
> > net/sunrpc/svcsock.c | 127 ++++++++++++++++++++
> > +------------------
> > 3 files changed, 72 insertions(+), 57 deletions(-)
> >
> > diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/
> > svc_xprt.h
> > index 3adc8f3..1527ff1 100644
> > --- a/include/linux/sunrpc/svc_xprt.h
> > +++ b/include/linux/sunrpc/svc_xprt.h
> > @@ -10,6 +10,7 @@
> > #include <linux/sunrpc/svc.h>
> >
> > struct svc_xprt_ops {
> > + struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
> > int (*xpo_has_wspace)(struct svc_xprt *);
> > int (*xpo_recvfrom)(struct svc_rqst *);
> > void (*xpo_prep_reply_hdr)(struct svc_rqst *);
> > diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/
> > svcsock.h
> > index 08e78d0..9882ce0 100644
> > --- a/include/linux/sunrpc/svcsock.h
> > +++ b/include/linux/sunrpc/svcsock.h
> > @@ -36,6 +36,7 @@ struct svc_sock {
> > #define SK_DEFERRED 8 /* request on sk_deferred */
> > #define SK_OLD 9 /* used for temp socket aging mark+sweep */
> > #define SK_DETACHED 10 /* detached from tempsocks list */
> > +#define SK_LISTENER 11 /* listening endpoint */
> >
> > atomic_t sk_reserved; /* space on outq that is reserved */
> >
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index 38ecdd1..661162b 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -912,6 +912,12 @@ static int svc_udp_has_wspace(struct svc_xprt
> > *xprt)
> > return 1;
> > }
> >
> > +static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
> > +{
> > + BUG();
> > + return NULL;
> > +}
> > +
> > static struct svc_xprt_ops svc_udp_ops = {
> > .xpo_recvfrom = svc_udp_recvfrom,
> > .xpo_sendto = svc_udp_sendto,
> > @@ -920,6 +926,7 @@ static struct svc_xprt_ops svc_udp_ops = {
> > .xpo_free = svc_sock_free,
> > .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
> > .xpo_has_wspace = svc_udp_has_wspace,
> > + .xpo_accept = svc_udp_accept,
> > };
> >
> > static struct svc_xprt_class svc_udp_class = {
> > @@ -1044,9 +1051,9 @@ static inline int svc_port_is_privileged
> > (struct sockaddr *sin)
> > /*
> > * Accept a TCP connection
> > */
> > -static void
> > -svc_tcp_accept(struct svc_sock *svsk)
> > +static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
> > {
> > + struct svc_sock *svsk = container_of(xprt, struct svc_sock,
> > sk_xprt);
> > struct sockaddr_storage addr;
> > struct sockaddr *sin = (struct sockaddr *) &addr;
> > struct svc_serv *serv = svsk->sk_server;
> > @@ -1058,7 +1065,7 @@ svc_tcp_accept(struct svc_sock *svsk)
> >
> > dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
> > if (!sock)
> > - return;
> > + return NULL;
> >
> > clear_bit(SK_CONN, &svsk->sk_flags);
> > err = kernel_accept(sock, &newsock, O_NONBLOCK);
> > @@ -1069,7 +1076,7 @@ svc_tcp_accept(struct svc_sock *svsk)
> > else if (err != -EAGAIN && net_ratelimit())
> > printk(KERN_WARNING "%s: accept failed (err %d)!\n",
> > serv->sv_name, -err);
> > - return;
> > + return NULL;
> > }
> >
> > set_bit(SK_CONN, &svsk->sk_flags);
> > @@ -1115,59 +1122,14 @@ svc_tcp_accept(struct svc_sock *svsk)
> >
> > svc_sock_received(newsvsk);
> >
> > - /* make sure that we don't have too many active connections.
> > - * If we have, something must be dropped.
> > - *
> > - * There's no point in trying to do random drop here for
> > - * DoS prevention. The NFS clients does 1 reconnect in 15
> > - * seconds. An attacker can easily beat that.
> > - *
> > - * The only somewhat efficient mechanism would be if drop
> > - * old connections from the same IP first. But right now
> > - * we don't even record the client IP in svc_sock.
> > - */
>
> > - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
> > - struct svc_sock *svsk = NULL;
> > - spin_lock_bh(&serv->sv_lock);
> > - if (!list_empty(&serv->sv_tempsocks)) {
> > - if (net_ratelimit()) {
> > - /* Try to help the admin */
> > - printk(KERN_NOTICE "%s: too many open TCP "
> > - "sockets, consider increasing the "
> > - "number of nfsd threads\n",
> > - serv->sv_name);
> > - printk(KERN_NOTICE
> > - "%s: last TCP connect from %s\n",
> > - serv->sv_name, __svc_print_addr(sin,
> > - buf, sizeof(buf)));
> > - }
> > - /*
> > - * Always select the oldest socket. It's not fair,
> > - * but so is life
> > - */
> > - svsk = list_entry(serv->sv_tempsocks.prev,
> > - struct svc_sock,
> > - sk_list);
> > - set_bit(SK_CLOSE, &svsk->sk_flags);
> > - atomic_inc(&svsk->sk_inuse);
> > - }
> > - spin_unlock_bh(&serv->sv_lock);
> > -
> > - if (svsk) {
> > - svc_sock_enqueue(svsk);
> > - svc_sock_put(svsk);
> > - }
> > -
> > - }
> > -
> > if (serv->sv_stats)
> > serv->sv_stats->nettcpconn++;
> >
> > - return;
> > + return &newsvsk->sk_xprt;
> >
> > failed:
> > sock_release(newsock);
> > - return;
> > + return NULL;
> > }
> >
> > /*
> > @@ -1192,12 +1154,6 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
> > return svc_deferred_recv(rqstp);
> > }
> >
> > - if (svsk->sk_sk->sk_state == TCP_LISTEN) {
> > - svc_tcp_accept(svsk);
> > - svc_sock_received(svsk);
> > - return 0;
> > - }
> > -
> > if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
> > /* sndbuf needs to have room for one request
> > * per thread, otherwise we can stall even when the
> > @@ -1403,6 +1359,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
> > .xpo_free = svc_sock_free,
> > .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
> > .xpo_has_wspace = svc_tcp_has_wspace,
> > + .xpo_accept = svc_tcp_accept,
> > };
> >
> > static struct svc_xprt_class svc_tcp_class = {
> > @@ -1433,6 +1390,7 @@ svc_tcp_init(struct svc_sock *svsk)
> >
> > if (sk->sk_state == TCP_LISTEN) {
> > dprintk("setting up TCP socket for listening\n");
> > + set_bit(SK_LISTENER, &svsk->sk_flags);
> > sk->sk_data_ready = svc_tcp_listen_data_ready;
> > set_bit(SK_CONN, &svsk->sk_flags);
> > } else {
> > @@ -1484,6 +1442,55 @@ svc_sock_update_bufs(struct svc_serv *serv)
> > spin_unlock_bh(&serv->sv_lock);
> > }
> >
> > +static void
> > +svc_check_conn_limits(struct svc_serv *serv)
>
> Style police want the return value type and function declaration on
> the same line.
>

yes.

> > +{
> > + char buf[RPC_MAX_ADDRBUFLEN];
> > +
> > + /* make sure that we don't have too many active connections.
> > + * If we have, something must be dropped.
> > + *
> > + * There's no point in trying to do random drop here for
> > + * DoS prevention. The NFS clients does 1 reconnect in 15
> > + * seconds. An attacker can easily beat that.
> > + *
> > + * The only somewhat efficient mechanism would be if drop
> > + * old connections from the same IP first. But right now
> > + * we don't even record the client IP in svc_sock.
> > + */
>
> Just a personal preference: I think this would better serve as a
> block comment placed just before the function declaration above.
>

agreed.

> > + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
>
> Would be nice if the naked constants were defined as macros
> somewhere. Some rationale for these values would be nice (does
> anyone (ie, Greg!) remember why these values were chosen?).
>

I've noodled on this before and produced nothing but heat. I'm afraid
the best I can come up with is #define THREE and #define TWENTY :-)

> > + struct svc_sock *svsk = NULL;
> > + spin_lock_bh(&serv->sv_lock);
> > + if (!list_empty(&serv->sv_tempsocks)) {
> > + if (net_ratelimit()) {
> > + /* Try to help the admin */
> > + printk(KERN_NOTICE "%s: too many open TCP "
> > + "sockets, consider increasing the "
> > + "number of nfsd threads\n",
> > + serv->sv_name);
> > + printk(KERN_NOTICE
> > + "%s: last TCP connect from %s\n",
> > + serv->sv_name, buf);
> > + }
> > + /*
> > + * Always select the oldest socket. It's not fair,
> > + * but so is life
> > + */
> > + svsk = list_entry(serv->sv_tempsocks.prev,
> > + struct svc_sock,
> > + sk_list);
> > + set_bit(SK_CLOSE, &svsk->sk_flags);
> > + atomic_inc(&svsk->sk_inuse);
> > + }
> > + spin_unlock_bh(&serv->sv_lock);
> > +
> > + if (svsk) {
> > + svc_sock_enqueue(svsk);
> > + svc_sock_put(svsk);
> > + }
> > + }
> > +}
> > +
> > /*
> > * Receive the next request on any socket. This code is carefully
> > * organised not to touch any cachelines in the shared svc_serv
> > @@ -1579,6 +1586,12 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
> > if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
> > dprintk("svc_recv: found SK_CLOSE\n");
> > svc_delete_socket(svsk);
> > + } else if (test_bit(SK_LISTENER, &svsk->sk_flags)) {
> > + struct svc_xprt *newxpt;
> > + newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt);
> > + if (newxpt)
> > + svc_check_conn_limits(svsk->sk_server);
> > + svc_sock_received(svsk);
> > } else {
> > dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
> > rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
>
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
>
>
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html