LinuxLists.cc - Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

2009-11-01 13:01:34

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index c118b2a..ec78a4b 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -2,6 +2,7 @@
#define __CRYPTOHASH_H

#define SHA_DIGEST_WORDS 5
+#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
#define SHA_WORKSPACE_WORDS 80

void sha_init(__u32 *buf);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 51b7426..f669c43 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1526,12 +1526,18 @@ static inline int tcp_s_data_size(const struct tcp_sock *tp)
: 0;
}

+/* Using SHA1 for now, define some constants.
+ */
+#define COOKIE_DIGEST_WORDS (SHA_DIGEST_WORDS)
+#define COOKIE_MESSAGE_WORDS (SHA_MESSAGE_BYTES / 4)
+#define COOKIE_WORKSPACE_WORDS (COOKIE_DIGEST_WORDS + COOKIE_MESSAGE_WORDS)
+
/* As tcp_request_sock has already been extended in other places, the
* only remaining method is to pass stack values along as function
* parameters. These parameters are not needed after sending SYNACK.
*/
struct tcp_extend_values {
- u8 cookie_bakery[TCP_COOKIE_MAX];
+ u32 cookie_bakery[COOKIE_WORKSPACE_WORDS];
u8 cookie_plus;
u8 cookie_in_always:1,
cookie_out_never:1;
@@ -1542,6 +1548,8 @@ static inline struct tcp_extend_values *tcp_xv(const struct request_values *rvp)
return (struct tcp_extend_values *)rvp;
}

+extern int tcp_cookie_generator(struct tcp_extend_values *xvp);
+
extern void tcp_v4_init(void);
extern void tcp_init(void);

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 12409df..160b077 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,7 @@
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
+#include <linux/time.h>

#include <net/icmp.h>
#include <net/tcp.h>
@@ -2933,6 +2934,146 @@ EXPORT_SYMBOL(tcp_md5_hash_key);

#endif

+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover. Each secret value has 4 states:
+ *
+ * Generating.
+ * Generates new Responder-Cookies, but not yet used for primary
+ * verification. This is a short-term state, typically lasting only
+ * one round trip time (RTT).
+ *
+ * Primary.
+ * Used both for generation and primary verification.
+ *
+ * Retiring.
+ * Used for verification, until the first failure that can be
+ * verified by the newer Generating secret. At that time, this
+ * cookie's state is changed to Secondary, and the Generating
+ * cookie's state is changed to Primary. This is a short-term state,
+ * typically lasting only one round trip time (RTT).
+ *
+ * Secondary.
+ * Used for secondary verification, after primary verification
+ * failures. This state lasts no more than twice the Maximum Segment
+ * Lifetime (2MSL). Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+ /* The secret is divided into two parts. The digest part is the
+ * equivalent of previously hashing a secret and saving the state,
+ * and serves as an initialization vector (IV). The message part
+ * serves as the trailing secret.
+ */
+ u32 secrets[COOKIE_WORKSPACE_WORDS];
+ unsigned long expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_RWLOCK(tcp_secret_locker);
+
+/* Fill cookie_bakery with current generator, updating as needed.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(struct tcp_extend_values *xvp)
+{
+ u32 secrets[COOKIE_WORKSPACE_WORDS];
+ unsigned long jiffy = jiffies;
+
+ if (unlikely(NULL == tcp_secret_primary)) {
+ struct timespec tv;
+
+ getnstimeofday(&tv);
+ get_random_bytes(secrets, sizeof(secrets));
+
+ /* The first time, paranoia assumes that the randomization
+ * function isn't as strong. But this secret initialization
+ * is delayed until the last possible moment (packet arrival).
+ * Although that time is observable, it is unpredictably
+ * variable. Mash in the most volatile clock bits available,
+ * and expire the secret extra quickly.
+ */
+ secrets[COOKIE_DIGEST_WORDS+0] ^= (u32)tv.tv_nsec;
+
+ write_lock(&tcp_secret_locker);
+ if (NULL != tcp_secret_primary) {
+ /* initializated by another */
+ memcpy(&xvp->cookie_bakery[0],
+ &tcp_secret_primary->secrets[0],
+ sizeof(tcp_secret_primary->secrets));
+ } else {
+ /* still needs initialization */
+ memcpy(&xvp->cookie_bakery[0],
+ &secrets[0],
+ sizeof(secrets));
+ memcpy(&tcp_secret_one.secrets[0],
+ &secrets[0],
+ sizeof(secrets));
+ memcpy(&tcp_secret_two.secrets[0],
+ &secrets[0],
+ sizeof(secrets));
+
+ tcp_secret_one.expires = jiffy + TCP_SECRET_1MSL;
+ tcp_secret_primary = &tcp_secret_one;
+
+ tcp_secret_two.expires = 0; /* past due */
+ tcp_secret_secondary = &tcp_secret_two;
+ }
+ write_unlock(&tcp_secret_locker);
+ } else if (unlikely(time_after(jiffy, tcp_secret_primary->expires))) {
+ get_random_bytes(secrets, sizeof(secrets));
+
+ write_lock(&tcp_secret_locker);
+ if (!time_after(jiffy, tcp_secret_primary->expires)) {
+ /* refreshed by another */
+ memcpy(&xvp->cookie_bakery[0],
+ &tcp_secret_generating->secrets[0],
+ sizeof(tcp_secret_generating->secrets));
+ } else {
+ /* still needs refreshing */
+ memcpy(&xvp->cookie_bakery[0],
+ &secrets[0],
+ sizeof(secrets));
+ memcpy(&tcp_secret_secondary->secrets[0],
+ &secrets[0],
+ sizeof(secrets));
+
+ tcp_secret_secondary->expires = jiffy
+ + TCP_SECRET_LIFE;
+ tcp_secret_generating = tcp_secret_secondary;
+
+ tcp_secret_primary->expires = jiffy
+ + TCP_SECRET_2MSL;
+ tcp_secret_retiring = tcp_secret_primary;
+ }
+ write_unlock(&tcp_secret_locker);
+ } else {
+ read_lock(&tcp_secret_locker);
+ if (unlikely(NULL != tcp_secret_generating)) {
+ memcpy(&xvp->cookie_bakery[0],
+ &tcp_secret_generating->secrets[0],
+ sizeof(tcp_secret_generating->secrets));
+ } else {
+ memcpy(&xvp->cookie_bakery[0],
+ &tcp_secret_primary->secrets[0],
+ sizeof(tcp_secret_primary->secrets));
+ }
+ read_unlock(&tcp_secret_locker);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
void tcp_done(struct sock *sk)
{
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -3060,6 +3201,11 @@ void __init tcp_init(void)
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

tcp_register_congestion_control(&tcp_reno);
+
+ tcp_secret_generating = NULL;
+ tcp_secret_primary = NULL;
+ tcp_secret_retiring = NULL;
+ tcp_secret_secondary = NULL;
}

EXPORT_SYMBOL(tcp_close);

Attachments:

TCPCT+1d.patch (6.88 kB)

2009-11-01 18:03:45

by Eric Dumazet

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson a ?crit :
> Since October 4th, I've repeatedly asked publicly for assistance with these
> Linux-specific memory locking constructs and cryptography. I've also sent
> private messages. No help has been forthcoming. None. Nada.
>
> At this point, I've spent weeks re-spinning code that I'd understood was
> approved a year ago. The whole project should have been finished by now!

Your messages on netdev are two weeks old, not one year, and came during
LKS. Many developpers were busy in Japan.

>
> So, I'll try a larger audience. Could somebody take a look at my usage of
> read and write locking?
>
> NB, I'm trying to port some 15-year-old fairly simple and straightforward
> (single cpu) code from the KA9Q cooperative multitasking platform.
>
> I've examined existing code used for syncookies and TCP MD5 authenticators.
> Neither meets my needs, as this secret is updated every few minutes. Both
> have very different approaches. They are rarely used. My code will be
> used on the order of tens of thousands of connections per second.
>
> Moreover, it seems to my naive eye that the syncookie per cpu code simply
> doesn't work properly. The workspace is allocated per cpu, but the cpu
> could change during the extensive SHA1 computations. Bug?
>
> Therefore, I'm approaching this as simply as possible. I'm particularly
> concerned about the initialization and cache state of memory pointers.
>
> Does the locking handle this? Or is there more to be done?
>

This patch looks fine, but I dont see how this new function is used.

Some points :

1) We are working hard to remove rwlocks from network stack, so please dont
add a new one. You probably can use a seqlock or RCU, or a server handling
10.000 connections request per second on many NIC will hit this rwlock.

2)

} else if (unlikely(time_after(jiffy, tcp_secret_primary->expires))) {
get_random_bytes(secrets, sizeof(secrets));

write_lock(&tcp_secret_locker);

It would be better to first get the lock, then get random_bytes, in order
not wasting entropy.

3) If you change secret ever 600 seconds, it might be better to use a timer
so that you dont have to check expiration and this logic at each SYN packet.
(Disociate the lookup (read-only, done many time per second) from the updates
(trigerred by a timer every 600 secs))

(Not counting you'll probably need to use a similar lookup algo for the ACK
packet coming from client)

2009-11-02 10:39:14

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

Eric Dumazet wrote:
> William Allen Simpson a ?crit :
>> Since October 4th, I've repeatedly asked publicly for assistance with these
>> Linux-specific memory locking constructs and cryptography. I've also sent
>> private messages. No help has been forthcoming. None. Nada.
>>
>> At this point, I've spent weeks re-spinning code that I'd understood was
>> approved a year ago. The whole project should have been finished by now!
>
> Your messages on netdev are two weeks old, not one year, and came during
> LKS. Many developpers were busy in Japan.
>
Thank you for your helpful comments. I'm not familiar with LKS. Is that a
month long project?

The message in question was "query: per cpu hash pool spinlock", sent:
"Sun, 04 Oct 2009 06:37:39 -0400".

My initial query for this series was "query: adding a sysctl", sent:
"Fri, 02 Oct 2009 00:00:05 -0400".

The earlier RFC approval was circa 1 year, 11 weeks, 2 days, 23 hours ago:

http://article.gmane.org/gmane.linux.network/102779

>> So, I'll try a larger audience. Could somebody take a look at my usage of
>> read and write locking?
>>
>> NB, I'm trying to port some 15-year-old fairly simple and straightforward
>> (single cpu) code from the KA9Q cooperative multitasking platform.
>>
>> I've examined existing code used for syncookies and TCP MD5 authenticators.
>> Neither meets my needs, as this secret is updated every few minutes. Both
>> have very different approaches. They are rarely used. My code will be
>> used on the order of tens of thousands of connections per second.
>>
>> Moreover, it seems to my naive eye that the syncookie per cpu code simply
>> doesn't work properly. The workspace is allocated per cpu, but the cpu
>> could change during the extensive SHA1 computations. Bug?
>>
The (buggy?) syncookie code that I'm avoiding/replacing is:

static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
ipv4_cookie_scratch);

static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
__u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);

memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
tmp[0] = (__force u32)saddr;
tmp[1] = (__force u32)daddr;
tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
tmp[3] = count;
sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);

return tmp[17];
}

It appears to me that the operations could be interrupted at any time, and
the fairly expensive sha transform (or probably any of the assignments)
could be corrupted?

That is, cpu0 grabs a scratch area, copies some data into it, interrupts,
cpu0 comes along again with another packet, points into the same workspace,
mashes the data, while cpu1 completes the previous operation with the
old tmp pointer on the stack.

Worse, this is called twice, each time getting tmp, and mashing the data,
and at the same time others are calling it twice more for verification.

Since syncookies only occur under stress, the code isn't well tested, and
the only symptom would be the returned packet would be dropped after
failing to verify. Since this only happens when lots of packets are
arriving, dropped packets probably aren't noticed.

However, that would be unacceptable for my code.

> This patch looks fine, but I dont see how this new function is used.
>
It is used at the places in the previous numbered patch (part 1c) that
currently say, "secret recipe not yet implemented".

And perhaps to replace the syncookie code mentioned above.

> Some points :
>
> 1) We are working hard to remove rwlocks from network stack, so please dont
> add a new one.

Botheration! I wish that was *documented* somewhere, perhaps in:
Documentation/spinlocks.txt

> add a new one. You probably can use a seqlock or RCU, or a server handling
> 10.000 connections request per second on many NIC will hit this rwlock.
>
Yep. Nobody in the tcp code is using them, but RCU looks interesting, and
may be well suited. I'll read up on it.

> 2)
>
> } else if (unlikely(time_after(jiffy, tcp_secret_primary->expires))) {
> get_random_bytes(secrets, sizeof(secrets));
>
> write_lock(&tcp_secret_locker);
>
> It would be better to first get the lock, then get random_bytes, in order
> not wasting entropy.
>
I'd put that outside the locked section as it's considerably more expensive
(hashing) than a memcpy. Entropy is never "wasted", as good cryptographic
random functions never reveal underlying entropy. I know there were some
papers a few years ago that found flaws in Linux random functions, but
surely that's been fixed. I assume you folks are using Yarrow or its
successors by now.

In any case, perhaps this RCU thingy obviates that issue.

>
> 3) If you change secret ever 600 seconds, it might be better to use a timer
> so that you dont have to check expiration and this logic at each SYN packet.
> (Disociate the lookup (read-only, done many time per second) from the updates
> (trigerred by a timer every 600 secs))
>
Part of the logic is not to update the generation secret until a new packet
arrives, allowing maximal entropy to be gathered beforehand. This is
especially worrisome for the first packets after booting. It takes into
account inter-packet arrival timing, and expiring after MSL. So, a timer
isn't feasible in this instance.

> (Not counting you'll probably need to use a similar lookup algo for the ACK
> packet coming from client)
>
Yes. A timer could be used to expire and clear old verification secrets,
and I'll look into that in the future ACK patch.

Thanks again.

2009-11-02 10:50:33

by David Miller

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

From: William Allen Simpson <[email protected]>
Date: Mon, 02 Nov 2009 05:39:14 -0500

> Thank you for your helpful comments. I'm not familiar with LKS. Is
> that a month long project?

It's the Linux Kernel Summit. It's well publicized:

http://events.linuxfoundation.org/events/kernel-summit

It's the yearly meeting of the top Linux kernel developers, it usually
lasts several days and since a large portion of the kernel developers
who attend have to travel on 10+ hour flights to reach the summit
location most choose to pad their trip on either side by several days
in order to recover from jet lag and other effects of travelling.

This year most kernel developers stayed for the rest of the week in
order to attend the Japan Linux Symposium.

Many people were offline a total of 2 weeks or more because of these
events. Some still haven't returned from Japan.

But that's neither here nor there, people sometimes are simply
overloaded with work that is important to them, or from their
view more important than reviewing your work.

Sometimes that happens and you just have to be patient and
understanding.

Complaining that your work isn't getting looked at in a timely manner
will always have the exact opposite effect that you want, it makes
people have a smaller desire to look at your stuff.

2009-11-02 10:56:04

by Eric Dumazet

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson a ?crit :
>
> The (buggy?) syncookie code that I'm avoiding/replacing is:
>
> static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
> ipv4_cookie_scratch);
>
> static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16
> dport,
> u32 count, int c)
> {
> __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
>
> memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
> tmp[0] = (__force u32)saddr;
> tmp[1] = (__force u32)daddr;
> tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
> tmp[3] = count;
> sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
>
> return tmp[17];
> }
>
> It appears to me that the operations could be interrupted at any time, and
> the fairly expensive sha transform (or probably any of the assignments)
> could be corrupted?
>
> That is, cpu0 grabs a scratch area, copies some data into it, interrupts,
> cpu0 comes along again with another packet, points into the same workspace,
> mashes the data, while cpu1 completes the previous operation with the
> old tmp pointer on the stack.
>
> Worse, this is called twice, each time getting tmp, and mashing the data,
> and at the same time others are calling it twice more for verification.
>
> Since syncookies only occur under stress, the code isn't well tested, and
> the only symptom would be the returned packet would be dropped after
> failing to verify. Since this only happens when lots of packets are
> arriving, dropped packets probably aren't noticed.
>
> However, that would be unacceptable for my code.
>

cookie_hash() runs in a non preemptable context. CPU cannot change under us.

(or else, we would not use __get_cpu_var(ipv4_cookie_scratch); )

And of course, each cpu gets its own scratch area, thanks to __get_cpu_var()

2009-11-02 12:36:12

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

Eric Dumazet wrote:
> cookie_hash() runs in a non preemptable context. CPU cannot change under us.
>
> (or else, we would not use __get_cpu_var(ipv4_cookie_scratch); )
>
> And of course, each cpu gets its own scratch area, thanks to __get_cpu_var()
>
Interesting. I'm not sure that running CPU intensive functions like SHA1 in
a non-preemptable context is a good idea. I'd assumed it wasn't!

Perhaps you could point at the documentation in the code that explains this?
Perhaps a function header comment that mentions it?

All I know is (from testing) that the tcp_minisockets.c caller is sometimes
called in a fashion that requires atomic allocation, and other times does not!

See my "Subject: query: tcpdump versus atomic?" thread from Oct 14th.

2009-11-02 13:16:43

by Eric Dumazet

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson a ?crit :
> Eric Dumazet wrote:
>> cookie_hash() runs in a non preemptable context. CPU cannot change
>> under us.
>>
>> (or else, we would not use __get_cpu_var(ipv4_cookie_scratch); )
>>
>> And of course, each cpu gets its own scratch area, thanks to
>> __get_cpu_var()
>>
> Interesting. I'm not sure that running CPU intensive functions like
> SHA1 in
> a non-preemptable context is a good idea. I'd assumed it wasn't!
>
> Perhaps you could point at the documentation in the code that explains
> this?

I suggest you read Documentations/ files about softirq

http://docs.blackfin.uclinux.org/kernel/generated/kernel-hacking.xml

Large part of network code is run by softirq handler, and a softirq handler
is not preemptable with another softirq (including itself).

> Perhaps a function header comment that mentions it?

So we are going to add a header to thousand of functions repeating this prereq ?

>
> All I know is (from testing) that the tcp_minisockets.c caller is sometimes
> called in a fashion that requires atomic allocation, and other times
> does not!

Maybe callers have different contexts (running from softirq handler or
from process context). Atomic ops are expensive and we try to avoid them
if/when possible.

>
> See my "Subject: query: tcpdump versus atomic?" thread from Oct 14th.

You probably add a bug in your kernel, leaving a function with unpaired lock/unlock
of notallow_something/allow_something

There are books about linux internals that you could read if you want some extra
documentation. Dont ask me details, I never read them :)

2009-11-02 17:22:00

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

Eric Dumazet wrote:
> Large part of network code is run by softirq handler, and a softirq handler
> is not preemptable with another softirq (including itself).
>
Thank you. That's helpful to know, as some existing locks have a "bh".
I've never figured out the ip_local_deliver_finish() context.

Knowing that there can only be one instance of the tcp stack running at
any one time, and the cpu never changes even after being interrupted, will
make it much easier to code.

>> Perhaps a function header comment that mentions it?
>
> So we are going to add a header to thousand of functions repeating this prereq ?
>
That's my usual practice. (Dozens would be more accurate in this case.)
I've always found it helpful for those coming after me, and sure would have
found it helpful now myself.... Repetitious, but well worth it.

Especially at tcp_v4_rcv(), as that's called through a vector named
"handler", which was particularly hard to track down.

It has an innocuous header "From tcp_input.c", that doesn't seem to have
anything to do with current reality.... (It's really called from
ip_input.c via af_inet.c).

>> All I know is (from testing) that the tcp_minisockets.c caller is sometimes
>> called in a fashion that requires atomic allocation, and other times
>> does not!
>
> Maybe callers have different contexts (running from softirq handler or
> from process context). Atomic ops are expensive and we try to avoid them
> if/when possible.
>
>> See my "Subject: query: tcpdump versus atomic?" thread from Oct 14th.
>
> You probably add a bug in your kernel, leaving a function with unpaired lock/unlock
> of notallow_something/allow_something
>
(No, I've not yet added locks; obviously, I'm still asking about them.)

Unlikely, as it was easy to reproduce by changing one line, without *any* of
my code present. Usually works, but doesn't work with tcpdump running on
the interface:

struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
- struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+ struct sock *newsk = inet_csk_clone(sk, req, GFP_KERNEL);

if (newsk != NULL) {

[ 2.876485] eth0: RealTek RTL8139 at 0x2000, 00:40:2b:6b:61:36, IRQ 17
[ 2.876490] eth0: Identified 8139 chip type 'RTL-8101'

[ 88.997594] device eth0 entered promiscuous mode
[ 114.827403] BUG: scheduling while atomic: swapper/0/0x10000100
[ 114.827462] Modules linked in: lp snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_timer snd ppdev iTCO_wdt iTCO_vendor_support psmouse soundcore parport_pc intel_agp parport agpgart pcspkr serio_raw shpchp snd_page_alloc 8139too aic7xxx 8139cp
scsi_transport_spi mii floppy
[ 114.827493]
[ 114.827497] Pid: 0, comm: swapper Not tainted (2.6.32-rc3 #4) Imperial
[ 114.827501] EIP: 0060:[<c0123295>] EFLAGS: 00000246 CPU: 0
[ 114.827512] EIP is at native_safe_halt+0x5/0x10
[ 114.827515] EAX: c0740000 EBX: 00000000 ECX: ffff4b6e EDX: 00000000
[ 114.827519] ESI: c07992c0 EDI: c0743000 EBP: c0741fa0 ESP: c0741fa0
[ 114.827522] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
[ 114.827525] CR0: 8005003b CR2: 09278fc4 CR3: 04b56000 CR4: 00000690
[ 114.827529] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[ 114.827532] DR6: ffff0ff0 DR7: 00000400
[ 114.827535] Call Trace:
[ 114.827546] [<c01098b5>] default_idle+0x65/0x90
[ 114.827550] [<c0102062>] cpu_idle+0x52/0x90
[ 114.827558] [<c056cc23>] rest_init+0x53/0x60
[ 114.827565] [<c079c93d>] start_kernel+0x328/0x390
[ 114.827569] [<c079c3ce>] ? unknown_bootoption+0x0/0x1f6
[ 114.827574] [<c079c07e>] i386_start_kernel+0x7e/0xa8
[ 136.570632] device eth0 left promiscuous mode

> There are books about linux internals that you could read if you want some extra
> documentation. Dont ask me details, I never read them :)
>
Sorry, I've only read much of the Documentation directory (some parts
repeatedly), and Googled for more specific information. Pretty sparse!

Thank you again for your patient explanation.

2009-11-02 17:42:02

by Eric Dumazet

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson a ?crit :
> Eric Dumazet wrote:
>> Large part of network code is run by softirq handler, and a softirq
>> handler
>> is not preemptable with another softirq (including itself).
>>
> Thank you. That's helpful to know, as some existing locks have a "bh".
> I've never figured out the ip_local_deliver_finish() context.
>
> Knowing that there can only be one instance of the tcp stack running at
> any one time, and the cpu never changes even after being interrupted, will
> make it much easier to code.

Correction :

There is one instance of sofirq handler running per cpu.

So TCP (or UDP) stack can run simultaneously on several (and eventually all online) cpus.

This is why we still need some locks in various places.

>>
> (No, I've not yet added locks; obviously, I'm still asking about them.)
>
> Unlikely, as it was easy to reproduce by changing one line, without
> *any* of
> my code present. Usually works, but doesn't work with tcpdump running on
> the interface:

Yes, tcpdump has the nasty habit to consume a lot of ram, queuing a copy of
all network traffic on an af_packet socket. (or using a mmap buffer, it depends
on libpcap version you use)

>
> struct sock *tcp_create_openreq_child(struct sock *sk, struct
> request_sock *req, struct sk_buff *skb)
> {
> - struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
> + struct sock *newsk = inet_csk_clone(sk, req, GFP_KERNEL);

Here you want a GFP_KERNEL allocation, that is allowed to sleep if there is not
enough available memory. (It's allowed to sleep to let some processes to free
bit of ram, eventually)

But as the caller of tcp_create_openreq_child() runs from {soft}irq context,
its not allowed to sleep at all, even 10 usecs.

Therefore, linux kernel kindly warns you that's its illegal and deadlockable.

Dont change GFP_ATOMIC here, its really there for a valid reason.
And be prepared to get a NULL result from allocation.

If your machine has problems when running tcpdump, maybe it lacks some ram, maybe you could
tune tcpdump socket to not exhaust all LOWMEM.
I see your kernel is 32bits, so you have only 860 MB of kernel memory, called LOWMEM.
I believe last kernels might have some problems in OOM situations...

2009-11-03 22:38:15

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index c118b2a..ec78a4b 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -2,6 +2,7 @@
#define __CRYPTOHASH_H

#define SHA_DIGEST_WORDS 5
+#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
#define SHA_WORKSPACE_WORDS 80

void sha_init(__u32 *buf);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 51b7426..f669c43 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1526,12 +1526,18 @@ static inline int tcp_s_data_size(const struct tcp_sock *tp)
: 0;
}

+/* Using SHA1 for now, define some constants.
+ */
+#define COOKIE_DIGEST_WORDS (SHA_DIGEST_WORDS)
+#define COOKIE_MESSAGE_WORDS (SHA_MESSAGE_BYTES / 4)
+#define COOKIE_WORKSPACE_WORDS (COOKIE_DIGEST_WORDS + COOKIE_MESSAGE_WORDS)
+
/* As tcp_request_sock has already been extended in other places, the
* only remaining method is to pass stack values along as function
* parameters. These parameters are not needed after sending SYNACK.
*/
struct tcp_extend_values {
- u8 cookie_bakery[TCP_COOKIE_MAX];
+ u32 cookie_bakery[COOKIE_WORKSPACE_WORDS];
u8 cookie_plus;
u8 cookie_in_always:1,
cookie_out_never:1;
@@ -1542,6 +1548,8 @@ static inline struct tcp_extend_values *tcp_xv(const struct request_values *rvp)
return (struct tcp_extend_values *)rvp;
}

+extern int tcp_cookie_generator(struct tcp_extend_values *xvp);
+
extern void tcp_v4_init(void);
extern void tcp_init(void);

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 12409df..a8c5d99 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,7 @@
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
+#include <linux/time.h>

#include <net/icmp.h>
#include <net/tcp.h>
@@ -2933,6 +2934,126 @@ EXPORT_SYMBOL(tcp_md5_hash_key);

#endif

+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover. Each secret value has 4 states:
+ *
+ * Generating. (tcp_secret_generating != tcp_secret_primary)
+ * Generates new Responder-Cookies, but not yet used for primary
+ * verification. This is a short-term state, typically lasting only
+ * one round trip time (RTT).
+ *
+ * Primary. (tcp_secret_generating == tcp_secret_primary)
+ * Used both for generation and primary verification.
+ *
+ * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
+ * Used for verification, until the first failure that can be
+ * verified by the newer Generating secret. At that time, this
+ * cookie's state is changed to Secondary, and the Generating
+ * cookie's state is changed to Primary. This is a short-term state,
+ * typically lasting only one round trip time (RTT).
+ *
+ * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
+ * Used for secondary verification, after primary verification
+ * failures. This state lasts no more than twice the Maximum Segment
+ * Lifetime (2MSL). Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+ /* The secret is divided into two parts. The digest part is the
+ * equivalent of previously hashing a secret and saving the state,
+ * and serves as an initialization vector (IV). The message part
+ * serves as the trailing secret.
+ */
+ u32 secrets[COOKIE_WORKSPACE_WORDS];
+ unsigned long expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Fill cookie_bakery with current generator, updating as needed.
+ * Only called in softirq context.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(struct tcp_extend_values *xvp)
+{
+ unsigned long jiffy = jiffies;
+
+ if (unlikely(time_after(jiffy, tcp_secret_generating->expires))) {
+ u32 secrets[COOKIE_WORKSPACE_WORDS];
+
+ spin_lock(&tcp_secret_locker);
+ if (!time_after(jiffy, tcp_secret_generating->expires)) {
+ /* refreshed by another */
+ spin_unlock(&tcp_secret_locker);
+ memcpy(&xvp->cookie_bakery[0],
+ &tcp_secret_generating->secrets[0],
+ sizeof(tcp_secret_generating->secrets));
+ } else {
+ /* still needs refreshing */
+ get_random_bytes(secrets, sizeof(secrets));
+
+ /* The first time, paranoia assumes that the
+ * randomization function isn't as strong. But,
+ * this secret initialization is delayed until
+ * the last possible moment (packet arrival).
+ * Although that time is observable, it is
+ * unpredictably variable. Mash in the most
+ * volatile clock bits available, and expire the
+ * secret extra quickly.
+ */
+ if (unlikely(tcp_secret_primary->expires ==
+ tcp_secret_secondary->expires)) {
+ struct timespec tv;
+
+ getnstimeofday(&tv);
+ secrets[COOKIE_DIGEST_WORDS+0] ^= (u32)tv.tv_nsec;
+ tcp_secret_secondary->expires = jiffy
+ + TCP_SECRET_1MSL;
+ } else {
+ tcp_secret_secondary->expires = jiffy
+ + TCP_SECRET_LIFE;
+ tcp_secret_primary->expires = jiffy
+ + TCP_SECRET_2MSL;
+ }
+ memcpy(&tcp_secret_secondary->secrets[0],
+ &secrets[0],
+ sizeof(secrets));
+
+ rcu_assign_pointer(tcp_secret_generating,
+ tcp_secret_secondary);
+ rcu_assign_pointer(tcp_secret_retiring,
+ tcp_secret_primary);
+ spin_unlock(&tcp_secret_locker);
+ /* call_rcu() or synchronize_rcu() not needed. */
+
+ memcpy(&xvp->cookie_bakery[0],
+ &secrets[0],
+ sizeof(secrets));
+ }
+ } else {
+ rcu_read_lock_bh();
+ memcpy(&xvp->cookie_bakery[0],
+ &rcu_dereference(tcp_secret_generating)->secrets[0],
+ sizeof(tcp_secret_generating->secrets));
+ rcu_read_unlock_bh();
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
void tcp_done(struct sock *sk)
{
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -2967,6 +3088,7 @@ void __init tcp_init(void)
struct sk_buff *skb = NULL;
unsigned long nr_pages, limit;
int order, i, max_share;
+ unsigned long jiffy = jiffies;

BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

@@ -3060,6 +3182,15 @@ void __init tcp_init(void)
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

tcp_register_congestion_control(&tcp_reno);
+
+ memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+ memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+ tcp_secret_one.expires = jiffy; /* past due */
+ tcp_secret_two.expires = jiffy; /* past due */
+ tcp_secret_generating = &tcp_secret_one;
+ tcp_secret_primary = &tcp_secret_one;
+ tcp_secret_retiring = &tcp_secret_two;
+ tcp_secret_secondary = &tcp_secret_two;
}

EXPORT_SYMBOL(tcp_close);

Attachments:

TCPCT+1d1.patch (6.98 kB)

2009-11-03 23:03:19

by Eric Dumazet

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson a ?crit :
> Eric Dumazet wrote:
>> This patch looks fine, but I dont see how this new function is used.
>>
>> Some points :
>>
>> 1) We are working hard to remove rwlocks from network stack, so please
>> dont
>> add a new one. You probably can use a seqlock or RCU, or a server
>> handling 10.000 connections request per second on many NIC will hit
>> this rwlock.
>>
> This is my attempt at using RCU, as seqlock didn't seem to apply (and is
> missing any Documentation.)
>

That seems very good, thanks, we can sort out details later, when full picture
is available.

> After the discussion about context, one question that I have is the need
> for the _bh suffix?
>
> + rcu_read_lock_bh();
> + memcpy(&xvp->cookie_bakery[0],
> + &rcu_dereference(tcp_secret_generating)->secrets[0],
> + sizeof(tcp_secret_generating->secrets));
> + rcu_read_unlock_bh();
>

Well, you dont need to disable BH in this code running in softirq context only.

Just use rcu_read_lock() (like you use spin_lock() in same function/context)

>
> Documentation/RCU/checklist.txt #7 says:
>
> One exception to this rule: rcu_read_lock() and rcu_read_unlock()
> may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
> in cases where local bottom halves are already known to be
> disabled, for example, in irq or softirq context. Commenting
> such cases is a must, of course! And the jury is still out on
> whether the increased speed is worth it.
>

2009-11-04 21:48:42

by Paul E. McKenney

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

On Tue, Nov 03, 2009 at 05:38:10PM -0500, William Allen Simpson wrote:
> Eric Dumazet wrote:
>> This patch looks fine, but I dont see how this new function is used.
>> Some points :
>> 1) We are working hard to remove rwlocks from network stack, so please
>> dont
>> add a new one. You probably can use a seqlock or RCU, or a server handling
>> 10.000 connections request per second on many NIC will hit this rwlock.
> This is my attempt at using RCU, as seqlock didn't seem to apply (and is
> missing any Documentation.)
>
> After the discussion about context, one question that I have is the need
> for the _bh suffix?
>
> + rcu_read_lock_bh();
> + memcpy(&xvp->cookie_bakery[0],
> + &rcu_dereference(tcp_secret_generating)->secrets[0],
> + sizeof(tcp_secret_generating->secrets));
> + rcu_read_unlock_bh();
>
>
> Documentation/RCU/checklist.txt #7 says:
>
> One exception to this rule: rcu_read_lock() and rcu_read_unlock()
> may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
> in cases where local bottom halves are already known to be
> disabled, for example, in irq or softirq context. Commenting
> such cases is a must, of course! And the jury is still out on
> whether the increased speed is worth it.

I strongly suggest using the matching primitives unless you have a
really strong reason not to.

> diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
> index c118b2a..ec78a4b 100644
> --- a/include/linux/cryptohash.h
> +++ b/include/linux/cryptohash.h

[ . . . ]

> + if (unlikely(tcp_secret_primary->expires ==
> + tcp_secret_secondary->expires)) {
> + struct timespec tv;
> +
> + getnstimeofday(&tv);
> + secrets[COOKIE_DIGEST_WORDS+0] ^= (u32)tv.tv_nsec;
> + tcp_secret_secondary->expires = jiffy
> + + TCP_SECRET_1MSL;
> + } else {
> + tcp_secret_secondary->expires = jiffy
> + + TCP_SECRET_LIFE;
> + tcp_secret_primary->expires = jiffy
> + + TCP_SECRET_2MSL;
> + }
> + memcpy(&tcp_secret_secondary->secrets[0],
> + &secrets[0],
> + sizeof(secrets));
> +
> + rcu_assign_pointer(tcp_secret_generating,
> + tcp_secret_secondary);
> + rcu_assign_pointer(tcp_secret_retiring,
> + tcp_secret_primary);
> + spin_unlock(&tcp_secret_locker);
> + /* call_rcu() or synchronize_rcu() not needed. */

Would you be willing to say why? Are you relying on a time delay for a
given item to pass through tcp_secret_secondary and tcp_secret_retiring
or some such? If so, how do you know that this time delay will always
be long enough?

Or are you just shuffling the data structures around, without ever
freeing them? If so, is it really OK for a given reader to keep a
reference to a given item through the full range of shuffling, especially
given that it might be accesssing this concurrently with the ->expires
assignments above?

Either way, could you please expand the comment to give at least some
hint to the poor guy reading your code? ;-)

Thanx, Paul

2009-11-05 12:17:45

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

Paul E. McKenney wrote:
> On Tue, Nov 03, 2009 at 05:38:10PM -0500, William Allen Simpson wrote:
>> Documentation/RCU/checklist.txt #7 says:
>>
>> One exception to this rule: rcu_read_lock() and rcu_read_unlock()
>> may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
>> in cases where local bottom halves are already known to be
>> disabled, for example, in irq or softirq context. Commenting
>> such cases is a must, of course! And the jury is still out on
>> whether the increased speed is worth it.
>
> I strongly suggest using the matching primitives unless you have a
> really strong reason not to.
>
Eric gave contrary advice. But he also suggested (in an earlier message)
clearing the secrets with a timer, which could be a separate context --
although much later in time.

As you suggest, I'll use the _bh suffix everywhere until every i is dotted
and t is crossed. Then, check for efficiency later after thorough
analysis by experts such as yourself.

This code will be hit on every SYN and SYNACK that has a cookie option.
But it's just prior to a CPU intensive sha_transform -- in comparison,
it's trivial.

>> + rcu_assign_pointer(tcp_secret_generating,
>> + tcp_secret_secondary);
>> + rcu_assign_pointer(tcp_secret_retiring,
>> + tcp_secret_primary);
>> + spin_unlock_bh(&tcp_secret_locker);
>> + /* call_rcu() or synchronize_rcu() not needed. */
>
> Would you be willing to say why? Are you relying on a time delay for a
> given item to pass through tcp_secret_secondary and tcp_secret_retiring
> or some such? If so, how do you know that this time delay will always
> be long enough?
>
> Or are you just shuffling the data structures around, without ever
> freeing them? If so, is it really OK for a given reader to keep a
> reference to a given item through the full range of shuffling, especially
> given that it might be accesssing this concurrently with the ->expires
> assignments above?
>
> Either way, could you please expand the comment to give at least some
> hint to the poor guy reading your code? ;-)
>
Yes. Just shuffling the pointers without ever freeing anything. So,
there's nothing for call_rcu() to do, and nothing else to synchronize
(only the pointers). This assumes that after _unlock_ any CPU cache
with an old pointer->expires will hit the _lock_ code, and that will
update *both* ->expires and the other array elements concurrently?

One of the advantages of this scheme is the new secret is initialized
while the old secret is still used, and the old secret can continue to
be verified as old packets arrive. (I originally designed this for
Photuris [RFC-2522] circa 1995.)

As described in the long header given, each array element goes through
four (4) states. This is handling the first state transition. It will
hit at least 2 more locks, pointer updates, and unlocks before reuse.

Also, a great deal of time passes. After being retired (and expired), it
will be unused for approximately 5 minutes.

All that's a bit long for a comment.

+ /*
+ * The retiring data is never freed. Instead, it is
+ * replaced after later pointer updates and a quiet
+ * time of approximately 5 minutes. There is nothing
+ * for call_rcu() or synchronize_rcu() to handle.
+ */

Clear enough?

2009-11-05 12:45:30

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson wrote:
> Yes. Just shuffling the pointers without ever freeing anything. So,
> there's nothing for call_rcu() to do, and nothing else to synchronize
> (only the pointers). This assumes that after _unlock_ any CPU cache
> with an old pointer->expires will hit the _lock_ code, and that will
> update *both* ->expires and the other array elements concurrently?
>
Reiterating, I've not found Documentation showing that this code works:

+ unsigned long jiffy = jiffies;
+
+ if (unlikely(time_after(jiffy, tcp_secret_generating->expires))) {
+ spin_lock_bh(&tcp_secret_locker);
+ if (!time_after(jiffy, tcp_secret_generating->expires)) {
+ /* refreshed by another */
+ spin_unlock_bh(&tcp_secret_locker);
+ memcpy(&xvp->cookie_bakery[0],
+ &tcp_secret_generating->secrets[0],
+ sizeof(tcp_secret_generating->secrets));
+ } else {

How is it ensured that an old tcp_secret_generating or an old ->expires,
followed by a spin_lock, has updated both?

And even when both are updated, then every word of the ->secrets array has
also been updated in the local cache?

Is this a property of spin_lock()? Or spin_unlock()?

2009-11-05 13:19:28

by Eric Dumazet

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson a ?crit :
> Paul E. McKenney wrote:
>> On Tue, Nov 03, 2009 at 05:38:10PM -0500, William Allen Simpson wrote:
>>> Documentation/RCU/checklist.txt #7 says:
>>>
>>> One exception to this rule: rcu_read_lock() and rcu_read_unlock()
>>> may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
>>> in cases where local bottom halves are already known to be
>>> disabled, for example, in irq or softirq context. Commenting
>>> such cases is a must, of course! And the jury is still out on
>>> whether the increased speed is worth it.
>>
>> I strongly suggest using the matching primitives unless you have a
>> really strong reason not to.
>>
> Eric gave contrary advice. But he also suggested (in an earlier message)
> clearing the secrets with a timer, which could be a separate context --
> although much later in time.
>
> As you suggest, I'll use the _bh suffix everywhere until every i is dotted
> and t is crossed. Then, check for efficiency later after thorough
> analysis by experts such as yourself.
>
> This code will be hit on every SYN and SYNACK that has a cookie option.
> But it's just prior to a CPU intensive sha_transform -- in comparison,
> it's trivial.
>

I think you misunderstood my advice ;)

In the same function, you *cannot* use both variants like your last patch did :

spin_lock(&tcp_secret_locker);

...

rcu_read_lock_bh();
memcpy(&xvp->cookie_bakery[0],
&rcu_dereference(tcp_secret_generating)->secrets[0],
sizeof(tcp_secret_generating->secrets));
rcu_read_unlock_bh();

Reasoning is :

If you need _bh() for the rcu_read_lock_bh(), thats because you know
soft irq can happen anytime (they are not masked).

Then you also need _bh for the spin_lock() call, or risk deadlock.

-> tcp_cookie_generator();
spin_lock();
-> interrupt -> softirq -> SYN frame received -> tcp_cookie_generator() -> spin_lock(); hang

Your choices are :
------------------

1) Caller took care of disabling softirqs (or is only called from softirq handler),
then _bh suffixes are not necessary in tcp_cookie_generator().
-> spin_lock() & rcu_read_lock();

2) You dont know what called you (process context or softirq context)
-> you MUST use _bh prefixes on spin_lock_bh() & rcu_read_lock_bh();

2009-11-05 13:34:23

by Eric Dumazet

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

William Allen Simpson a ?crit :
> William Allen Simpson wrote:
>> Yes. Just shuffling the pointers without ever freeing anything. So,
>> there's nothing for call_rcu() to do, and nothing else to synchronize
>> (only the pointers). This assumes that after _unlock_ any CPU cache
>> with an old pointer->expires will hit the _lock_ code, and that will
>> update *both* ->expires and the other array elements concurrently?
>>
> Reiterating, I've not found Documentation showing that this code works:
>
> + unsigned long jiffy = jiffies;
> +
> + if (unlikely(time_after(jiffy, tcp_secret_generating->expires))) {
> + spin_lock_bh(&tcp_secret_locker);
> + if (!time_after(jiffy, tcp_secret_generating->expires)) {
> + /* refreshed by another */
> + spin_unlock_bh(&tcp_secret_locker);
> + memcpy(&xvp->cookie_bakery[0],
> + &tcp_secret_generating->secrets[0],
> + sizeof(tcp_secret_generating->secrets));
> + } else {
>
> How is it ensured that an old tcp_secret_generating or an old ->expires,
> followed by a spin_lock, has updated both?
>
> And even when both are updated, then every word of the ->secrets array has
> also been updated in the local cache?
>
> Is this a property of spin_lock()? Or spin_unlock()?

Yes,

$ vi +1121 Documentation/memory-barriers.txt

(1) LOCK operation implication:

Memory operations issued after the LOCK will be completed after the LOCK
operation has completed.

Memory operations issued before the LOCK may be completed after the LOCK
operation has completed.

(2) UNLOCK operation implication:

Memory operations issued before the UNLOCK will be completed before the
UNLOCK operation has completed.

Memory operations issued after the UNLOCK may be completed before the
UNLOCK operation has completed.

2009-11-05 14:59:33

by Paul E. McKenney

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

On Thu, Nov 05, 2009 at 07:17:42AM -0500, William Allen Simpson wrote:
> Paul E. McKenney wrote:
>> On Tue, Nov 03, 2009 at 05:38:10PM -0500, William Allen Simpson wrote:
>>> Documentation/RCU/checklist.txt #7 says:
>>>
>>> One exception to this rule: rcu_read_lock() and rcu_read_unlock()
>>> may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
>>> in cases where local bottom halves are already known to be
>>> disabled, for example, in irq or softirq context. Commenting
>>> such cases is a must, of course! And the jury is still out on
>>> whether the increased speed is worth it.
>> I strongly suggest using the matching primitives unless you have a
>> really strong reason not to.
> Eric gave contrary advice. But he also suggested (in an earlier message)
> clearing the secrets with a timer, which could be a separate context --
> although much later in time.
>
> As you suggest, I'll use the _bh suffix everywhere until every i is dotted
> and t is crossed. Then, check for efficiency later after thorough
> analysis by experts such as yourself.
>
> This code will be hit on every SYN and SYNACK that has a cookie option.
> But it's just prior to a CPU intensive sha_transform -- in comparison,
> it's trivial.

Had Eric said that this code were performance-critical, where every
nanosecond mattered, that would certainly be good enough for me.
Eric has excellent knowledge of the networking code, certainly much
better than mine. And 10Gb Ethernet is certainly a performance
challenge, and I don't expect 40Gb Ethernet to be any easier.

Of course, I would still argue that the use of rcu_read_lock() rather
than rcu_read_unlock() needs to be commented. And if this sort of
substitution happens a lot, maybe we need a way for it to happen
automatically.

Thanx, Paul

>>> + rcu_assign_pointer(tcp_secret_generating,
>>> + tcp_secret_secondary);
>>> + rcu_assign_pointer(tcp_secret_retiring,
>>> + tcp_secret_primary);
>>> + spin_unlock_bh(&tcp_secret_locker);
>>> + /* call_rcu() or synchronize_rcu() not needed. */
>> Would you be willing to say why? Are you relying on a time delay for a
>> given item to pass through tcp_secret_secondary and tcp_secret_retiring
>> or some such? If so, how do you know that this time delay will always
>> be long enough?
>> Or are you just shuffling the data structures around, without ever
>> freeing them? If so, is it really OK for a given reader to keep a
>> reference to a given item through the full range of shuffling, especially
>> given that it might be accesssing this concurrently with the ->expires
>> assignments above?
>> Either way, could you please expand the comment to give at least some
>> hint to the poor guy reading your code? ;-)
> Yes. Just shuffling the pointers without ever freeing anything. So,
> there's nothing for call_rcu() to do, and nothing else to synchronize
> (only the pointers). This assumes that after _unlock_ any CPU cache
> with an old pointer->expires will hit the _lock_ code, and that will
> update *both* ->expires and the other array elements concurrently?
>
> One of the advantages of this scheme is the new secret is initialized
> while the old secret is still used, and the old secret can continue to
> be verified as old packets arrive. (I originally designed this for
> Photuris [RFC-2522] circa 1995.)
>
> As described in the long header given, each array element goes through
> four (4) states. This is handling the first state transition. It will
> hit at least 2 more locks, pointer updates, and unlocks before reuse.
>
> Also, a great deal of time passes. After being retired (and expired), it
> will be unused for approximately 5 minutes.
>
> All that's a bit long for a comment.
>
> + /*
> + * The retiring data is never freed. Instead, it is
> + * replaced after later pointer updates and a quiet
> + * time of approximately 5 minutes. There is nothing
> + * for call_rcu() or synchronize_rcu() to handle.
> + */
>
> Clear enough?

2009-11-05 19:44:39

by William Allen Simpson

[permalink] [raw]

Subject: Re: [net-next-2.6 PATCH RFC] TCPCT part 1d: generate Responder Cookie

Eric Dumazet wrote:
> William Allen Simpson a ?crit :
>> As you suggest, I'll use the _bh suffix everywhere until every i is dotted
>> and t is crossed. Then, check for efficiency later after thorough
>> analysis by experts such as yourself.
>>
>> This code will be hit on every SYN and SYNACK that has a cookie option.
>> But it's just prior to a CPU intensive sha_transform -- in comparison,
>> it's trivial.
>>
> I think you misunderstood my advice ;)
>
Yes, I misunderstood, but it's clear now. As I have to re-spin the whole
kit and kaboodle after the recent net-next crashing bug fixes, I'll post
another complete set tomorrow to the net list.

Thanks again (for all messages).