2022-08-29 12:18:36

by Richard Gobert

[permalink] [raw]
Subject: [PATCH 4/4] net-next: frags: dynamic timeout under load

Calculate a dynamic fragment reassembly timeout, taking into
consideration the current fqdir load and the load introduced by
the peer. Reintroduce low_thresh, which now acts as a knob for
adjusting per-peer memory limits.

Signed-off-by: Richard Gobert <[email protected]>
---
Documentation/networking/ip-sysctl.rst | 3 +++
include/net/inet_frag.h | 1 +
net/ipv4/inet_fragment.c | 30 +++++++++++++++++++++++++-
net/ipv4/ip_fragment.c | 2 +-
4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 56cd4ea059b2..fb25aa6e22a2 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -247,6 +247,9 @@ ipfrag_low_thresh - LONG INTEGER
begins to remove incomplete fragment queues to free up resources.
The kernel still accepts new fragments for defragmentation.

+ (Since linux-6.1)
+ Maximum memory used to reassemble IP fragments sent by a single peer.
+
ipfrag_time - INTEGER
Time in seconds to keep an IP fragment in memory.

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 077a0ec78a58..595a6db57a0e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -99,6 +99,7 @@ struct inet_frag_queue {
u16 max_size;
struct fqdir *fqdir;
struct inet_peer *peer;
+ u64 timeout;
struct rcu_head rcu;
};

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 8b8d77d548d4..34c5ebba4951 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -314,6 +314,30 @@ void inet_frag_free(struct inet_frag_queue *q)
call_rcu(&q->rcu, inet_frag_destroy_rcu);
}

+static int inet_frag_update_timeout(struct inet_frag_queue *q)
+{
+ u64 peer_timeout, inet_timeout;
+ long peer_mem, inet_mem;
+ long high_thresh = READ_ONCE(q->fqdir->high_thresh);
+ long low_thresh = READ_ONCE(q->fqdir->low_thresh);
+ u64 base_timeout = READ_ONCE(q->fqdir->timeout);
+
+ peer_mem = low_thresh - peer_mem_limit(q);
+ inet_mem = high_thresh - frag_mem_limit(q->fqdir);
+
+ if (peer_mem <= 0 || inet_mem <= 0)
+ return -ENOMEM;
+
+ /* Timeout changes linearly with respect to the amount of free memory.
+ * Choose the more permissive of the two timeouts, to avoid limiting
+ * the system while there is still enough memory.
+ */
+ peer_timeout = div64_long(base_timeout * peer_mem, low_thresh);
+ inet_timeout = div64_long(base_timeout * inet_mem, high_thresh);
+ q->timeout = max_t(u64, peer_timeout, inet_timeout);
+ return 0;
+}
+
void inet_frag_destroy(struct inet_frag_queue *q)
{
struct fqdir *fqdir;
@@ -346,6 +370,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,

q->fqdir = fqdir;
f->constructor(q, arg);
+ if (inet_frag_update_timeout(q)) {
+ inet_frag_free(q);
+ return NULL;
+ }
add_frag_mem_limit(q, f->qsize);

timer_setup(&q->timer, f->frag_expire, 0);
@@ -367,7 +395,7 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
*prev = ERR_PTR(-ENOMEM);
return NULL;
}
- mod_timer(&q->timer, jiffies + fqdir->timeout);
+ mod_timer(&q->timer, jiffies + q->timeout);

*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
&q->node, f->rhash_params);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e35061f6aadb..88a99242d721 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -236,7 +236,7 @@ static int ip_frag_reinit(struct ipq *qp)
{
unsigned int sum_truesize = 0;

- if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.timeout)) {
refcount_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
--
2.36.1


2022-08-29 18:28:34

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 4/4] net-next: frags: dynamic timeout under load

On Mon, Aug 29, 2022 at 4:49 AM Richard Gobert <[email protected]> wrote:
>
> Calculate a dynamic fragment reassembly timeout, taking into
> consideration the current fqdir load and the load introduced by
> the peer. Reintroduce low_thresh, which now acts as a knob for
> adjusting per-peer memory limits.
>
> Signed-off-by: Richard Gobert <[email protected]>
> ---
> Documentation/networking/ip-sysctl.rst | 3 +++
> include/net/inet_frag.h | 1 +
> net/ipv4/inet_fragment.c | 30 +++++++++++++++++++++++++-
> net/ipv4/ip_fragment.c | 2 +-
> 4 files changed, 34 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 56cd4ea059b2..fb25aa6e22a2 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -247,6 +247,9 @@ ipfrag_low_thresh - LONG INTEGER
> begins to remove incomplete fragment queues to free up resources.
> The kernel still accepts new fragments for defragmentation.
>
> + (Since linux-6.1)
> + Maximum memory used to reassemble IP fragments sent by a single peer.
> +
> ipfrag_time - INTEGER
> Time in seconds to keep an IP fragment in memory.
>
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index 077a0ec78a58..595a6db57a0e 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -99,6 +99,7 @@ struct inet_frag_queue {
> u16 max_size;
> struct fqdir *fqdir;
> struct inet_peer *peer;
> + u64 timeout;

Why u64?

This is not what the timer interface uses (look at mod_timer(): it
takes an "unsigned long").

> struct rcu_head rcu;
> };
>
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 8b8d77d548d4..34c5ebba4951 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -314,6 +314,30 @@ void inet_frag_free(struct inet_frag_queue *q)
> call_rcu(&q->rcu, inet_frag_destroy_rcu);
> }
>
> +static int inet_frag_update_timeout(struct inet_frag_queue *q)
> +{
> + u64 peer_timeout, inet_timeout;
> + long peer_mem, inet_mem;
> + long high_thresh = READ_ONCE(q->fqdir->high_thresh);
> + long low_thresh = READ_ONCE(q->fqdir->low_thresh);
> + u64 base_timeout = READ_ONCE(q->fqdir->timeout);
> +
> + peer_mem = low_thresh - peer_mem_limit(q);
> + inet_mem = high_thresh - frag_mem_limit(q->fqdir);
> +
> + if (peer_mem <= 0 || inet_mem <= 0)
> + return -ENOMEM;
> +
> + /* Timeout changes linearly with respect to the amount of free memory.
> + * Choose the more permissive of the two timeouts, to avoid limiting
> + * the system while there is still enough memory.
> + */
> + peer_timeout = div64_long(base_timeout * peer_mem, low_thresh);
> + inet_timeout = div64_long(base_timeout * inet_mem, high_thresh);
> + q->timeout = max_t(u64, peer_timeout, inet_timeout);

If/when under load, the timeout is close to zero,
we would fire many timers (increased system load) and make it impossible
for datagrams to complete.

In contrast, a reasonable timer and probabilistic drops of new datagrams
when the queue is full let some datagrams complete.

Make sure to test your change under a real DDoS, not only with non-malicious netperf.

> + return 0;
> +}
> +
> void inet_frag_destroy(struct inet_frag_queue *q)
> {
> struct fqdir *fqdir;
> @@ -346,6 +370,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
>
> q->fqdir = fqdir;
> f->constructor(q, arg);
> + if (inet_frag_update_timeout(q)) {
> + inet_frag_free(q);
> + return NULL;
> + }
> add_frag_mem_limit(q, f->qsize);
>
> timer_setup(&q->timer, f->frag_expire, 0);
> @@ -367,7 +395,7 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
> *prev = ERR_PTR(-ENOMEM);
> return NULL;
> }
> - mod_timer(&q->timer, jiffies + fqdir->timeout);
> + mod_timer(&q->timer, jiffies + q->timeout);
>
> *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
> &q->node, f->rhash_params);
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index e35061f6aadb..88a99242d721 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -236,7 +236,7 @@ static int ip_frag_reinit(struct ipq *qp)
> {
> unsigned int sum_truesize = 0;
>
> - if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
> + if (!mod_timer(&qp->q.timer, jiffies + qp->q.timeout)) {
> refcount_inc(&qp->q.refcnt);
> return -ETIMEDOUT;
> }
> --
> 2.36.1
>