From: Eric Dumazet
Date: Thu, 11 Jan 2018 22:22:58 -0800
Subject: Re: [RFC PATCH 1/2] softirq: Account time and iteration stats per vector
To: Frederic Weisbecker
Cc: LKML, Levin Alexander, Peter Zijlstra, Linus Torvalds,
    Hannes Frederic Sowa, "Paul E. McKenney", Wanpeng Li, Dmitry Safonov,
    Thomas Gleixner, Radu Rendec, Ingo Molnar, Stanislaw Gruszka,
    Paolo Abeni, Rik van Riel, Andrew Morton, David Miller
In-Reply-To: <1515735354-19279-2-git-send-email-frederic@kernel.org>
References: <1515735354-19279-1-git-send-email-frederic@kernel.org>
    <1515735354-19279-2-git-send-email-frederic@kernel.org>

On Thu, Jan 11, 2018 at 9:35 PM, Frederic Weisbecker wrote:
> As we plan to be able to defer some specific softirq vector processing
> to workqueues when those vectors need more time than IRQs can offer,
> let's first count the time spent and the number of occurrences per vector.
>
> For now we still defer to ksoftirqd when the per vector limits are reached.
>
> Suggested-by: Linus Torvalds
> Signed-off-by: Frederic Weisbecker
> Cc: Dmitry Safonov
> Cc: Eric Dumazet
> Cc: Linus Torvalds
> Cc: Peter Zijlstra
> Cc: Andrew Morton
> Cc: David Miller
> Cc: Hannes Frederic Sowa
> Cc: Ingo Molnar
> Cc: Levin Alexander
> Cc: Paolo Abeni
> Cc: Paul E. McKenney
> Cc: Radu Rendec
> Cc: Rik van Riel
> Cc: Stanislaw Gruszka
> Cc: Thomas Gleixner
> Cc: Wanpeng Li
> ---
>  kernel/softirq.c | 37 +++++++++++++++++++++++++++++--------
>  1 file changed, 29 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 2f5e87f..fa267f7 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -26,6 +26,7 @@
>  #include <linux/smpboot.h>
>  #include <linux/tick.h>
>  #include <linux/irq.h>
> +#include <linux/sched/clock.h>
>
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/irq.h>
> @@ -62,6 +63,17 @@ const char * const softirq_to_name[NR_SOFTIRQS] = {
>         "TASKLET", "SCHED", "HRTIMER", "RCU"
>  };
>
> +struct vector_stat {
> +       u64 time;
> +       int count;
> +};
> +
> +struct softirq_stat {
> +       struct vector_stat stat[NR_SOFTIRQS];
> +};
> +
> +static DEFINE_PER_CPU(struct softirq_stat, softirq_stat_cpu);
> +
>  /*
>   * we cannot loop indefinitely here to avoid userspace starvation,
>   * but we also don't want to introduce a worst case 1/HZ latency
> @@ -203,7 +215,7 @@ EXPORT_SYMBOL(__local_bh_enable_ip);
>   * we want to handle softirqs as soon as possible, but they
>   * should not be able to lock up the box.
>   */
> -#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
> +#define MAX_SOFTIRQ_TIME  (2 * NSEC_PER_MSEC)
>  #define MAX_SOFTIRQ_RESTART 10
>
>  #ifdef CONFIG_TRACE_IRQFLAGS
> @@ -241,12 +253,11 @@ static inline void lockdep_softirq_end(bool in_hardirq) { }
>
>  asmlinkage __visible void __softirq_entry __do_softirq(void)
>  {
> -       unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
> +       struct softirq_stat *sstat = this_cpu_ptr(&softirq_stat_cpu);
>         unsigned long old_flags = current->flags;
> -       int max_restart = MAX_SOFTIRQ_RESTART;
>         struct softirq_action *h;
>         bool in_hardirq;
> -       __u32 pending;
> +       __u32 pending, overrun = 0;
>         int softirq_bit;
>
>         /*
> @@ -262,6 +273,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
>         __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
>         in_hardirq = lockdep_softirq_start();
>
> +       memzero_explicit(sstat, sizeof(*sstat));

If you clear sstat here, it means it does not need to be a per-cpu
variable, but an automatic one (defined on the stack).

I presume we need a per-cpu var to track cpu usage over the last time
window (a typical case being 99,000 IRQs per second, one packet
delivered per IRQ, 10 usec spent per packet).

>  restart:
>         /* Reset the pending bitmask before enabling irqs */
>         set_softirq_pending(0);
> @@ -271,8 +283,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
>         h = softirq_vec;
>
>         while ((softirq_bit = ffs(pending))) {
> +               struct vector_stat *vstat;
>                 unsigned int vec_nr;
>                 int prev_count;
> +               u64 startime;
>
>                 h += softirq_bit - 1;
>
> @@ -280,10 +294,18 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
>                 prev_count = preempt_count();
>
>                 kstat_incr_softirqs_this_cpu(vec_nr);
> +               vstat = &sstat->stat[vec_nr];
>
>                 trace_softirq_entry(vec_nr);
> +               startime = local_clock();
>                 h->action(h);
> +               vstat->time += local_clock() - startime;

You might store local_clock() in a variable, so that we do not call
local_clock() twice per ->action() call.
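Something along these lines (a completely untested sketch; now/prev are
just illustrative names). It reuses the end timestamp of one handler as
the start timestamp of the next, so there is a single clock read per
->action(), at the small cost of charging the inter-handler bookkeeping
to the vector:

        u64 now = local_clock();

        while ((softirq_bit = ffs(pending))) {
                u64 prev = now;
                ...
                trace_softirq_entry(vec_nr);
                h->action(h);
                now = local_clock();    /* one clock read per handler */
                vstat->time += now - prev;
                vstat->count++;
                trace_softirq_exit(vec_nr);
                ...
        }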
> +               vstat->count++;
>                 trace_softirq_exit(vec_nr);
> +
> +               if (vstat->time > MAX_SOFTIRQ_TIME || vstat->count > MAX_SOFTIRQ_RESTART)

If we trust local_clock() to be precise enough, we do not need to
track vstat->count anymore (see the note at the bottom).

> +                       overrun |= 1 << vec_nr;
> +
>                 if (unlikely(prev_count != preempt_count())) {
>                         pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
>                                vec_nr, softirq_to_name[vec_nr], h->action,
> @@ -299,11 +321,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
>
>         pending = local_softirq_pending();
>         if (pending) {
> -               if (time_before(jiffies, end) && !need_resched() &&
> -                   --max_restart)
> +               if (overrun || need_resched())
> +                       wakeup_softirqd();
> +               else
>                         goto restart;
> -
> -               wakeup_softirqd();
>         }
>
>         lockdep_softirq_end(in_hardirq);
> --
> 2.7.4
>
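One more note on the vstat->count point above: once the count is gone,
the overrun check reduces to a pure time comparison, something like
(again untested):

                if (vstat->time > MAX_SOFTIRQ_TIME)
                        overrun |= 1 << vec_nr;

and struct vector_stat collapses to a single u64, so sstat->stat[]
could become a plain u64 array indexed by vector.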