2022-09-02 10:28:33

by Yunhui Cui

Subject: [PATCH] bpf: added the account of BPF running time

The following high CPU usage problem is caused by BPF; we found it with
the perf tools:
  Children  Self    Shared Object  Symbol
  45.52%    45.51%  [kernel]       [k] native_queued_spin_lock_slowpath
  45.35%     0.10%  [kernel]       [k] _raw_spin_lock_irqsave
  45.29%     0.01%  [kernel]       [k] bpf_map_update_elem
  45.29%     0.02%  [kernel]       [k] htab_map_update_elem

The above problem was caught while a bpf_prog was executing, but we cannot
see the load that bpf_progs executed earlier put on the system, and we
cannot continuously monitor how much CPU time BPF consumes.

Currently the running time is only reported per bpf_prog, through the
/proc/$pid/fdinfo/$fd file. It is impossible to derive the CPU usage of
all bpf_progs from there, because we cannot know which processes hold the
prog fds, and a process may already have exited.
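
For reference, those per-program counters show up in the prog fd's fdinfo
(populated when the kernel.bpf_stats_enabled sysctl is set), roughly
(illustrative values):

run_time_ns:	81560
run_cnt:	300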

With BPF being used by more and more subsystems, the running bpf_progs taken
together may consume a significant amount of CPU. So we add a field to the
/proc/stat file to observe the CPU usage of BPF from a global perspective.
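
As an illustration, a cpu line in /proc/stat would then carry one extra field
at the end (values below are made up):

cpu  168487 2813 99715 2401374 4229 0 2884 0 0 0 1736

The last column is the accumulated BPF time in USER_HZ ticks; sampling it
twice gives the BPF share of CPU time as delta(bpf) / delta(sum of the time
fields). Note the bpf time overlaps the existing buckets (system, softirq,
...) rather than forming a new exclusive one.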

Signed-off-by: Yunhui Cui <[email protected]>
---
 Documentation/filesystems/proc.rst |  1 +
 fs/proc/stat.c                     | 25 +++++++++++++++++++++++--
 include/linux/filter.h             | 17 +++++++++++++++--
 kernel/bpf/core.c                  |  1 +
 4 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index e7aafc82be99..353f41c3e4eb 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1477,6 +1477,7 @@ second). The meanings of the columns are as follows, from left to right:
- steal: involuntary wait
- guest: running a normal guest
- guest_nice: running a niced guest
+- bpf: running in bpf_programs

The "intr" line gives counts of interrupts serviced since boot time, for each
of the possible system interrupts. The first column is the total of all
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 4fb8729a68d4..ff8ef959fb4f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -14,6 +14,8 @@
#include <linux/irqnr.h>
#include <linux/sched/cputime.h>
#include <linux/tick.h>
+#include <linux/filter.h>
+#include <linux/u64_stats_sync.h>

#ifndef arch_irq_stat_cpu
#define arch_irq_stat_cpu(cpu) 0
@@ -22,6 +24,20 @@
#define arch_irq_stat() 0
#endif

+DECLARE_PER_CPU(struct bpf_account, bpftime);
+
+static void get_bpf_time(u64 *ns, int cpu)
+{
+	unsigned int start = 0;
+	const struct bpf_account *bact;
+
+	bact = per_cpu_ptr(&bpftime, cpu);
+	do {
+		start = u64_stats_fetch_begin_irq(&bact->syncp);
+		*ns = u64_stats_read(&bact->nsecs);
+	} while (u64_stats_fetch_retry_irq(&bact->syncp, start));
+}
+
#ifdef arch_idle_time

u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
@@ -112,11 +128,12 @@ static int show_stat(struct seq_file *p, void *v)
	u64 guest, guest_nice;
	u64 sum = 0;
	u64 sum_softirq = 0;
+	u64 bpf_sum, bpf;
	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
	struct timespec64 boottime;

	user = nice = system = idle = iowait =
-		irq = softirq = steal = 0;
+		irq = softirq = steal = bpf = bpf_sum = 0;
	guest = guest_nice = 0;
	getboottime64(&boottime);
	/* shift boot timestamp according to the timens offset */
@@ -127,6 +144,7 @@ static int show_stat(struct seq_file *p, void *v)
		u64 *cpustat = kcpustat.cpustat;

		kcpustat_cpu_fetch(&kcpustat, i);
+		get_bpf_time(&bpf, i);

		user += cpustat[CPUTIME_USER];
		nice += cpustat[CPUTIME_NICE];
@@ -138,6 +156,7 @@ static int show_stat(struct seq_file *p, void *v)
		steal += cpustat[CPUTIME_STEAL];
		guest += cpustat[CPUTIME_GUEST];
		guest_nice += cpustat[CPUTIME_GUEST_NICE];
+		bpf_sum += bpf;
		sum += kstat_cpu_irqs_sum(i);
		sum += arch_irq_stat_cpu(i);

@@ -160,6 +179,7 @@ static int show_stat(struct seq_file *p, void *v)
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(bpf_sum));
	seq_putc(p, '\n');

	for_each_online_cpu(i) {
@@ -167,7 +187,7 @@ static int show_stat(struct seq_file *p, void *v)
		u64 *cpustat = kcpustat.cpustat;

		kcpustat_cpu_fetch(&kcpustat, i);
-
+		get_bpf_time(&bpf, i);
		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
		user = cpustat[CPUTIME_USER];
		nice = cpustat[CPUTIME_NICE];
@@ -190,6 +210,7 @@ static int show_stat(struct seq_file *p, void *v)
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(bpf));
		seq_putc(p, '\n');
	}
	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a5f21dc3c432..9cb072f9e32b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -565,6 +565,12 @@ struct sk_filter {
	struct bpf_prog *prog;
};

+struct bpf_account {
+	u64_stats_t nsecs;
+	struct u64_stats_sync syncp;
+};
+DECLARE_PER_CPU(struct bpf_account, bpftime);
+
DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
@@ -577,12 +583,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
					  bpf_dispatcher_fn dfunc)
{
	u32 ret;
+	struct bpf_account *bact;
+	unsigned long flags;
+	u64 start = 0;

	cant_migrate();
+	start = sched_clock();
	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
		struct bpf_prog_stats *stats;
-		u64 start = sched_clock();
-		unsigned long flags;

		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
		stats = this_cpu_ptr(prog->stats);
@@ -593,6 +601,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
	} else {
		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
	}
+	bact = this_cpu_ptr(&bpftime);
+	flags = u64_stats_update_begin_irqsave(&bact->syncp);
+	u64_stats_add(&bact->nsecs, sched_clock() - start);
+	u64_stats_update_end_irqrestore(&bact->syncp, flags);
+
	return ret;
}

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1e10d088dbb..445ac1c6c01a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -618,6 +618,7 @@ static const struct latch_tree_ops bpf_tree_ops = {
	.comp = bpf_tree_comp,
};

+DEFINE_PER_CPU(struct bpf_account, bpftime);
static DEFINE_SPINLOCK(bpf_lock);
static LIST_HEAD(bpf_kallsyms);
static struct latch_tree_root bpf_tree __cacheline_aligned;
--
2.20.1


2022-09-02 15:57:11

by Daniel Borkmann

Subject: Re: [PATCH] bpf: added the account of BPF running time

On 9/2/22 12:12 PM, Yunhui Cui wrote:
[...]
> index a5f21dc3c432..9cb072f9e32b 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -565,6 +565,12 @@ struct sk_filter {
> struct bpf_prog *prog;
> };
>
> +struct bpf_account {
> + u64_stats_t nsecs;
> + struct u64_stats_sync syncp;
> +};
> +DECLARE_PER_CPU(struct bpf_account, bpftime);
> +
> DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>
> typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
> @@ -577,12 +583,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> bpf_dispatcher_fn dfunc)
> {
> u32 ret;
> + struct bpf_account *bact;
> + unsigned long flags;
> + u64 start = 0;
>
> cant_migrate();
> + start = sched_clock();
> if (static_branch_unlikely(&bpf_stats_enabled_key)) {
> struct bpf_prog_stats *stats;
> - u64 start = sched_clock();
> - unsigned long flags;
>
> ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> stats = this_cpu_ptr(prog->stats);
> @@ -593,6 +601,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> } else {
> ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> }
> + bact = this_cpu_ptr(&bpftime);
> + flags = u64_stats_update_begin_irqsave(&bact->syncp);
> + u64_stats_add(&bact->nsecs, sched_clock() - start);
> + u64_stats_update_end_irqrestore(&bact->syncp, flags);
> +
> return ret;

The overhead this adds unconditionally is a no-go. Have you tried using/improving:

commit 47c09d6a9f6794caface4ad50930460b82d7c670
Author: Song Liu <[email protected]>
Date: Mon Mar 9 10:32:15 2020 -0700

bpftool: Introduce "prog profile" command

With fentry/fexit programs, it is possible to profile BPF program with
hardware counters. Introduce bpftool "prog profile", which measures key
metrics of a BPF program.

bpftool prog profile command creates per-cpu perf events. Then it attaches
fentry/fexit programs to the target BPF program. The fentry program saves
perf event value to a map. The fexit program reads the perf event again,
and calculates the difference, which is the instructions/cycles used by
the target program.

Example input and output:

./bpftool prog profile id 337 duration 3 cycles instructions llc_misses

4228 run_cnt
3403698 cycles (84.08%)
3525294 instructions # 1.04 insn per cycle (84.05%)
13 llc_misses # 3.69 LLC misses per million isns (83.50%)

This command measures cycles and instructions for BPF program with id
337 for 3 seconds. The program has triggered 4228 times. The rest of the
output is similar to perf-stat. [...]
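
For comparison, with the existing kernel.bpf_stats_enabled sysctl the
per-program run time can already be summed from user space without touching
/proc/stat. A minimal libbpf sketch (illustrative only, error handling
trimmed; it yields totals for currently loaded progs, not a per-CPU
breakdown):

#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>	/* bpf_prog_get_next_id(), bpf_obj_get_info_by_fd() */

int main(void)
{
	__u32 id = 0, len;
	unsigned long long total_ns = 0;

	/* walk every loaded prog and add up its run_time_ns counter */
	while (!bpf_prog_get_next_id(id, &id)) {
		struct bpf_prog_info info = {};
		int fd = bpf_prog_get_fd_by_id(id);

		if (fd < 0)
			continue;
		len = sizeof(info);
		if (!bpf_obj_get_info_by_fd(fd, &info, &len))
			total_ns += info.run_time_ns;
		close(fd);
	}
	printf("total bpf run time: %llu ns\n", total_ns);
	return 0;
}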

Thanks,
Daniel

2022-09-02 20:18:16

by kernel test robot

Subject: Re: [PATCH] bpf: added the account of BPF running time

Hi Yunhui,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]
[also build test ERROR on bpf/master linus/master v6.0-rc3 next-20220901]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Yunhui-Cui/bpf-added-the-account-of-BPF-running-time/20220902-181437
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: um-x86_64_defconfig (https://download.01.org/0day-ci/archive/20220903/[email protected]/config)
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/e31420772f2a63d6bc561211b8ef0331d41b2123
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yunhui-Cui/bpf-added-the-account-of-BPF-running-time/20220902-181437
git checkout e31420772f2a63d6bc561211b8ef0331d41b2123
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=um SUBARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

/usr/bin/ld: warning: arch/x86/um/vdso/vdso.o: missing .note.GNU-stack section implies executable stack
/usr/bin/ld: NOTE: This behaviour is deprecated and will be removed in a future version of the linker
/usr/bin/ld: warning: .tmp_vmlinux.kallsyms1 has a LOAD segment with RWX permissions
/usr/bin/ld: kernel/seccomp.o: in function `seccomp_run_filters':
seccomp.c:(.text+0xaa7): undefined reference to `bpftime'
/usr/bin/ld: fs/proc/stat.o: in function `show_stat':
stat.c:(.text+0x124): undefined reference to `bpftime'
/usr/bin/ld: stat.c:(.text+0x332): undefined reference to `bpftime'
/usr/bin/ld: net/core/flow_dissector.o: in function `bpf_flow_dissect':
>> flow_dissector.c:(.text+0x9f5): undefined reference to `bpftime'
/usr/bin/ld: net/core/dev.o: in function `bpf_prog_run_generic_xdp':
>> dev.c:(.text+0x8c43): undefined reference to `bpftime'
/usr/bin/ld: net/core/ptp_classifier.o:ptp_classifier.c:(.text+0xf8): more undefined references to `bpftime' follow
collect2: error: ld returned 1 exit status

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-10-12 11:02:41

by Yunhui Cui

Subject: Re: [External] Re: [PATCH] bpf: added the account of BPF running time

Daniel Borkmann <[email protected]> wrote on Fri, Sep 2, 2022 at 23:38:
>
> On 9/2/22 12:12 PM, Yunhui Cui wrote:
> [...]
> > index a5f21dc3c432..9cb072f9e32b 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -565,6 +565,12 @@ struct sk_filter {
> > struct bpf_prog *prog;
> > };
> >

> > ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > stats = this_cpu_ptr(prog->stats);
> > @@ -593,6 +601,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> > } else {
> > ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > }
> > + bact = this_cpu_ptr(&bpftime);
> > + flags = u64_stats_update_begin_irqsave(&bact->syncp);
> > + u64_stats_add(&bact->nsecs, sched_clock() - start);
> > + u64_stats_update_end_irqrestore(&bact->syncp, flags);
> > +
> > return ret;
>
> The overhead this adds unconditionally is no-go. Have you tried using/improving:

Commit 47c09d6a9f6794caface4ad50930460b82d7c670 cannot meet the requirement
of directly seeing the cumulative running time of all progs on each CPU.

Regarding the overhead, how about moving the above code into the
if (static_branch_unlikely(&bpf_stats_enabled_key)) {} branch, like the
prog->stats processing?
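
Something like the following sketch, so that both the extra clock read and
the per-cpu update stay behind the static key (illustrative only):

	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
		struct bpf_prog_stats *stats;
		struct bpf_account *bact;
		u64 end, start = sched_clock();
		unsigned long flags;

		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
		end = sched_clock();

		stats = this_cpu_ptr(prog->stats);
		flags = u64_stats_update_begin_irqsave(&stats->syncp);
		u64_stats_inc(&stats->cnt);
		u64_stats_add(&stats->nsecs, end - start);
		u64_stats_update_end_irqrestore(&stats->syncp, flags);

		/* charge the same delta to the global per-cpu counter */
		bact = this_cpu_ptr(&bpftime);
		flags = u64_stats_update_begin_irqsave(&bact->syncp);
		u64_stats_add(&bact->nsecs, end - start);
		u64_stats_update_end_irqrestore(&bact->syncp, flags);
	} else {
		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
	}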

Yunhui