2021-10-06 17:53:19

by Marcelo Tosatti

[permalink] [raw]
Subject: [PATCH bpf-next] bpf: introduce helper bpf_raw_read_cpu_clock



Add bpf_raw_read_cpu_clock helper, to read architecture specific
CPU clock. In x86's case, this is the TSC.

This is necessary to synchronize bpf traces from host and guest bpf-programs
(after subtracting guest tsc-offset from guest timestamps).

Signed-off-by: Marcelo Tosatti <[email protected]>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab83c22d274e..832bb1f65f28 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,6 +95,7 @@ config X86
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
+ select ARCH_HAS_BPF_RAW_CPU_CLOCK
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/x86/include/asm/bpf_raw_cpu_clock.h b/arch/x86/include/asm/bpf_raw_cpu_clock.h
new file mode 100644
index 000000000000..6951c399819e
--- /dev/null
+++ b/arch/x86/include/asm/bpf_raw_cpu_clock.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BPF_RAW_CPU_CLOCK_H_
+#define _ASM_X86_BPF_RAW_CPU_CLOCK_H_
+
+static inline unsigned long long read_raw_cpu_clock(void)
+{
+ return rdtsc_ordered();
+}
+
+#endif /* _ASM_X86_BPF_RAW_CPU_CLOCK_H_ */
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 3eff08d7b8e5..844a44ff508d 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -105,6 +105,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_read_raw_cpu_clock:
+ return &bpf_read_raw_cpu_clock_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_prandom_u32:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d604c8251d88..b6cb426085fb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,6 +2058,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
+extern const struct bpf_func_proto bpf_read_raw_cpu_clock_proto;
extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6fc59d61937a..52191791b089 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4037,6 +4037,13 @@ union bpf_attr {
* Return
* Current *ktime*.
*
+ * u64 bpf_read_raw_cpu_clock(void)
+ * Description
+ * Return the architecture specific CPU clock value.
+ * For x86, this is the TSC clock.
+ * Return
+ * *CPU clock value*
+ *
* long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
@@ -5089,6 +5096,7 @@ union bpf_attr {
FN(task_pt_regs), \
FN(get_branch_snapshot), \
FN(trace_vprintk), \
+ FN(read_raw_cpu_clock), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index a82d6de86522..5815db157220 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -21,6 +21,10 @@ config HAVE_EBPF_JIT
config ARCH_WANT_DEFAULT_BPF_JIT
bool

+# Used by archs to tell they support reading raw CPU clock
+config ARCH_HAS_BPF_RAW_CPU_CLOCK
+ bool
+
menu "BPF subsystem"

config BPF_SYSCALL
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b6c72af64d5d..8e2359dfd582 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2345,6 +2345,8 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_read_raw_cpu_clock_proto __weak;
+

const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ffd469c217f..90b9e5efaf65 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,10 @@

#include "../../lib/kstrtox.h"

+#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
+#include <asm/bpf_raw_cpu_clock.h>
+#endif
+
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
* bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
@@ -168,6 +172,21 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
.ret_type = RET_INTEGER,
};

+BPF_CALL_0(bpf_read_raw_cpu_clock)
+{
+#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
+ return read_raw_cpu_clock();
+#else
+ return sched_clock();
+#endif
+}
+
+const struct bpf_func_proto bpf_read_raw_cpu_clock_proto = {
+ .func = bpf_read_raw_cpu_clock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_ktime_get_coarse_ns)
{
return ktime_get_coarse_ns();
@@ -1366,6 +1385,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_read_raw_cpu_clock:
+ return &bpf_read_raw_cpu_clock_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6b3153841a33..047ca7c1d57a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1113,6 +1113,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_read_raw_cpu_clock:
+ return &bpf_read_raw_cpu_clock_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6fc59d61937a..52191791b089 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4037,6 +4037,13 @@ union bpf_attr {
* Return
* Current *ktime*.
*
+ * u64 bpf_read_raw_cpu_clock(void)
+ * Description
+ * Return the architecture specific CPU clock value.
+ * For x86, this is the TSC clock.
+ * Return
+ * *CPU clock value*
+ *
* long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
@@ -5089,6 +5096,7 @@ union bpf_attr {
FN(task_pt_regs), \
FN(get_branch_snapshot), \
FN(trace_vprintk), \
+ FN(read_raw_cpu_clock), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper


2021-10-06 21:41:42

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH bpf-next] bpf: introduce helper bpf_raw_read_cpu_clock

On Wed, Oct 6, 2021 at 10:52 AM Marcelo Tosatti <[email protected]> wrote:
>
>
>
> Add bpf_raw_read_cpu_clock helper, to read architecture specific
> CPU clock. In x86's case, this is the TSC.
>
> This is necessary to synchronize bpf traces from host and guest bpf-programs
> (after subtracting guest tsc-offset from guest timestamps).

Trying to understand the use case. So in a host-guest scenario,
bpf_ktime_get_ns()
will return different values in host and guest, but rdtsc() will give
the same value.
Is this correct?

>
> Signed-off-by: Marcelo Tosatti <[email protected]>
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index ab83c22d274e..832bb1f65f28 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -95,6 +95,7 @@ config X86
> select ARCH_HAS_UBSAN_SANITIZE_ALL
> select ARCH_HAS_DEBUG_WX
> select ARCH_HAS_ZONE_DMA_SET if EXPERT
> + select ARCH_HAS_BPF_RAW_CPU_CLOCK
> select ARCH_HAVE_NMI_SAFE_CMPXCHG
> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
> select ARCH_MIGHT_HAVE_PC_PARPORT
> diff --git a/arch/x86/include/asm/bpf_raw_cpu_clock.h b/arch/x86/include/asm/bpf_raw_cpu_clock.h
> new file mode 100644
> index 000000000000..6951c399819e
> --- /dev/null
> +++ b/arch/x86/include/asm/bpf_raw_cpu_clock.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_BPF_RAW_CPU_CLOCK_H_
> +#define _ASM_X86_BPF_RAW_CPU_CLOCK_H_
> +
> +static inline unsigned long long read_raw_cpu_clock(void)
> +{
> + return rdtsc_ordered();
> +}
> +
> +#endif /* _ASM_X86_BPF_RAW_CPU_CLOCK_H_ */
> diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
> index 3eff08d7b8e5..844a44ff508d 100644
> --- a/drivers/media/rc/bpf-lirc.c
> +++ b/drivers/media/rc/bpf-lirc.c
> @@ -105,6 +105,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_ktime_get_ns_proto;
> case BPF_FUNC_ktime_get_boot_ns:
> return &bpf_ktime_get_boot_ns_proto;
> + case BPF_FUNC_read_raw_cpu_clock:
> + return &bpf_read_raw_cpu_clock_proto;
> case BPF_FUNC_tail_call:
> return &bpf_tail_call_proto;
> case BPF_FUNC_get_prandom_u32:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index d604c8251d88..b6cb426085fb 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2058,6 +2058,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
> extern const struct bpf_func_proto bpf_tail_call_proto;
> extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
> extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
> +extern const struct bpf_func_proto bpf_read_raw_cpu_clock_proto;
> extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
> extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
> extern const struct bpf_func_proto bpf_get_current_comm_proto;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 6fc59d61937a..52191791b089 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4037,6 +4037,13 @@ union bpf_attr {
> * Return
> * Current *ktime*.
> *
> + * u64 bpf_read_raw_cpu_clock(void)
> + * Description
> + * Return the architecture specific CPU clock value.
> + * For x86, this is the TSC clock.
> + * Return
> + * *CPU clock value*
> + *
> * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
> * Description
> * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
> @@ -5089,6 +5096,7 @@ union bpf_attr {
> FN(task_pt_regs), \
> FN(get_branch_snapshot), \
> FN(trace_vprintk), \
> + FN(read_raw_cpu_clock), \
> /* */
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
> index a82d6de86522..5815db157220 100644
> --- a/kernel/bpf/Kconfig
> +++ b/kernel/bpf/Kconfig
> @@ -21,6 +21,10 @@ config HAVE_EBPF_JIT
> config ARCH_WANT_DEFAULT_BPF_JIT
> bool
>
> +# Used by archs to tell they support reading raw CPU clock
> +config ARCH_HAS_BPF_RAW_CPU_CLOCK
> + bool
> +
> menu "BPF subsystem"
>
> config BPF_SYSCALL
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index b6c72af64d5d..8e2359dfd582 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2345,6 +2345,8 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
> const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
> const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
> const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
> +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto __weak;
> +
>
> const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
> const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 1ffd469c217f..90b9e5efaf65 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -18,6 +18,10 @@
>
> #include "../../lib/kstrtox.h"
>
> +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
> +#include <asm/bpf_raw_cpu_clock.h>
> +#endif
> +
> /* If kernel subsystem is allowing eBPF programs to call this function,
> * inside its own verifier_ops->get_func_proto() callback it should return
> * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
> @@ -168,6 +172,21 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
> .ret_type = RET_INTEGER,
> };
>
> +BPF_CALL_0(bpf_read_raw_cpu_clock)
> +{
> +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
> + return read_raw_cpu_clock();
> +#else
> + return sched_clock();
> +#endif
> +}
> +
> +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto = {
> + .func = bpf_read_raw_cpu_clock,
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> +};
> +
> BPF_CALL_0(bpf_ktime_get_coarse_ns)
> {
> return ktime_get_coarse_ns();
> @@ -1366,6 +1385,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
> return &bpf_ktime_get_boot_ns_proto;
> case BPF_FUNC_ktime_get_coarse_ns:
> return &bpf_ktime_get_coarse_ns_proto;
> + case BPF_FUNC_read_raw_cpu_clock:
> + return &bpf_read_raw_cpu_clock_proto;
> case BPF_FUNC_ringbuf_output:
> return &bpf_ringbuf_output_proto;
> case BPF_FUNC_ringbuf_reserve:
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 6b3153841a33..047ca7c1d57a 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1113,6 +1113,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_ktime_get_boot_ns_proto;
> case BPF_FUNC_ktime_get_coarse_ns:
> return &bpf_ktime_get_coarse_ns_proto;
> + case BPF_FUNC_read_raw_cpu_clock:
> + return &bpf_read_raw_cpu_clock_proto;

With the change in bpf_base_func_proto, this part is not needed.

> case BPF_FUNC_tail_call:
> return &bpf_tail_call_proto;
> case BPF_FUNC_get_current_pid_tgid:
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 6fc59d61937a..52191791b089 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -4037,6 +4037,13 @@ union bpf_attr {
> * Return
> * Current *ktime*.
> *
> + * u64 bpf_read_raw_cpu_clock(void)
> + * Description
> + * Return the architecture specific CPU clock value.
> + * For x86, this is the TSC clock.
> + * Return
> + * *CPU clock value*
> + *
> * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
> * Description
> * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
> @@ -5089,6 +5096,7 @@ union bpf_attr {
> FN(task_pt_regs), \
> FN(get_branch_snapshot), \
> FN(trace_vprintk), \
> + FN(read_raw_cpu_clock), \
> /* */
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>

2021-10-07 07:21:10

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH bpf-next] bpf: introduce helper bpf_raw_read_cpu_clock

On Wed, Oct 06, 2021 at 02:37:09PM -0700, Song Liu wrote:
> On Wed, Oct 6, 2021 at 10:52 AM Marcelo Tosatti <[email protected]> wrote:
> >
> >
> >
> > Add bpf_raw_read_cpu_clock helper, to read architecture specific
> > CPU clock. In x86's case, this is the TSC.
> >
> > This is necessary to synchronize bpf traces from host and guest bpf-programs
> > (after subtracting guest tsc-offset from guest timestamps).
>
> Trying to understand the use case. So in a host-guest scenario,
> bpf_ktime_get_ns()
> will return different values in host and guest, but rdtsc() will give
> the same value.
> Is this correct?

No, it will not. Also, please explain if any of this stands a chance of
working for anything other than x86. Or even on x86 in the face of
guest migration.

Also, please explain, again, what's wrong with dumping snapshots of
CLOCK_MONOTONIC{,_RAW} from host and guest and correlating time that
way?

And also explain why BPF needs to do this differently than all the other
tracers.

2021-10-07 09:53:34

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH bpf-next] bpf: introduce helper bpf_raw_read_cpu_clock

Hi Peter, Song,

On Thu, Oct 07, 2021 at 09:18:56AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 06, 2021 at 02:37:09PM -0700, Song Liu wrote:
> > On Wed, Oct 6, 2021 at 10:52 AM Marcelo Tosatti <[email protected]> wrote:
> > >
> > >
> > >
> > > Add bpf_raw_read_cpu_clock helper, to read architecture specific
> > > CPU clock. In x86's case, this is the TSC.
> > >
> > > This is necessary to synchronize bpf traces from host and guest bpf-programs
> > > (after subtracting guest tsc-offset from guest timestamps).
> >
> > Trying to understand the use case. So in a host-guest scenario,
> > bpf_ktime_get_ns()
> > will return different values in host and guest, but rdtsc() will give
> > the same value.
> > Is this correct?
>
> No, it will not.

No, but we can find out the delta between host and guest TSCs.

On x86, you can read the offset through debugfs file:

debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu,
&vcpu_tsc_offset_fops);

Other architectures can expose that offset.

> Also, please explain if any of this stands a chance of
> working for anything other than x86.

Yes, the same pattern repeats

ARM:

With offset between guest and host:
https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/CNTVCT-EL0--Counter-timer-Virtual-Count-register?lang=en

Without offset:
commit 051ff581ce70e822729e9474941f3c206cbf7436

PPC:
https://yhbt.net/lore/all/[email protected]/T/

(Time Base Register is read through mftb instruction).

> Or even on x86 in the face of
> guest migration.

It won't, but honestly we don't care about tracing at this level across
migration.

> Also, please explain, again, what's wrong with dumping snapshots of
> CLOCK_MONOTONIC{,_RAW} from host and guest and correlating time that
> way?

You can't read the guest and the host clock at the same time (there will always
be some variable delay between reading the two clocks). And that delay
is not fixed, but variable (depending on scheduling of the guest vcpus,
for example). So you will need an algorithm to estimate their differences,
with non zero error bounds:

"
Add a driver with gettime method returning hosts realtime clock.
This allows Chrony to synchronize host and guest clocks with
high precision (see results below).

chronyc> sources
MS Name/IP address Stratum Poll Reach LastRx Last sample
===============================================================================
#* PHC0 0 3 377 6 +4ns[ +4ns] +/- 3ns
"

Now with the hardware clock (which is usually the base for CLOCK_MONOTONIC_RAW),
there are no errors (offset will be 0 ns, rather than 3/4ns).

> And also explain why BPF needs to do this differently than all the other
> tracers.

For x86 we use:

echo "x86-tsc" > /sys/kernel/debug/tracing/trace_clock

For this purpose, on x86, so its not like anything different is being
done?

2021-10-07 18:07:52

by Marcelo Tosatti

[permalink] [raw]
Subject: [PATCH v2 bpf-next] bpf: introduce helper bpf_raw_read_cpu_clock


Add bpf_raw_read_cpu_clock helper, to read architecture specific
CPU clock. In x86's case, this is the TSC.

This is necessary to synchronize bpf traces from host and guest bpf-programs
(after subtracting guest tsc-offset from guest timestamps).

Signed-off-by: Marcelo Tosatti <[email protected]>

---

v2:
- drop redundant addition to bpf_tracing_func_proto (Song Liu)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab83c22d274e..832bb1f65f28 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,6 +95,7 @@ config X86
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
+ select ARCH_HAS_BPF_RAW_CPU_CLOCK
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/x86/include/asm/bpf_raw_cpu_clock.h b/arch/x86/include/asm/bpf_raw_cpu_clock.h
new file mode 100644
index 000000000000..6951c399819e
--- /dev/null
+++ b/arch/x86/include/asm/bpf_raw_cpu_clock.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BPF_RAW_CPU_CLOCK_H_
+#define _ASM_X86_BPF_RAW_CPU_CLOCK_H_
+
+static inline unsigned long long read_raw_cpu_clock(void)
+{
+ return rdtsc_ordered();
+}
+
+#endif /* _ASM_X86_BPF_RAW_CPU_CLOCK_H_ */
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 3eff08d7b8e5..844a44ff508d 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -105,6 +105,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_read_raw_cpu_clock:
+ return &bpf_read_raw_cpu_clock_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_prandom_u32:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d604c8251d88..b6cb426085fb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,6 +2058,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
+extern const struct bpf_func_proto bpf_read_raw_cpu_clock_proto;
extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6fc59d61937a..52191791b089 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4037,6 +4037,13 @@ union bpf_attr {
* Return
* Current *ktime*.
*
+ * u64 bpf_read_raw_cpu_clock(void)
+ * Description
+ * Return the architecture specific CPU clock value.
+ * For x86, this is the TSC clock.
+ * Return
+ * *CPU clock value*
+ *
* long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
@@ -5089,6 +5096,7 @@ union bpf_attr {
FN(task_pt_regs), \
FN(get_branch_snapshot), \
FN(trace_vprintk), \
+ FN(read_raw_cpu_clock), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index a82d6de86522..5815db157220 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -21,6 +21,10 @@ config HAVE_EBPF_JIT
config ARCH_WANT_DEFAULT_BPF_JIT
bool

+# Used by archs to tell they support reading raw CPU clock
+config ARCH_HAS_BPF_RAW_CPU_CLOCK
+ bool
+
menu "BPF subsystem"

config BPF_SYSCALL
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b6c72af64d5d..8e2359dfd582 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2345,6 +2345,8 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_read_raw_cpu_clock_proto __weak;
+

const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ffd469c217f..90b9e5efaf65 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,10 @@

#include "../../lib/kstrtox.h"

+#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
+#include <asm/bpf_raw_cpu_clock.h>
+#endif
+
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
* bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
@@ -168,6 +172,21 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
.ret_type = RET_INTEGER,
};

+BPF_CALL_0(bpf_read_raw_cpu_clock)
+{
+#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
+ return read_raw_cpu_clock();
+#else
+ return sched_clock();
+#endif
+}
+
+const struct bpf_func_proto bpf_read_raw_cpu_clock_proto = {
+ .func = bpf_read_raw_cpu_clock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_ktime_get_coarse_ns)
{
return ktime_get_coarse_ns();
@@ -1366,6 +1385,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_read_raw_cpu_clock:
+ return &bpf_read_raw_cpu_clock_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6fc59d61937a..52191791b089 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4037,6 +4037,13 @@ union bpf_attr {
* Return
* Current *ktime*.
*
+ * u64 bpf_read_raw_cpu_clock(void)
+ * Description
+ * Return the architecture specific CPU clock value.
+ * For x86, this is the TSC clock.
+ * Return
+ * *CPU clock value*
+ *
* long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
@@ -5089,6 +5096,7 @@ union bpf_attr {
FN(task_pt_regs), \
FN(get_branch_snapshot), \
FN(trace_vprintk), \
+ FN(read_raw_cpu_clock), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper

2021-10-07 19:00:53

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH v2 bpf-next] bpf: introduce helper bpf_raw_read_cpu_clock

On Thu, Oct 7, 2021 at 11:04 AM Marcelo Tosatti <[email protected]> wrote:
>
>
> Add bpf_raw_read_cpu_clock helper, to read architecture specific
> CPU clock. In x86's case, this is the TSC.
>
> This is necessary to synchronize bpf traces from host and guest bpf-programs
> (after subtracting guest tsc-offset from guest timestamps).
>
> Signed-off-by: Marcelo Tosatti <[email protected]>

Could you please
1) give more context on the target use case;
2) add some selftests for the feature?

Thanks,
Song

>
> ---
>
> v2:
> - drop redundant addition to bpf_tracing_func_proto (Song Liu)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index ab83c22d274e..832bb1f65f28 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -95,6 +95,7 @@ config X86
> select ARCH_HAS_UBSAN_SANITIZE_ALL
> select ARCH_HAS_DEBUG_WX
> select ARCH_HAS_ZONE_DMA_SET if EXPERT
> + select ARCH_HAS_BPF_RAW_CPU_CLOCK
> select ARCH_HAVE_NMI_SAFE_CMPXCHG
> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
> select ARCH_MIGHT_HAVE_PC_PARPORT
> diff --git a/arch/x86/include/asm/bpf_raw_cpu_clock.h b/arch/x86/include/asm/bpf_raw_cpu_clock.h
> new file mode 100644
> index 000000000000..6951c399819e
> --- /dev/null
> +++ b/arch/x86/include/asm/bpf_raw_cpu_clock.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_BPF_RAW_CPU_CLOCK_H_
> +#define _ASM_X86_BPF_RAW_CPU_CLOCK_H_
> +
> +static inline unsigned long long read_raw_cpu_clock(void)
> +{
> + return rdtsc_ordered();
> +}
> +
> +#endif /* _ASM_X86_BPF_RAW_CPU_CLOCK_H_ */
> diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
> index 3eff08d7b8e5..844a44ff508d 100644
> --- a/drivers/media/rc/bpf-lirc.c
> +++ b/drivers/media/rc/bpf-lirc.c
> @@ -105,6 +105,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_ktime_get_ns_proto;
> case BPF_FUNC_ktime_get_boot_ns:
> return &bpf_ktime_get_boot_ns_proto;
> + case BPF_FUNC_read_raw_cpu_clock:
> + return &bpf_read_raw_cpu_clock_proto;
> case BPF_FUNC_tail_call:
> return &bpf_tail_call_proto;
> case BPF_FUNC_get_prandom_u32:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index d604c8251d88..b6cb426085fb 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2058,6 +2058,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
> extern const struct bpf_func_proto bpf_tail_call_proto;
> extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
> extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
> +extern const struct bpf_func_proto bpf_read_raw_cpu_clock_proto;
> extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
> extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
> extern const struct bpf_func_proto bpf_get_current_comm_proto;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 6fc59d61937a..52191791b089 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4037,6 +4037,13 @@ union bpf_attr {
> * Return
> * Current *ktime*.
> *
> + * u64 bpf_read_raw_cpu_clock(void)
> + * Description
> + * Return the architecture specific CPU clock value.
> + * For x86, this is the TSC clock.
> + * Return
> + * *CPU clock value*
> + *
> * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
> * Description
> * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
> @@ -5089,6 +5096,7 @@ union bpf_attr {
> FN(task_pt_regs), \
> FN(get_branch_snapshot), \
> FN(trace_vprintk), \
> + FN(read_raw_cpu_clock), \
> /* */
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
> index a82d6de86522..5815db157220 100644
> --- a/kernel/bpf/Kconfig
> +++ b/kernel/bpf/Kconfig
> @@ -21,6 +21,10 @@ config HAVE_EBPF_JIT
> config ARCH_WANT_DEFAULT_BPF_JIT
> bool
>
> +# Used by archs to tell they support reading raw CPU clock
> +config ARCH_HAS_BPF_RAW_CPU_CLOCK
> + bool
> +
> menu "BPF subsystem"
>
> config BPF_SYSCALL
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index b6c72af64d5d..8e2359dfd582 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2345,6 +2345,8 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
> const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
> const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
> const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
> +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto __weak;
> +
>
> const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
> const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 1ffd469c217f..90b9e5efaf65 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -18,6 +18,10 @@
>
> #include "../../lib/kstrtox.h"
>
> +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
> +#include <asm/bpf_raw_cpu_clock.h>
> +#endif
> +
> /* If kernel subsystem is allowing eBPF programs to call this function,
> * inside its own verifier_ops->get_func_proto() callback it should return
> * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
> @@ -168,6 +172,21 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
> .ret_type = RET_INTEGER,
> };
>
> +BPF_CALL_0(bpf_read_raw_cpu_clock)
> +{
> +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK
> + return read_raw_cpu_clock();
> +#else
> + return sched_clock();
> +#endif
> +}
> +
> +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto = {
> + .func = bpf_read_raw_cpu_clock,
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> +};
> +
> BPF_CALL_0(bpf_ktime_get_coarse_ns)
> {
> return ktime_get_coarse_ns();
> @@ -1366,6 +1385,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
> return &bpf_ktime_get_boot_ns_proto;
> case BPF_FUNC_ktime_get_coarse_ns:
> return &bpf_ktime_get_coarse_ns_proto;
> + case BPF_FUNC_read_raw_cpu_clock:
> + return &bpf_read_raw_cpu_clock_proto;
> case BPF_FUNC_ringbuf_output:
> return &bpf_ringbuf_output_proto;
> case BPF_FUNC_ringbuf_reserve:
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 6fc59d61937a..52191791b089 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -4037,6 +4037,13 @@ union bpf_attr {
> * Return
> * Current *ktime*.
> *
> + * u64 bpf_read_raw_cpu_clock(void)
> + * Description
> + * Return the architecture specific CPU clock value.
> + * For x86, this is the TSC clock.
> + * Return
> + * *CPU clock value*
> + *
> * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
> * Description
> * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
> @@ -5089,6 +5096,7 @@ union bpf_attr {
> FN(task_pt_regs), \
> FN(get_branch_snapshot), \
> FN(trace_vprintk), \
> + FN(read_raw_cpu_clock), \
> /* */
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>