There is a potential security issue: perf kernel samples may be
leaked even though kernel sampling is disabled. To fix the potential
leakage, the idea is to use instruction_pointer_set() to store an
invalid ip address in the leaked perf samples in some cases.
But instruction_pointer_set() is missing on some architectures.
Define instruction_pointer_set() for those architectures.
Signed-off-by: Jin Yao <[email protected]>
---
arch/alpha/include/asm/ptrace.h | 6 ++++++
arch/arc/include/asm/ptrace.h | 6 ++++++
arch/nds32/include/asm/ptrace.h | 7 +++++++
arch/xtensa/include/asm/ptrace.h | 6 ++++++
4 files changed, 25 insertions(+)
diff --git a/arch/alpha/include/asm/ptrace.h b/arch/alpha/include/asm/ptrace.h
index df5f317ab3fc..c464d525c110 100644
--- a/arch/alpha/include/asm/ptrace.h
+++ b/arch/alpha/include/asm/ptrace.h
@@ -25,4 +25,10 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
return regs->r0;
}
+static inline void instruction_pointer_set(struct pt_regs *regs,
+ unsigned long val)
+{
+ regs->pc = val;
+}
+
#endif
diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h
index 2fdb87addadc..8869a6c0fe8c 100644
--- a/arch/arc/include/asm/ptrace.h
+++ b/arch/arc/include/asm/ptrace.h
@@ -154,6 +154,12 @@ static inline long regs_return_value(struct pt_regs *regs)
return (long)regs->r0;
}
+static inline void instruction_pointer_set(struct pt_regs *regs,
+ unsigned long val)
+{
+ regs->ret = val;
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_PTRACE_H */
diff --git a/arch/nds32/include/asm/ptrace.h b/arch/nds32/include/asm/ptrace.h
index 919ee223620c..19a916bef7f5 100644
--- a/arch/nds32/include/asm/ptrace.h
+++ b/arch/nds32/include/asm/ptrace.h
@@ -62,6 +62,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
{
return regs->uregs[0];
}
+
+static inline void instruction_pointer_set(struct pt_regs *regs,
+ unsigned long val)
+{
+ regs->ipc = val;
+}
+
extern void show_regs(struct pt_regs *);
/* Avoid circular header include via sched.h */
struct task_struct;
diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h
index b109416dc07e..82ab1ba99259 100644
--- a/arch/xtensa/include/asm/ptrace.h
+++ b/arch/xtensa/include/asm/ptrace.h
@@ -90,6 +90,12 @@ struct pt_regs {
# define return_pointer(regs) (MAKE_PC_FROM_RA((regs)->areg[0], \
(regs)->areg[1]))
+static inline void instruction_pointer_set(struct pt_regs *regs,
+ unsigned long val)
+{
+ regs->pc = val;
+}
+
# ifndef CONFIG_SMP
# define profile_pc(regs) instruction_pointer(regs)
# else
--
2.17.1
When doing sampling, for example,
perf record -e cycles:u ...
On workloads that do a lot of kernel entry/exits we see kernel
samples, even though :u is specified. This is due to skid.
This is a potential security issue because it may leak kernel
addresses even though kernel sampling is disabled.
The commit cc1582c231ea ("perf/core: Drop kernel samples even
though :u is specified") dropped the leaked kernel samples but it
broke rr-project.
Another idea (inspired by Mark Rutland's original patch) is not to
drop the samples: keep them, but fake the regs by using the user regs
of the current task. If the user regs are not available, use
instruction_pointer_set() to set the ip of the regs to -1L.
For CALLCHAIN, get_perf_callchain() already checks user_mode(regs)
and uses task_pt_regs(current) instead in some cases, so it already
accounts for the possibility of leaking.
For REGS_USER and STACK_USER it's similar: perf_sample_regs_user()
also checks user_mode(regs) and handles the kernel-context case
itself, so we don't need to use "regs_fake" there.
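For reference, the existing fallbacks being referred to look roughly like
the sketches below (paraphrased from kernel/events/callchain.c and
kernel/events/core.c; not verbatim, and details may differ between kernel
versions):

	/* Paraphrased sketch of get_perf_callchain()'s user fallback */
	if (user) {
		if (!user_mode(regs)) {
			if (current->mm)
				regs = task_pt_regs(current);	/* use the task's user regs */
			else
				regs = NULL;			/* kernel thread: nothing to dump */
		}
		if (regs)
			perf_callchain_user(&ctx, regs);
	}

	/* Paraphrased sketch of perf_sample_regs_user()'s handling */
	if (user_mode(regs)) {
		regs_user->abi  = perf_reg_abi(current);
		regs_user->regs = regs;
	} else if (!(current->flags & PF_KTHREAD)) {
		perf_get_regs_user(regs_user, regs, regs_user_copy);
	} else {
		regs_user->abi  = PERF_SAMPLE_REGS_ABI_NONE;
		regs_user->regs = NULL;
	}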
Example:
perf record -e cycles:u ./div
perf report --stdio
Before:
# Overhead Command Shared Object Symbol
# ........ ....... ................ ........................
#
41.08% div div [.] main
21.04% div libc-2.27.so [.] __random_r
19.90% div libc-2.27.so [.] __random
9.86% div div [.] compute_flag
4.24% div libc-2.27.so [.] rand
3.88% div div [.] rand@plt
0.01% div [unknown] [k] 0xffffffffb9601c20
0.00% div libc-2.27.so [.] _dl_addr
0.00% div ld-2.27.so [.] __GI___tunables_init
0.00% div [unknown] [k] 0xffffffffb9601210
0.00% div ld-2.27.so [.] _start
"0xffffffffb9601c20, 0xffffffffb9601210" are leaked kernel addresses.
After:
# Overhead Command Shared Object Symbol
# ........ ....... ............. ........................
#
40.86% div div [.] main
20.67% div libc-2.27.so [.] __random_r
20.54% div libc-2.27.so [.] __random
9.65% div div [.] compute_flag
4.32% div libc-2.27.so [.] rand
3.97% div div [.] rand@plt
0.00% div ld-2.27.so [.] __GI___tunables_init
0.00% div ld-2.27.so [.] _start
Now there are no leaked kernel addresses.
Inspired-by: Mark Rutland <[email protected]>
Signed-off-by: Jin Yao <[email protected]>
---
kernel/events/core.c | 48 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 45 insertions(+), 3 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c436d705fbd..52f6d7f0b86b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
- bool kernel = !event->attr.exclude_callchain_kernel;
+ bool kernel = !event->attr.exclude_callchain_kernel &&
+ !event->attr.exclude_kernel;
bool user = !event->attr.exclude_callchain_user;
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
@@ -6988,12 +6989,39 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
+static struct pt_regs *leak_check(struct perf_event_header *header,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ struct pt_regs *regs_fake = NULL;
+
+ if (event->attr.exclude_kernel && !user_mode(regs)) {
+ if (!(current->flags & PF_KTHREAD)) {
+ regs_fake = task_pt_regs(current);
+ if (!user_mode(regs_fake)) {
+ regs_fake = NULL;
+ instruction_pointer_set(regs, -1L);
+ }
+ } else
+ instruction_pointer_set(regs, -1L);
+
+ if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
+ PERF_RECORD_MISC_KERNEL) {
+ header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
+ header->misc |= PERF_RECORD_MISC_USER;
+ }
+ }
+
+ return regs_fake;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
+ struct pt_regs *regs_fake;
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size;
@@ -7003,8 +7031,19 @@ void perf_prepare_sample(struct perf_event_header *header,
__perf_event_header__init_id(header, data, event);
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events. To avoid leaking kernel address to
+ * userspace, we try to fake the regs by using the user regs
+ * of current task.
+ */
+ regs_fake = leak_check(header, event, regs);
+
if (sample_type & PERF_SAMPLE_IP)
- data->ip = perf_instruction_pointer(regs);
+ data->ip = (regs_fake) ?
+ perf_instruction_pointer(regs_fake) :
+ perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
@@ -7099,7 +7138,10 @@ void perf_prepare_sample(struct perf_event_header *header,
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_intr(&data->regs_intr, regs);
+ if (regs_fake)
+ perf_sample_regs_intr(&data->regs_intr, regs_fake);
+ else
+ perf_sample_regs_intr(&data->regs_intr, regs);
if (data->regs_intr.regs) {
u64 mask = event->attr.sample_regs_intr;
--
2.17.1
On Fri, Jul 31, 2020 at 10:56:16AM +0800, Jin Yao wrote:
> There is a potential security issue that perf kernel samples
> may be leaked even though kernel sampling is disabled. For fixing
> the potential leakage, the idea is to use instruction_pointer_set
> to set invalid ip address in leaked perf samples in some cases.
>
> But instruction_pointer_set is missing on some architectures.
> Define instruction_pointer_set for these architectures.
>
> Signed-off-by: Jin Yao <[email protected]>
> ---
> arch/alpha/include/asm/ptrace.h | 6 ++++++
> arch/arc/include/asm/ptrace.h | 6 ++++++
> arch/nds32/include/asm/ptrace.h | 7 +++++++
> arch/xtensa/include/asm/ptrace.h | 6 ++++++
> 4 files changed, 25 insertions(+)
AFAICT you forgot to actually Cc the maintainers for all that.
> diff --git a/arch/alpha/include/asm/ptrace.h b/arch/alpha/include/asm/ptrace.h
> index df5f317ab3fc..c464d525c110 100644
> --- a/arch/alpha/include/asm/ptrace.h
> +++ b/arch/alpha/include/asm/ptrace.h
> @@ -25,4 +25,10 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
> return regs->r0;
> }
>
> +static inline void instruction_pointer_set(struct pt_regs *regs,
> + unsigned long val)
> +{
> + regs->pc = val;
> +}
> +
> #endif
> diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h
> index 2fdb87addadc..8869a6c0fe8c 100644
> --- a/arch/arc/include/asm/ptrace.h
> +++ b/arch/arc/include/asm/ptrace.h
> @@ -154,6 +154,12 @@ static inline long regs_return_value(struct pt_regs *regs)
> return (long)regs->r0;
> }
>
> +static inline void instruction_pointer_set(struct pt_regs *regs,
> + unsigned long val)
> +{
> + regs->ret = val;
> +}
> +
> #endif /* !__ASSEMBLY__ */
>
> #endif /* __ASM_PTRACE_H */
> diff --git a/arch/nds32/include/asm/ptrace.h b/arch/nds32/include/asm/ptrace.h
> index 919ee223620c..19a916bef7f5 100644
> --- a/arch/nds32/include/asm/ptrace.h
> +++ b/arch/nds32/include/asm/ptrace.h
> @@ -62,6 +62,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
> {
> return regs->uregs[0];
> }
> +
> +static inline void instruction_pointer_set(struct pt_regs *regs,
> + unsigned long val)
> +{
> + regs->ipc = val;
> +}
> +
> extern void show_regs(struct pt_regs *);
> /* Avoid circular header include via sched.h */
> struct task_struct;
> diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h
> index b109416dc07e..82ab1ba99259 100644
> --- a/arch/xtensa/include/asm/ptrace.h
> +++ b/arch/xtensa/include/asm/ptrace.h
> @@ -90,6 +90,12 @@ struct pt_regs {
> # define return_pointer(regs) (MAKE_PC_FROM_RA((regs)->areg[0], \
> (regs)->areg[1]))
>
> +static inline void instruction_pointer_set(struct pt_regs *regs,
> + unsigned long val)
> +{
> + regs->pc = val;
> +}
> +
> # ifndef CONFIG_SMP
> # define profile_pc(regs) instruction_pointer(regs)
> # else
> --
> 2.17.1
>
On Fri, Jul 31, 2020 at 10:56:17AM +0800, Jin Yao wrote:
> @@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
> struct perf_callchain_entry *
> perf_callchain(struct perf_event *event, struct pt_regs *regs)
> {
> - bool kernel = !event->attr.exclude_callchain_kernel;
> + bool kernel = !event->attr.exclude_callchain_kernel &&
> + !event->attr.exclude_kernel;
This seems weird; how can we get there. Also it seems to me that if you
have !exclude_callchain_kernel you already have permission for kernel
bits, so who cares.
> bool user = !event->attr.exclude_callchain_user;
> /* Disallow cross-task user callchains. */
> bool crosstask = event->ctx->task && event->ctx->task != current;
> @@ -6988,12 +6989,39 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
> return callchain ?: &__empty_callchain;
> }
>
> +static struct pt_regs *leak_check(struct perf_event_header *header,
> + struct perf_event *event,
> + struct pt_regs *regs)
> +{
> + struct pt_regs *regs_fake = NULL;
> +
> + if (event->attr.exclude_kernel && !user_mode(regs)) {
> + if (!(current->flags & PF_KTHREAD)) {
> + regs_fake = task_pt_regs(current);
> + if (!user_mode(regs_fake)) {
> + regs_fake = NULL;
> + instruction_pointer_set(regs, -1L);
> + }
> + } else
> + instruction_pointer_set(regs, -1L);
That violates coding style, also:
if (!(current->flags & PF_KTHREAD)) {
regs_fake = task_pt_regs(current);
if (!user_mode(regs_fake)) /* is this not a BUG? */
regs_fake = NULL;
}
if (!regs_fake)
instruction_pointer_set(regs, -1L);
Seems simpler to me.
> + if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
> + PERF_RECORD_MISC_KERNEL) {
> + header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
> + header->misc |= PERF_RECORD_MISC_USER;
> + }
Why the conditional? At this point it had better be unconditionally
user, no?
headers->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
headers->misc |= PERF_RECORD_MISC_USER;
> + }
> +
> + return regs_fake;
> +}
On Thu, Jul 30, 2020 at 7:59 PM Jin Yao <[email protected]> wrote:
>
> There is a potential security issue that perf kernel samples
> may be leaked even though kernel sampling is disabled. For fixing
> the potential leakage, the idea is to use instruction_pointer_set
> to set invalid ip address in leaked perf samples in some cases.
>
> But instruction_pointer_set is missing on some architectures.
> Define instruction_pointer_set for these architectures.
>
> Signed-off-by: Jin Yao <[email protected]>
> ---
> arch/alpha/include/asm/ptrace.h | 6 ++++++
> arch/arc/include/asm/ptrace.h | 6 ++++++
> arch/nds32/include/asm/ptrace.h | 7 +++++++
> arch/xtensa/include/asm/ptrace.h | 6 ++++++
> 4 files changed, 25 insertions(+)
For xtensa:
Acked-by: Max Filippov <[email protected]>
--
Thanks.
-- Max
Hi Peter,
On 8/4/2020 7:31 PM, [email protected] wrote:
> On Fri, Jul 31, 2020 at 10:56:16AM +0800, Jin Yao wrote:
>> There is a potential security issue that perf kernel samples
>> may be leaked even though kernel sampling is disabled. For fixing
>> the potential leakage, the idea is to use instruction_pointer_set
>> to set invalid ip address in leaked perf samples in some cases.
>>
>> But instruction_pointer_set is missing on some architectures.
>> Define instruction_pointer_set for these architectures.
>>
>> Signed-off-by: Jin Yao <[email protected]>
>> ---
>> arch/alpha/include/asm/ptrace.h | 6 ++++++
>> arch/arc/include/asm/ptrace.h | 6 ++++++
>> arch/nds32/include/asm/ptrace.h | 7 +++++++
>> arch/xtensa/include/asm/ptrace.h | 6 ++++++
>> 4 files changed, 25 insertions(+)
>
> AFAICT you forgot to actually Cc the maintainers for all that.
>
Thanks so much for helping to Cc the maintainers. I just checked the MAINTAINERS file and Cc'd the
maintainer for PTRACE SUPPORT, but I forgot to use the get_maintainer.pl script. :(
Thanks
Jin Yao
>> diff --git a/arch/alpha/include/asm/ptrace.h b/arch/alpha/include/asm/ptrace.h
>> index df5f317ab3fc..c464d525c110 100644
>> --- a/arch/alpha/include/asm/ptrace.h
>> +++ b/arch/alpha/include/asm/ptrace.h
>> @@ -25,4 +25,10 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
>> return regs->r0;
>> }
>>
>> +static inline void instruction_pointer_set(struct pt_regs *regs,
>> + unsigned long val)
>> +{
>> + regs->pc = val;
>> +}
>> +
>> #endif
>> diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h
>> index 2fdb87addadc..8869a6c0fe8c 100644
>> --- a/arch/arc/include/asm/ptrace.h
>> +++ b/arch/arc/include/asm/ptrace.h
>> @@ -154,6 +154,12 @@ static inline long regs_return_value(struct pt_regs *regs)
>> return (long)regs->r0;
>> }
>>
>> +static inline void instruction_pointer_set(struct pt_regs *regs,
>> + unsigned long val)
>> +{
>> + regs->ret = val;
>> +}
>> +
>> #endif /* !__ASSEMBLY__ */
>>
>> #endif /* __ASM_PTRACE_H */
>> diff --git a/arch/nds32/include/asm/ptrace.h b/arch/nds32/include/asm/ptrace.h
>> index 919ee223620c..19a916bef7f5 100644
>> --- a/arch/nds32/include/asm/ptrace.h
>> +++ b/arch/nds32/include/asm/ptrace.h
>> @@ -62,6 +62,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
>> {
>> return regs->uregs[0];
>> }
>> +
>> +static inline void instruction_pointer_set(struct pt_regs *regs,
>> + unsigned long val)
>> +{
>> + regs->ipc = val;
>> +}
>> +
>> extern void show_regs(struct pt_regs *);
>> /* Avoid circular header include via sched.h */
>> struct task_struct;
>> diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h
>> index b109416dc07e..82ab1ba99259 100644
>> --- a/arch/xtensa/include/asm/ptrace.h
>> +++ b/arch/xtensa/include/asm/ptrace.h
>> @@ -90,6 +90,12 @@ struct pt_regs {
>> # define return_pointer(regs) (MAKE_PC_FROM_RA((regs)->areg[0], \
>> (regs)->areg[1]))
>>
>> +static inline void instruction_pointer_set(struct pt_regs *regs,
>> + unsigned long val)
>> +{
>> + regs->pc = val;
>> +}
>> +
>> # ifndef CONFIG_SMP
>> # define profile_pc(regs) instruction_pointer(regs)
>> # else
>> --
>> 2.17.1
>>
Hi Peter,
On 8/4/2020 7:49 PM, [email protected] wrote:
> On Fri, Jul 31, 2020 at 10:56:17AM +0800, Jin Yao wrote:
>> @@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>> struct perf_callchain_entry *
>> perf_callchain(struct perf_event *event, struct pt_regs *regs)
>> {
>> - bool kernel = !event->attr.exclude_callchain_kernel;
>> + bool kernel = !event->attr.exclude_callchain_kernel &&
>> + !event->attr.exclude_kernel;
>
> This seems weird; how can we get there. Also it seems to me that if you
> have !exclude_callchain_kernel you already have permission for kernel
> bits, so who cares.
>
In the perf tool, exclude_callchain_kernel is set to 1 when perf-record only collects user
callchains, and exclude_kernel is set to 1 when events are configured to run in user space.
So if an event is configured to run in user space, it makes sense that we don't need its kernel
callchains.
But it seems to me there is no code logic in the perf tool which guarantees !exclude_callchain_kernel
-> !exclude_kernel.
Jiri, Arnaldo, is my understanding correct?
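For illustration, the two configurations under discussion would look roughly
like this at the perf_event_attr level (a sketch only; the values are what
the discussion implies, not copied from the perf tool):

	/* "cycles:u"-style setup: user-only counting and user-only callchains */
	struct perf_event_attr user_only = {
		.exclude_kernel           = 1,
		.exclude_callchain_kernel = 1,
	};

	/* The valid-but-odd combination Peter points out: user-only counting,
	 * yet kernel callchains not excluded */
	struct perf_event_attr odd_but_valid = {
		.exclude_kernel           = 1,
		.exclude_callchain_kernel = 0,
	};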
>> bool user = !event->attr.exclude_callchain_user;
>> /* Disallow cross-task user callchains. */
>> bool crosstask = event->ctx->task && event->ctx->task != current;
>> @@ -6988,12 +6989,39 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
>> return callchain ?: &__empty_callchain;
>> }
>>
>> +static struct pt_regs *leak_check(struct perf_event_header *header,
>> + struct perf_event *event,
>> + struct pt_regs *regs)
>> +{
>> + struct pt_regs *regs_fake = NULL;
>> +
>> + if (event->attr.exclude_kernel && !user_mode(regs)) {
>> + if (!(current->flags & PF_KTHREAD)) {
>> + regs_fake = task_pt_regs(current);
>> + if (!user_mode(regs_fake)) {
>> + regs_fake = NULL;
>> + instruction_pointer_set(regs, -1L);
>> + }
>> + } else
>> + instruction_pointer_set(regs, -1L);
>
> That violates coding style, also:
>
Thanks, I should use:
} else {
instruction_pointer_set(regs, -1L);
}
> if (!(current->flags & PF_KTHREAD)) {
> regs_fake = task_pt_regs(current);
> if (!user_mode(regs_fake)) /* is this not a BUG? */
We don't need the !user_mode(regs_fake) check here; it's an unnecessary check.
> regs_fake = NULL;
> }
>
> if (!regs_fake)
> instruction_pointer_set(regs, -1L);
>
> Seems simpler to me.
>
So the new code looks like:
if (event->attr.exclude_kernel && !user_mode(regs)) {
if (!(current->flags & PF_KTHREAD)) {
regs_fake = task_pt_regs(current);
if (!regs_fake)
instruction_pointer_set(regs, -1L);
} else {
instruction_pointer_set(regs, -1L);
}
>> + if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
>> + PERF_RECORD_MISC_KERNEL) {
>> + header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
>> + header->misc |= PERF_RECORD_MISC_USER;
>> + }
>
> Why the conditional? At this point it had better be unconditionally
> user, no?
>
> headers->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
> headers->misc |= PERF_RECORD_MISC_USER;
>
#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
#define PERF_RECORD_MISC_KERNEL (1 << 0)
#define PERF_RECORD_MISC_USER (2 << 0)
#define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
#define PERF_RECORD_MISC_GUEST_USER (5 << 0)
If we unconditionally set user, it would also clobber the hypervisor, guest kernel and guest user modes.
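To make the concern concrete (illustrative values only, not code from the
patch):

	/* Suppose the sample was tagged as guest kernel code: */
	header->misc = PERF_RECORD_MISC_GUEST_KERNEL;		/* (4 << 0) */

	/* The unconditional rewrite would relabel it as plain user: */
	header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
	header->misc |= PERF_RECORD_MISC_USER;			/* now (2 << 0) */

	/*
	 * The conditional version only touches samples already marked
	 * PERF_RECORD_MISC_KERNEL, leaving the HYPERVISOR, GUEST_KERNEL and
	 * GUEST_USER tags intact.
	 */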
Thanks
Jin Yao
>> + }
>> +
>> + return regs_fake;
>> +}
On Wed, Aug 05, 2020 at 02:44:54PM +0200, [email protected] wrote:
> How's this?
Clearly I didn't even hold it near a compiler...
> ---
> kernel/events/core.c | 38 +++++++++++++++++++++++++++++++++-----
> 1 file changed, 33 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 7c436d705fbd..3e4e328b521a 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6988,23 +6988,49 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
> return callchain ?: &__empty_callchain;
> }
>
> +/*
> + * Due to interrupt latency (skid), we may enter the kernel before taking the
> + * PMI, even if the PMU is configured to only count user events. To avoid
> + * leaking kernel addresses, use task_pt_regs(), when available.
> + */
> +static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
> +{
> + struct pt_regs *sample_regs = regs;
> +
> + /* user only */
> + if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
> + !event->attr.exclude_host || !event->attr.exclude_guest)
> + return sample_regs;
> +
> + if (sample_regs(regs))
> + return sample_regs;
That wants to be:
if (user_regs(regs))
return sample_regs;
> +
> + if (!(current->flags & PF_KTHREAD)) {
s/{//
> + sample_regs = task_pt_regs(current);
> + else
> + instruction_pointer_set(regs, -1L);
> +
> + return sample_regs;
> +}
On Wed, Aug 05, 2020 at 10:15:26AM +0800, Jin, Yao wrote:
> Hi Peter,
>
> On 8/4/2020 7:49 PM, [email protected] wrote:
> > On Fri, Jul 31, 2020 at 10:56:17AM +0800, Jin Yao wrote:
> > > @@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
> > > struct perf_callchain_entry *
> > > perf_callchain(struct perf_event *event, struct pt_regs *regs)
> > > {
> > > - bool kernel = !event->attr.exclude_callchain_kernel;
> > > + bool kernel = !event->attr.exclude_callchain_kernel &&
> > > + !event->attr.exclude_kernel;
> >
> > This seems weird; how can we get there. Also it seems to me that if you
> > have !exclude_callchain_kernel you already have permission for kernel
> > bits, so who cares.
> >
>
> In perf tool, exclude_callchain_kernel is set to 1 when perf-record only
> collects the user callchains and exclude_kernel is set to 1 when events are
> configured to run in user space.
>
> So if an event is configured to run in user space, that should make sense we
> don't need it's kernel callchains.
>
> But it seems to me there is no code logic in perf tool which can make sure
> !exclude_callchain_kernel -> !exclude_kernel.
>
> Jiri, Arnaldo, is my understanding correct?
What the perf tool does or does not do is irrelevant. It is a valid,
(albeit slightly silly) configuration to have:
exclude_kernel && !exclude_callchain_kernel
You're now saying that when you configure things like this you're not
allowed kernel IPs, that's wrong I think.
Also, !exclude_callchain_kernel should require privilege, which needs
fixing, see below.
> So the new code looks like:
>
> if (event->attr.exclude_kernel && !user_mode(regs)) {
> if (!(current->flags & PF_KTHREAD)) {
> regs_fake = task_pt_regs(current);
> if (!regs_fake)
> instruction_pointer_set(regs, -1L);
> } else {
> instruction_pointer_set(regs, -1L);
> }
Again:
if (!(current->flags & PF_KTHREAD))
regs_fake = task_pt_regs(current);
if (!regs_fake)
instruction_pointer_set(regs, -1L);
Is much simpler and more readable.
> > > + if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
> > > + PERF_RECORD_MISC_KERNEL) {
> > > + header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
> > > + header->misc |= PERF_RECORD_MISC_USER;
> > > + }
> >
> > Why the conditional? At this point it had better be unconditionally
> > user, no?
> >
> > headers->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
> > headers->misc |= PERF_RECORD_MISC_USER;
> >
>
> #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
> #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
> #define PERF_RECORD_MISC_KERNEL (1 << 0)
> #define PERF_RECORD_MISC_USER (2 << 0)
> #define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
> #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
> #define PERF_RECORD_MISC_GUEST_USER (5 << 0)
>
> If we unconditionally set user, it will reset for hypervisor, guest
> kernel and guest_user.
At the same time :u had better not get any of those either. Which seems
to suggest we're going about this wrong.
Also, if we call this before perf_misc_flags() we don't need to fix it
up.
How's this?
---
kernel/events/core.c | 38 +++++++++++++++++++++++++++++++++-----
1 file changed, 33 insertions(+), 5 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c436d705fbd..3e4e328b521a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6988,23 +6988,49 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
+/*
+ * Due to interrupt latency (skid), we may enter the kernel before taking the
+ * PMI, even if the PMU is configured to only count user events. To avoid
+ * leaking kernel addresses, use task_pt_regs(), when available.
+ */
+static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
+{
+ struct pt_regs *sample_regs = regs;
+
+ /* user only */
+ if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
+ !event->attr.exclude_host || !event->attr.exclude_guest)
+ return sample_regs;
+
+ if (sample_regs(regs))
+ return sample_regs;
+
+ if (!(current->flags & PF_KTHREAD)) {
+ sample_regs = task_pt_regs(current);
+ else
+ instruction_pointer_set(regs, -1L);
+
+ return sample_regs;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
+ struct pt_regs *sample_regs = sanitize_sample_regs(event, regs);
u64 sample_type = event->attr.sample_type;
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size;
header->misc = 0;
- header->misc |= perf_misc_flags(regs);
+ header->misc |= perf_misc_flags(sample_regs);
__perf_event_header__init_id(header, data, event);
if (sample_type & PERF_SAMPLE_IP)
- data->ip = perf_instruction_pointer(regs);
+ data->ip = perf_instruction_pointer(sample_regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
@@ -7054,9 +7080,10 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) {
perf_sample_regs_user(&data->regs_user, regs,
&data->regs_user_copy);
+ }
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
@@ -7099,7 +7126,7 @@ void perf_prepare_sample(struct perf_event_header *header,
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_intr(&data->regs_intr, regs);
+ perf_sample_regs_intr(&data->regs_intr, sample_regs);
if (data->regs_intr.regs) {
u64 mask = event->attr.sample_regs_intr;
@@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
return err;
- if (!attr.exclude_kernel) {
+ if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
+ !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
err = perf_allow_kernel(&attr);
if (err)
return err;
Hi Peter,
On 8/5/2020 8:44 PM, [email protected] wrote:
> On Wed, Aug 05, 2020 at 10:15:26AM +0800, Jin, Yao wrote:
>> Hi Peter,
>>
>> On 8/4/2020 7:49 PM, [email protected] wrote:
>>> On Fri, Jul 31, 2020 at 10:56:17AM +0800, Jin Yao wrote:
>>>> @@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>>>> struct perf_callchain_entry *
>>>> perf_callchain(struct perf_event *event, struct pt_regs *regs)
>>>> {
>>>> - bool kernel = !event->attr.exclude_callchain_kernel;
>>>> + bool kernel = !event->attr.exclude_callchain_kernel &&
>>>> + !event->attr.exclude_kernel;
>>>
>>> This seems weird; how can we get there. Also it seems to me that if you
>>> have !exclude_callchain_kernel you already have permission for kernel
>>> bits, so who cares.
>>>
>>
>> In perf tool, exclude_callchain_kernel is set to 1 when perf-record only
>> collects the user callchains and exclude_kernel is set to 1 when events are
>> configured to run in user space.
>>
>> So if an event is configured to run in user space, that should make sense we
>> don't need it's kernel callchains.
>>
>> But it seems to me there is no code logic in perf tool which can make sure
>> !exclude_callchain_kernel -> !exclude_kernel.
>>
>> Jiri, Arnaldo, is my understanding correct?
>
> What the perf tool does or does not do is irrelevant. It is a valid,
> (albeit slightly silly) configuration to have:
>
> exclude_kernel && !exclude_callchain_kernel
>
> You're now saying that when you configure things like this you're not
> allowed kernel IPs, that's wrong I think.
>
> Also, !exclude_callchain_kernel should require privilege, which needs
> fixing, see below.
>
I see you added the '!exclude_callchain_kernel' check before perf_allow_kernel() at the syscall entry in
the code below.
So if we allow kernel callchain collection, that means we allow kernel access by default. That makes
sense, thanks!
>> So the new code looks like:
>>
>> if (event->attr.exclude_kernel && !user_mode(regs)) {
>> if (!(current->flags & PF_KTHREAD)) {
>> regs_fake = task_pt_regs(current);
>> if (!regs_fake)
>> instruction_pointer_set(regs, -1L);
>> } else {
>> instruction_pointer_set(regs, -1L);
>> }
>
> Again:
>
> if (!(current->flags & PF_KTHREAD))
> regs_fake = task_pt_regs(current);
>
> if (!regs_fake)
> instruction_pointer_set(regs, -1L);
>
> Is much simpler and more readable.
>
Yes, agree. Your code is much simpler and better.
>>>> + if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
>>>> + PERF_RECORD_MISC_KERNEL) {
>>>> + header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
>>>> + header->misc |= PERF_RECORD_MISC_USER;
>>>> + }
>>>
>>> Why the conditional? At this point it had better be unconditionally
>>> user, no?
>>>
>>> headers->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
>>> headers->misc |= PERF_RECORD_MISC_USER;
>>>
>>
>> #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
>> #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
>> #define PERF_RECORD_MISC_KERNEL (1 << 0)
>> #define PERF_RECORD_MISC_USER (2 << 0)
>> #define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
>> #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
>> #define PERF_RECORD_MISC_GUEST_USER (5 << 0)
>>
>> If we unconditionally set user, it will reset for hypervisor, guest
>> kernel and guest_user.
>
> At the same time :u had better not get any of those either. Which seems
> to suggest we're going about this wrong.
>
> Also, if we call this before perf_misc_flags() we don't need to fix it
> up.
>
> How's this?
>
> ---
> kernel/events/core.c | 38 +++++++++++++++++++++++++++++++++-----
> 1 file changed, 33 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 7c436d705fbd..3e4e328b521a 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6988,23 +6988,49 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
> return callchain ?: &__empty_callchain;
> }
>
> +/*
> + * Due to interrupt latency (skid), we may enter the kernel before taking the
> + * PMI, even if the PMU is configured to only count user events. To avoid
> + * leaking kernel addresses, use task_pt_regs(), when available.
> + */
> +static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
> +{
> + struct pt_regs *sample_regs = regs;
> +
> + /* user only */
> + if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
> + !event->attr.exclude_host || !event->attr.exclude_guest)
> + return sample_regs;
> +
Is this condition correct?
Say we are counting a user event on the host: exclude_kernel = 1 and exclude_host = 0. It will take
the "return sample_regs" path.
> + if (sample_regs(regs))
> + return sample_regs;
> +
In your other mail, you said it should be:
if (user_regs(regs))
return sample_regs;
> + if (!(current->flags & PF_KTHREAD)) {
No '{', as you mentioned in another mail.
> + sample_regs = task_pt_regs(current);
> + else
> + instruction_pointer_set(regs, -1L);
> +
> + return sample_regs;
> +}
> +
> void perf_prepare_sample(struct perf_event_header *header,
> struct perf_sample_data *data,
> struct perf_event *event,
> struct pt_regs *regs)
> {
> + struct pt_regs *sample_regs = sanitize_sample_regs(event, regs);
> u64 sample_type = event->attr.sample_type;
>
> header->type = PERF_RECORD_SAMPLE;
> header->size = sizeof(*header) + event->header_size;
>
> header->misc = 0;
> - header->misc |= perf_misc_flags(regs);
> + header->misc |= perf_misc_flags(sample_regs);
>
> __perf_event_header__init_id(header, data, event);
>
> if (sample_type & PERF_SAMPLE_IP)
> - data->ip = perf_instruction_pointer(regs);
> + data->ip = perf_instruction_pointer(sample_regs);
>
> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> int size = 1;
> @@ -7054,9 +7080,10 @@ void perf_prepare_sample(struct perf_event_header *header,
> header->size += size;
> }
>
> - if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
> + if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) {
> perf_sample_regs_user(&data->regs_user, regs,
> &data->regs_user_copy);
> + }
>
> if (sample_type & PERF_SAMPLE_REGS_USER) {
> /* regs dump ABI info */
> @@ -7099,7 +7126,7 @@ void perf_prepare_sample(struct perf_event_header *header,
> /* regs dump ABI info */
> int size = sizeof(u64);
>
> - perf_sample_regs_intr(&data->regs_intr, regs);
> + perf_sample_regs_intr(&data->regs_intr, sample_regs);
>
> if (data->regs_intr.regs) {
> u64 mask = event->attr.sample_regs_intr;
> @@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
> if (err)
> return err;
>
> - if (!attr.exclude_kernel) {
> + if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
> + !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
> err = perf_allow_kernel(&attr);
> if (err)
> return err;
>
I can understand the conditions "!attr.exclude_kernel || !attr.exclude_callchain_kernel".
But I'm not very sure about the "!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest".
On host, exclude_hv = 1, exclude_guest = 1 and exclude_host = 0, right?
So even if exclude_kernel = 1 but exclude_host = 0, we will still go down the perf_allow_kernel()
path. Please correct me if my understanding is wrong.
Thanks
Jin Yao
On Thu, Aug 06, 2020 at 10:26:29AM +0800, Jin, Yao wrote:
> > +static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
> > +{
> > + struct pt_regs *sample_regs = regs;
> > +
> > + /* user only */
> > + if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
> > + !event->attr.exclude_host || !event->attr.exclude_guest)
> > + return sample_regs;
> > +
>
> Is this condition correct?
>
> Say counting user event on host, exclude_kernel = 1 and exclude_host = 0. It
> will go "return sample_regs" path.
I'm not sure, I'm terminally confused on virt stuff.
Suppose we have nested virt:
L0-hv
|
G0/L1-hv
|
G1
And we're running in G0, then:
- 'exclude_hv' would exclude L0 events
- 'exclude_host' would ... exclude L1-hv events?
- 'exclude_guest' would ... exclude G1 events?
Then the next question is, if G0 is a host, does the L1-hv run in
G0 userspace or G0 kernel space?
I was assuming G0 userspace would not include anything L1 (kvm is a
kernel module after all), but what do I know.
> > @@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
> > if (err)
> > return err;
> > - if (!attr.exclude_kernel) {
> > + if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
> > + !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
> > err = perf_allow_kernel(&attr);
> > if (err)
> > return err;
> >
>
> I can understand the conditions "!attr.exclude_kernel || !attr.exclude_callchain_kernel".
>
> But I'm not very sure about the "!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest".
Well, I'm very sure G0 userspace should never see L0 or G1 state, so
exclude_hv and exclude_guest had better be true.
> On host, exclude_hv = 1, exclude_guest = 1 and exclude_host = 0, right?
Same as above, is G0 host state G0 userspace?
> So even exclude_kernel = 1 but exclude_host = 0, we will still go
> perf_allow_kernel path. Please correct me if my understanding is wrong.
Yes, because with those permission checks in place it means you have
permission to see kernel bits.
On Thu, Aug 06, 2020 at 11:18:27AM +0200, [email protected] wrote:
> On Thu, Aug 06, 2020 at 10:26:29AM +0800, Jin, Yao wrote:
>
> > > +static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
> > > +{
> > > + struct pt_regs *sample_regs = regs;
> > > +
> > > + /* user only */
> > > + if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
> > > + !event->attr.exclude_host || !event->attr.exclude_guest)
> > > + return sample_regs;
> > > +
> >
> > Is this condition correct?
> >
> > Say counting user event on host, exclude_kernel = 1 and exclude_host = 0. It
> > will go "return sample_regs" path.
>
> I'm not sure, I'm terminally confused on virt stuff.
[A]
> Suppose we have nested virt:
>
> L0-hv
> |
> G0/L1-hv
> |
> G1
>
> And we're running in G0, then:
>
> - 'exclude_hv' would exclude L0 events
> - 'exclude_host' would ... exclude L1-hv events?
> - 'exclude_guest' would ... exclude G1 events?
[B]
> Then the next question is, if G0 is a host, does the L1-hv run in
> G0 userspace or G0 kernel space?
>
> I was assuming G0 userspace would not include anything L1 (kvm is a
> kernel module after all), but what do I know.
>
> > > @@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
> > > if (err)
> > > return err;
> > > - if (!attr.exclude_kernel) {
> > > + if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
> > > + !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
> > > err = perf_allow_kernel(&attr);
> > > if (err)
> > > return err;
> > >
> >
> > I can understand the conditions "!attr.exclude_kernel || !attr.exclude_callchain_kernel".
> >
> > But I'm not very sure about the "!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest".
>
> Well, I'm very sure G0 userspace should never see L0 or G1 state, so
> exclude_hv and exclude_guest had better be true.
>
> > On host, exclude_hv = 1, exclude_guest = 1 and exclude_host = 0, right?
>
> Same as above, is G0 host state G0 userspace?
>
> > So even exclude_kernel = 1 but exclude_host = 0, we will still go
> > perf_allow_kernel path. Please correct me if my understanding is wrong.
>
> Yes, because with those permission checks in place it means you have
> permission to see kernel bits.
So if I understand 'exclude_host' wrong -- a distinct possibility -- can
we then pretty please have the above [A-B] corrected and put in a
comment near perf_event_attr and the exclude_* comments changed to refer
to that?
On Thu, Aug 06, 2020 at 11:18:27AM +0200, [email protected] wrote:
> Suppose we have nested virt:
>
> L0-hv
> |
> G0/L1-hv
> |
> G1
>
> And we're running in G0, then:
>
> - 'exclude_hv' would exclude L0 events
> - 'exclude_host' would ... exclude L1-hv events?
> - 'exclude_guest' would ... exclude G1 events?
So in arch/x86/events/intel/core.c we have:
static inline void intel_set_masks(struct perf_event *event, int idx)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (event->attr.exclude_host)
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
if (event->attr.exclude_guest)
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
if (event_is_checkpointed(event))
__set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
}
which is, afaict, just plain wrong. Should that not be something like:
if (!event->attr.exclude_host)
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
if (!event->attr.exclude_guest)
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
Also, ARM64 seems to also implement this stuff, Mark, do you have any
insight on how all this is 'supposed' to work?
Hi Peter,
On 8/6/2020 5:18 PM, [email protected] wrote:
> On Thu, Aug 06, 2020 at 10:26:29AM +0800, Jin, Yao wrote:
>
>>> +static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
>>> +{
>>> + struct pt_regs *sample_regs = regs;
>>> +
>>> + /* user only */
>>> + if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
>>> + !event->attr.exclude_host || !event->attr.exclude_guest)
>>> + return sample_regs;
>>> +
>>
>> Is this condition correct?
>>
>> Say counting user event on host, exclude_kernel = 1 and exclude_host = 0. It
>> will go "return sample_regs" path.
>
> I'm not sure, I'm terminally confused on virt stuff.
>
> Suppose we have nested virt:
>
> L0-hv
> |
> G0/L1-hv
> |
> G1
>
> And we're running in G0, then:
>
> - 'exclude_hv' would exclude L0 events
> - 'exclude_host' would ... exclude L1-hv events?
I think exclude_host is generally set by the guest (arch/x86/kvm/pmu.c, pmc_reprogram_counter()).
If G0 is a host and we set exclude_host in G0, I think we will not be able to count events on G0.
The appropriate usage is: G1 sets exclude_host, and then the events on G0 will not be collected by
guest G1.
That's my understanding of the usage of exclude_host.
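For reference, pmc_reprogram_counter() creates the guest's counter with
exclude_host set, roughly like this (paraphrased from arch/x86/kvm/pmu.c;
the field list is abbreviated and may differ in detail, and type, config,
exclude_user and exclude_kernel are the function's parameters):

	/* Paraphrased sketch: KVM programs the guest's counter with exclude_host = 1 */
	struct perf_event_attr attr = {
		.type           = type,
		.size           = sizeof(attr),
		.pinned         = true,
		.exclude_idle   = true,
		.exclude_host   = 1,		/* don't count while the host itself runs */
		.exclude_user   = exclude_user,	/* the guest's own user/kernel filtering */
		.exclude_kernel = exclude_kernel,
		.config         = config,
	};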
> - 'exclude_guest' would ... exclude G1 events?
>
Similarly, the appropriate usage is: the host (G0) sets exclude_guest, and then the events on G1
will not be collected by host G0.
If G1 sets exclude_guest, since there is no guest under G1, it's ineffective.
> Then the next question is, if G0 is a host, does the L1-hv run in
> G0 userspace or G0 kernel space?
>
I'm not very sure. Maybe some runs in the kernel and some in userspace (qemu)? Maybe some KVM experts
can help answer this question.
> I was assuming G0 userspace would not include anything L1 (kvm is a
> kernel module after all), but what do I know.
>
I have tested the following condition in a native environment (not in a KVM guest), and the result
is not as expected.
/* user only */
if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
!event->attr.exclude_host || !event->attr.exclude_guest)
return sample_regs;
perf record -e cycles:u ./div
perf report --stdio
# Overhead Command Shared Object Symbol
# ........ ....... ................ .......................
#
49.51% div libc-2.27.so [.] __random_r
33.93% div libc-2.27.so [.] __random
8.13% div libc-2.27.so [.] rand
4.29% div div [.] main
4.14% div div [.] rand@plt
0.00% div [unknown] [k] 0xffffffffbd600cb0
0.00% div [unknown] [k] 0xffffffffbd600df0
0.00% div ld-2.27.so [.] _dl_relocate_object
0.00% div ld-2.27.so [.] _dl_start
0.00% div ld-2.27.so [.] _start
0xffffffffbd600cb0 and 0xffffffffbd600df0 are leaked kernel addresses.
From debug, I can see:
[ 6272.320258] jinyao: sanitize_sample_regs: event->attr.exclude_kernel = 1, event->attr.exclude_hv
= 1, event->attr.exclude_host = 0, event->attr.exclude_guest = 0
So it goes "return sample_regs;" path.
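Plugging those values into the check shows why it returns early (an
illustrative evaluation, not code from the patch):

	/* Observed: exclude_kernel = 1, exclude_hv = 1, exclude_host = 0, exclude_guest = 0 */
	if (!1 /* exclude_kernel */ ||
	    !1 /* exclude_hv */ ||
	    !0 /* exclude_host: true, so the || short-circuits here ... */ ||
	    !0 /* exclude_guest */)
		return sample_regs;	/* ... and the unsanitized regs are returned */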
>>> @@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
>>> if (err)
>>> return err;
>>> - if (!attr.exclude_kernel) {
>>> + if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
>>> + !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
>>> err = perf_allow_kernel(&attr);
>>> if (err)
>>> return err;
>>>
>>
>> I can understand the conditions "!attr.exclude_kernel || !attr.exclude_callchain_kernel".
>>
>> But I'm not very sure about the "!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest".
>
> Well, I'm very sure G0 userspace should never see L0 or G1 state, so
> exclude_hv and exclude_guest had better be true.
>
>> On host, exclude_hv = 1, exclude_guest = 1 and exclude_host = 0, right?
>
> Same as above, is G0 host state G0 userspace?
>
>> So even exclude_kernel = 1 but exclude_host = 0, we will still go
>> perf_allow_kernel path. Please correct me if my understanding is wrong.
>
> Yes, because with those permission checks in place it means you have
> permission to see kernel bits.
>
At the syscall entry, I also added some printk.
Aug 7 03:37:40 kbl-ppc kernel: [ 854.688045] syscall: attr.exclude_kernel = 1,
attr.exclude_callchain_kernel = 0, attr.exclude_hv = 0, attr.exclude_host = 0, attr.exclude_guest = 0
For my test case ("perf record -e cycles:u ./div"), perf_allow_kernel() is also executed.
Thanks
Jin Yao
Hi Peter,
On 8/6/2020 5:24 PM, [email protected] wrote:
> On Thu, Aug 06, 2020 at 11:18:27AM +0200, [email protected] wrote:
>> On Thu, Aug 06, 2020 at 10:26:29AM +0800, Jin, Yao wrote:
>>
>>>> +static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
>>>> +{
>>>> + struct pt_regs *sample_regs = regs;
>>>> +
>>>> + /* user only */
>>>> + if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
>>>> + !event->attr.exclude_host || !event->attr.exclude_guest)
>>>> + return sample_regs;
>>>> +
>>>
>>> Is this condition correct?
>>>
>>> Say counting user event on host, exclude_kernel = 1 and exclude_host = 0. It
>>> will go "return sample_regs" path.
>>
>> I'm not sure, I'm terminally confused on virt stuff.
>
> [A]
>
>> Suppose we have nested virt:
>>
>> L0-hv
>> |
>> G0/L1-hv
>> |
>> G1
>>
>> And we're running in G0, then:
>>
>> - 'exclude_hv' would exclude L0 events
>> - 'exclude_host' would ... exclude L1-hv events?
>> - 'exclude_guest' would ... exclude G1 events?
>
> [B]
>
>> Then the next question is, if G0 is a host, does the L1-hv run in
>> G0 userspace or G0 kernel space?
>>
>> I was assuming G0 userspace would not include anything L1 (kvm is a
>> kernel module after all), but what do I know.
>>
>>>> @@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
>>>> if (err)
>>>> return err;
>>>> - if (!attr.exclude_kernel) {
>>>> + if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
>>>> + !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
>>>> err = perf_allow_kernel(&attr);
>>>> if (err)
>>>> return err;
>>>>
>>>
>>> I can understand the conditions "!attr.exclude_kernel || !attr.exclude_callchain_kernel".
>>>
>>> But I'm not very sure about the "!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest".
>>
>> Well, I'm very sure G0 userspace should never see L0 or G1 state, so
>> exclude_hv and exclude_guest had better be true.
>>
>>> On host, exclude_hv = 1, exclude_guest = 1 and exclude_host = 0, right?
>>
>> Same as above, is G0 host state G0 userspace?
>>
>>> So even exclude_kernel = 1 but exclude_host = 0, we will still go
>>> perf_allow_kernel path. Please correct me if my understanding is wrong.
>>
>> Yes, because with those permission checks in place it means you have
>> permission to see kernel bits.
>
> So if I understand 'exclude_host' wrong -- a distinct possibility -- can
> we then pretty please have the above [A-B] corrected and put in a
> comment near perf_event_attr and the exclude_* comments changed to refer
> to that?
>
In my previous mail, I explained my understanding of 'exclude_host', but I'm not sure if it's correct.
It needs more review comments.
Thanks
Jin Yao
Hi Peter,
On 8/6/2020 7:00 PM, [email protected] wrote:
> On Thu, Aug 06, 2020 at 11:18:27AM +0200, [email protected] wrote:
>
>> Suppose we have nested virt:
>>
>> L0-hv
>> |
>> G0/L1-hv
>> |
>> G1
>>
>> And we're running in G0, then:
>>
>> - 'exclude_hv' would exclude L0 events
>> - 'exclude_host' would ... exclude L1-hv events?
>> - 'exclude_guest' would ... exclude G1 events?
>
> So in arch/x86/events/intel/core.c we have:
>
> static inline void intel_set_masks(struct perf_event *event, int idx)
> {
> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>
> if (event->attr.exclude_host)
> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
> if (event->attr.exclude_guest)
> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
> if (event_is_checkpointed(event))
> __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
> }
>
exclude_host is now set by the guest (pmc_reprogram_counter(), arch/x86/kvm/pmu.c). When enabling the
event, we can check exclude_host to know whether it's a guest event.
Otherwise we may need more flags in event->attr to indicate the status.
> which is, afaict, just plain wrong. Should that not be something like:
>
> if (!event->attr.exclude_host)
> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
> if (!event->attr.exclude_guest)
> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
>
>
How can we know it's guest or host even if exclude_host is set in guest?
Thanks
Jin Yao
> Also, ARM64 seems to also implement this stuff, Mark, do you have any
> insight on how all this is 'supposed' to work?
>
On Fri, Aug 07, 2020 at 02:24:30PM +0800, Jin, Yao wrote:
> Hi Peter,
>
> On 8/6/2020 7:00 PM, [email protected] wrote:
> > On Thu, Aug 06, 2020 at 11:18:27AM +0200, [email protected] wrote:
> >
> > > Suppose we have nested virt:
> > >
> > > L0-hv
> > > |
> > > G0/L1-hv
> > > |
> > > G1
> > >
> > > And we're running in G0, then:
> > >
> > > - 'exclude_hv' would exclude L0 events
> > > - 'exclude_host' would ... exclude L1-hv events?
> > > - 'exclude_guest' would ... exclude G1 events?
> >
> > So in arch/x86/events/intel/core.c we have:
> >
> > static inline void intel_set_masks(struct perf_event *event, int idx)
> > {
> > struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> >
> > if (event->attr.exclude_host)
> > __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
> > if (event->attr.exclude_guest)
> > __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
> > if (event_is_checkpointed(event))
> > __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
> > }
> >
>
> exclude_host is now set by guest (pmc_reprogram_counter,
> arch/x86/kvm/pmu.c). When enabling the event, we can check exclude_host to
> know if it's a guest.
>
> Otherwise we may need more flags in event->attr to indicate the status.
>
> > which is, afaict, just plain wrong. Should that not be something like:
> >
> > if (!event->attr.exclude_host)
> > __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
> > if (!event->attr.exclude_guest)
> > __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
> >
> >
>
> How can we know it's guest or host even if exclude_host is set in guest?
I'm not following you, consider:
xh xg h g h' g'
0 0 0 0 1 1
0 1 1 0 1 0
1 0 0 1 0 1
1 1 1 1 0 0
So the 0,0 and 1,1 cases get flipped. I have a suspicion, but this
_really_ should have fat comments all over :-(
What a sodding trainwreck..
Hi Peter,
On 8/7/2020 5:02 PM, [email protected] wrote:
> On Fri, Aug 07, 2020 at 02:24:30PM +0800, Jin, Yao wrote:
>> Hi Peter,
>>
>> On 8/6/2020 7:00 PM, [email protected] wrote:
>>> On Thu, Aug 06, 2020 at 11:18:27AM +0200, [email protected] wrote:
>>>
>>>> Suppose we have nested virt:
>>>>
>>>> L0-hv
>>>> |
>>>> G0/L1-hv
>>>> |
>>>> G1
>>>>
>>>> And we're running in G0, then:
>>>>
>>>> - 'exclude_hv' would exclude L0 events
>>>> - 'exclude_host' would ... exclude L1-hv events?
>>>> - 'exclude_guest' would ... exclude G1 events?
>>>
>>> So in arch/x86/events/intel/core.c we have:
>>>
>>> static inline void intel_set_masks(struct perf_event *event, int idx)
>>> {
>>> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>>>
>>> if (event->attr.exclude_host)
>>> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
>>> if (event->attr.exclude_guest)
>>> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
>>> if (event_is_checkpointed(event))
>>> __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
>>> }
>>>
>>
>> exclude_host is now set by guest (pmc_reprogram_counter,
>> arch/x86/kvm/pmu.c). When enabling the event, we can check exclude_host to
>> know if it's a guest.
>>
>> Otherwise we may need more flags in event->attr to indicate the status.
>>
>>> which is, afaict, just plain wrong. Should that not be something like:
>>>
>>> if (!event->attr.exclude_host)
>>> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
>>> if (!event->attr.exclude_guest)
>>> __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
>>>
>>>
>>
>> How can we know it's guest or host even if exclude_host is set in guest?
>
> I'm not following you, consider:
>
> xh xg h g h' g'
> 0 0 0 0 1 1
> 0 1 1 0 1 0
> 1 0 0 1 0 1
> 1 1 1 1 0 0
>
>
Thanks for the table! It clearly shows the combinations of different conditions.
My understanding is:
xh = exclude_host
xg = exclude_guest
h = intel_ctrl_host_mask (before)
g = intel_ctrl_guest_mask (before)
h' = intel_ctrl_host_mask (after)
g' = intel_ctrl_guest_mask (after)
For guest, exclude_host = 1 and exclude_guest = 0
xh xg h g h' g'
1 0 0 1 0 1
before/after values are not changed.
For host, exclude_host = 0 and exclude_guest = 1
xh xg h g h' g'
0 1 1 0 1 0
before/after values are not changed.
> So the 0,0 and 1,1 cases get flipped. I have a suspicion, but this
> _really_ should have fat comments all over :-(
>
I'm not very sure about other cases.
xh xg h g h' g'
0 0 0 0 1 1
1 1 1 1 0 0
The before/after values are just reversed. I don't know if there will be any negative impact.
Maybe we need more review here.
> What a sodding trainwreck..
>
:(
Thanks
Jin Yao
Hi Peter,
On 8/6/2020 10:26 AM, Jin, Yao wrote:
> Hi Peter,
>
> On 8/5/2020 8:44 PM, [email protected] wrote:
>> On Wed, Aug 05, 2020 at 10:15:26AM +0800, Jin, Yao wrote:
>>> Hi Peter,
>>>
>>> On 8/4/2020 7:49 PM, [email protected] wrote:
>>>> On Fri, Jul 31, 2020 at 10:56:17AM +0800, Jin Yao wrote:
>>>>> @@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>>>>> struct perf_callchain_entry *
>>>>> perf_callchain(struct perf_event *event, struct pt_regs *regs)
>>>>> {
>>>>> - bool kernel = !event->attr.exclude_callchain_kernel;
>>>>> + bool kernel = !event->attr.exclude_callchain_kernel &&
>>>>> + !event->attr.exclude_kernel;
>>>>
>>>> This seems weird; how can we get there. Also it seems to me that if you
>>>> have !exclude_callchain_kernel you already have permission for kernel
>>>> bits, so who cares.
>>>>
>>>
>>> In perf tool, exclude_callchain_kernel is set to 1 when perf-record only
>>> collects the user callchains and exclude_kernel is set to 1 when events are
>>> configured to run in user space.
>>>
>>> So if an event is configured to run in user space, that should make sense we
>>> don't need it's kernel callchains.
>>>
>>> But it seems to me there is no code logic in perf tool which can make sure
>>> !exclude_callchain_kernel -> !exclude_kernel.
>>>
>>> Jiri, Arnaldo, is my understanding correct?
>>
>> What the perf tool does or does not do is irrelevant. It is a valid,
>> (albeit slightly silly) configuration to have:
>>
>> exclude_kernel && !exclude_callchain_kernel
>>
>> You're now saying that when you configure things like this you're not
>> allowed kernel IPs, that's wrong I think.
>>
>> Also, !exclude_callchain_kernel should require privilege, which needs
>> fixing, see below.
>>
>
> I see you add '!exclude_callchain_kernel' check before perf_allow_kernel() at syscall entry in below
> code.
>
> So if we allow callchain_kernel collection that means we allow kernel by default. That makes sense,
> thanks!
>
>>> So the new code looks like:
>>>
>>> if (event->attr.exclude_kernel && !user_mode(regs)) {
>>> if (!(current->flags & PF_KTHREAD)) {
>>> regs_fake = task_pt_regs(current);
>>> if (!regs_fake)
>>> instruction_pointer_set(regs, -1L);
>>> } else {
>>> instruction_pointer_set(regs, -1L);
>>> }
>>
>> Again:
>>
>> if (!(current->flags & PF_KTHREAD))
>> regs_fake = task_pt_regs(current);
>>
>> if (!regs_fake)
>> instruction_pointer_set(regs, -1L);
>>
>> Is much simpler and more readable.
>>
>
> Yes, agree. Your code is much simpler and better.
>
>>>>> + if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
>>>>> + PERF_RECORD_MISC_KERNEL) {
>>>>> + header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
>>>>> + header->misc |= PERF_RECORD_MISC_USER;
>>>>> + }
>>>>
>>>> Why the conditional? At this point it had better be unconditionally
>>>> user, no?
>>>>
>>>> headers->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
>>>> headers->misc |= PERF_RECORD_MISC_USER;
>>>>
>>>
>>> #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
>>> #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
>>> #define PERF_RECORD_MISC_KERNEL (1 << 0)
>>> #define PERF_RECORD_MISC_USER (2 << 0)
>>> #define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
>>> #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
>>> #define PERF_RECORD_MISC_GUEST_USER (5 << 0)
>>>
>>> If we unconditionally set user, it will reset for hypervisor, guest
>>> kernel and guest_user.
>>
>> At the same time :u had better not get any of those either. Which seems
>> to suggest we're going about this wrong.
>>
>> Also, if we call this before perf_misc_flags() we don't need to fix it
>> up.
>>
>> How's this?
>>
>> ---
>> kernel/events/core.c | 38 +++++++++++++++++++++++++++++++++-----
>> 1 file changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 7c436d705fbd..3e4e328b521a 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6988,23 +6988,49 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
>> return callchain ?: &__empty_callchain;
>> }
>> +/*
>> + * Due to interrupt latency (skid), we may enter the kernel before taking the
>> + * PMI, even if the PMU is configured to only count user events. To avoid
>> + * leaking kernel addresses, use task_pt_regs(), when available.
>> + */
>> +static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
>> +{
>> + struct pt_regs *sample_regs = regs;
>> +
>> + /* user only */
>> + if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
>> + !event->attr.exclude_host || !event->attr.exclude_guest)
>> + return sample_regs;
>> +
>
> Is this condition correct?
>
> Say we count a user-only event on the host: exclude_kernel = 1 and exclude_host = 0, so it will take
> the "return sample_regs" path.
>
>> + if (sample_regs(regs))
>> + return sample_regs;
>> +
>
> In your other mail, you said it should be:
>
> if (user_regs(regs))
> return sample_regs;
>
>> + if (!(current->flags & PF_KTHREAD)) {
>
> No '{', as you mentioned in another mail.
>
>> + sample_regs = task_pt_regs(current);
>> + else
>> + instruction_pointer_set(regs, -1L);
>> +
>> + return sample_regs;
>> +}
>> +
>> void perf_prepare_sample(struct perf_event_header *header,
>> struct perf_sample_data *data,
>> struct perf_event *event,
>> struct pt_regs *regs)
>> {
>> + struct pt_regs *sample_regs = sanitize_sample_regs(event, regs);
>> u64 sample_type = event->attr.sample_type;
>> header->type = PERF_RECORD_SAMPLE;
>> header->size = sizeof(*header) + event->header_size;
>> header->misc = 0;
>> - header->misc |= perf_misc_flags(regs);
>> + header->misc |= perf_misc_flags(sample_regs);
>> __perf_event_header__init_id(header, data, event);
>> if (sample_type & PERF_SAMPLE_IP)
>> - data->ip = perf_instruction_pointer(regs);
>> + data->ip = perf_instruction_pointer(sample_regs);
>> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
>> int size = 1;
>> @@ -7054,9 +7080,10 @@ void perf_prepare_sample(struct perf_event_header *header,
>> header->size += size;
>> }
>> - if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
>> + if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) {
>> perf_sample_regs_user(&data->regs_user, regs,
>> &data->regs_user_copy);
>> + }
>> if (sample_type & PERF_SAMPLE_REGS_USER) {
>> /* regs dump ABI info */
>> @@ -7099,7 +7126,7 @@ void perf_prepare_sample(struct perf_event_header *header,
>> /* regs dump ABI info */
>> int size = sizeof(u64);
>> - perf_sample_regs_intr(&data->regs_intr, regs);
>> + perf_sample_regs_intr(&data->regs_intr, sample_regs);
>> if (data->regs_intr.regs) {
>> u64 mask = event->attr.sample_regs_intr;
>> @@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
>> if (err)
>> return err;
>> - if (!attr.exclude_kernel) {
>> + if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
>> + !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
>> err = perf_allow_kernel(&attr);
>> if (err)
>> return err;
>>
>
> I can understand the conditions "!attr.exclude_kernel || !attr.exclude_callchain_kernel".
>
> But I'm not very sure about the "!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest".
>
> On host, exclude_hv = 1, exclude_guest = 1 and exclude_host = 0, right?
>
> So even if exclude_kernel = 1, as long as exclude_host = 0 we will still take the perf_allow_kernel()
> path. Please correct me if my understanding is wrong.
>
> Thanks
> Jin Yao
Could I post v2 which basically refers to your patch but removes some conditions since I see some
issues in test if we use these conditions.
1. Remove '!event->attr.exclude_hv || !event->attr.exclude_host ||
!event->attr.exclude_guest' at the entry of sanitize_sample_regs().
2. Remove '!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest'
at the perf_event_open syscall entry.
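With those two conditions removed, a rough sketch of the v2 helper would be (based
on your patch above; I use user_mode() here, which is my reading of the user_regs()
check you mentioned in the other mail):

static struct pt_regs *sanitize_sample_regs(struct perf_event *event,
					    struct pt_regs *regs)
{
	struct pt_regs *sample_regs = regs;

	/* only sanitize samples of user-only events */
	if (!event->attr.exclude_kernel)
		return sample_regs;

	/* nothing can leak if we already interrupted user mode */
	if (user_mode(regs))
		return sample_regs;

	if (!(current->flags & PF_KTHREAD))
		sample_regs = task_pt_regs(current);
	else
		instruction_pointer_set(regs, -1L);

	return sample_regs;
}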
Thanks
Jin Yao
On Tue, Aug 11, 2020 at 03:50:43PM +0800, Jin, Yao wrote:
> Could I post v2 which basically refers to your patch but removes some
> conditions since I see some issues in test if we use these conditions.
>
> 1. Remove '!event->attr.exclude_hv || !event->attr.exclude_host ||
> !event->attr.exclude_guest' at the entry of sanitize_sample_regs().
>
> 2. Remove '!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest'
> at the perf_event_open syscall entry.
exclude_host, maybe -- due to the dodgy semantics of it, but the others
should definitely be there.
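IOW, something like this at the perf_event_open() entry (a sketch, not tested;
exclude_host dropped, the rest kept):

	if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
	    !attr.exclude_hv || !attr.exclude_guest) {
		err = perf_allow_kernel(&attr);
		if (err)
			return err;
	}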
Hi Peter,
On 8/11/2020 3:59 PM, Peter Zijlstra wrote:
> On Tue, Aug 11, 2020 at 03:50:43PM +0800, Jin, Yao wrote:
>> Could I post v2 which basically refers to your patch but removes some
>> conditions since I see some issues in test if we use these conditions.
>>
>> 1. Remove '!event->attr.exclude_hv || !event->attr.exclude_host ||
>> !event->attr.exclude_guest' at the entry of sanitize_sample_regs().
>>
>> 2. Remove '!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest'
>> at the perf_event_open syscall entry.
>
> exclude_host, maybe -- due to the dodgy semantics of it, but the others
> should definitely be there.
>
exclude_guest and exclude_hv are tricky too.
If we do 'perf record -e cycles:u' in both host and guest, we can see:
event->attr.exclude_guest = 0
thus sanitize_sample_regs() returns regs directly even if exclude_kernel = 1.
And in guest, exclude_hv = 0, it's out of my expectation too.
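To be concrete, with the entry check from your patch and the attr bits we get today
for 'perf record -e cycles:u' on the host, the early return is taken even though the
event is user-only, roughly:

	/* host: exclude_kernel = 1, exclude_hv = 1, exclude_host = 0, exclude_guest = 0 */
	if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
	    !event->attr.exclude_host || !event->attr.exclude_guest)
		return sample_regs;	/* taken: exclude_host == 0 and exclude_guest == 0 */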
Thanks
Jin Yao
On Tue, Aug 11, 2020 at 04:31:10PM +0800, Jin, Yao wrote:
> Hi Peter,
>
> On 8/11/2020 3:59 PM, Peter Zijlstra wrote:
> > On Tue, Aug 11, 2020 at 03:50:43PM +0800, Jin, Yao wrote:
> > > Could I post v2 which basically refers to your patch but removes some
> > > conditions since I see some issues in test if we use these conditions.
> > >
> > > 1. Remove '!event->attr.exclude_hv || !event->attr.exclude_host ||
> > > !event->attr.exclude_guest' at the entry of sanitize_sample_regs().
> > >
> > > 2. Remove '!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest'
> > > at the perf_event_open syscall entry.
> >
> > exclude_host, maybe -- due to the dodgy semantics of it, but the others
> > should definitely be there.
> >
>
> exclude_guest and exclude_hv are tricky too.
>
> If we do 'perf record -e cycles:u' in both host and guest, we can see:
>
> event->attr.exclude_guest = 0
>
> thus sanitize_sample_regs() returns regs directly even if exclude_kernel = 1.
>
> And in guest, exclude_hv = 0, it's out of my expectation too.
I'm confused, how can 'perf record -e cycles:u' _ever_ have
exclude_guest=0, exclude_hv=0 ? That simply makes no sense and is utterly
broken.
You explicitly ask for userspace-only, reporting hypervisor or guest
events is a straight up bug.
Hi Peter,
On 8/11/2020 4:45 PM, Peter Zijlstra wrote:
> On Tue, Aug 11, 2020 at 04:31:10PM +0800, Jin, Yao wrote:
>> Hi Peter,
>>
>> On 8/11/2020 3:59 PM, Peter Zijlstra wrote:
>>> On Tue, Aug 11, 2020 at 03:50:43PM +0800, Jin, Yao wrote:
>>>> Could I post v2 which basically refers to your patch but removes some
>>>> conditions since I see some issues in test if we use these conditions.
>>>>
>>>> 1. Remove '!event->attr.exclude_hv || !event->attr.exclude_host ||
>>>> !event->attr.exclude_guest' at the entry of sanitize_sample_regs().
>>>>
>>>> 2. Remove '!attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest'
>>>> at the perf_event_open syscall entry.
>>>
>>> exclude_host, maybe -- due to the dodgy semantics of it, but the others
>>> should definitely be there.
>>>
>>
>> exclude_guest and exclude_hv are tricky too.
>>
>> If we do 'perf record -e cycles:u' in both host and guest, we can see:
>>
>> event->attr.exclude_guest = 0
>>
>> thus sanitize_sample_regs() returns regs directly even if exclude_kernel = 1.
>>
>> And in guest, exclude_hv = 0, it's out of my expectation too.
>
> I'm confused, how can 'perf record -e cycles:u' _ever_ have
> exclude_guest=0, exclude_hv=0 ? That simply makes no sense and is utterly
> broken.
>
> You explicitly ask for userspace-only, reporting hypervisor or guest
> events is a straight up bug.
>
If we run 'perf record -e cycles:u',
1. On host, exclude_guest = 0 and exclude_hv = 1
perf tool doesn't specially set 'exclude_guest' when it parses the 'u' modifier. I agree that can be
improved. I will post a perf tool patch to fix that.
2. On guest, exclude_guest = 0 and exclude_hv = 0.
For exclude_hv = 0, it looks like a bug but x86 doesn't use exclude_hv. But yes, we should fix that.
CC Like Xu <[email protected]>.
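For the perf tool change mentioned in 1., the idea is simply to set the missing bit
where the 'u' modifier is handled (where exactly that lands in tools/perf is something
I still need to check, so take this as a sketch):

	/* 'u' modifier: event is user space only */
	attr->exclude_kernel = 1;
	attr->exclude_hv     = 1;
	attr->exclude_guest  = 1;	/* new: don't count guest samples either */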
Thanks
Jin Yao
On 2020/8/12 11:52, Jin, Yao wrote:
> Hi Peter,
>
> On 8/11/2020 4:45 PM, Peter Zijlstra wrote:
>> On Tue, Aug 11, 2020 at 04:31:10PM +0800, Jin, Yao wrote:
>>> Hi Peter,
>>>
>>> On 8/11/2020 3:59 PM, Peter Zijlstra wrote:
>>>> On Tue, Aug 11, 2020 at 03:50:43PM +0800, Jin, Yao wrote:
>>>>> Could I post v2 which basically refers to your patch but removes some
>>>>> conditions since I see some issues in test if we use these conditions.
>>>>>
>>>>> 1. Remove '!event->attr.exclude_hv || !event->attr.exclude_host ||
>>>>> !event->attr.exclude_guest' at the entry of sanitize_sample_regs().
>>>>>
>>>>> 2. Remove '!attr.exclude_hv || !attr.exclude_host ||
>>>>> !attr.exclude_guest'
>>>>> at the perf_event_open syscall entry.
>>>>
>>>> exclude_host, maybe -- due to the dodgy semantics of it, but the others
>>>> should definitely be there.
>>>>
>>>
>>> exclude_guest and exclude_hv are tricky too.
>>>
>>> If we do 'perf record -e cycles:u' in both host and guest, we can see:
>>>
>>> event->attr.exclude_guest = 0
>>>
>>> thus sanitize_sample_regs() returns regs directly even if exclude_kernel
>>> = 1.
>>>
>>> And in guest, exclude_hv = 0, it's out of my expectation too.
>>
>> I'm confused, how can 'perf record -e cycles:u' _ever_ have
>> exclude_guest=0, exclude_hv=0 ? That simply makes no sense and is utterly
>> broken.
>>
>> You explicitly ask for userspace-only, reporting hypervisor or guest
>> events is a straight up bug.
Guest events = guest user-space events + guest kernel-space events.
Some perf users on the host may only want to count guest user space events.
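For example, such a user might configure something like this (illustrative only; how
exclude_kernel interacts with guest mode depends on the PMU and how the hypervisor
virtualizes it):

	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.exclude_host	= 1,	/* count only while the guest is running */
		.exclude_guest	= 0,
		.exclude_kernel	= 1,	/* and only guest user space */
	};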
>>
>
> If we run 'perf record -e cycles:u',
>
> 1. On host, exclude_guest = 0 and exclude_hv = 1
>
> perf tool doesn't specially set 'exclude_guest' when it parses the 'u'
> modifier. I agree that can be improved. I will post a perf tool patch to
> fix that.
>
> 2. On guest, exclude_guest = 0 and exclude_hv = 0.
>
> For exclude_hv = 0, it looks like a bug but x86 doesn't use exclude_hv. But
> yes, we should fix that.
>
> CC Like Xu <[email protected]>.
>
> Thanks
> Jin Yao