On workloads that do a lot of kernel entry/exits we see kernel
samples, even though :u is specified. This is due to skid existing.
This might be a security issue because it can leak kernel addresses even
though kernel sampling support is disabled.
One patch "perf/core: Drop kernel samples even though :u is specified"
was posted in last year but it was reverted because it introduced a
regression issue that broke the rr-project.
Now this patch set uses sysctl to control the dropping of leaked
kernel samples.
/sys/devices/cpu/perf_allow_sample_leakage:
0 - default, drop the leaked kernel samples.
1 - don't drop the leaked kernel samples.
For rr it can write 1 to /sys/devices/cpu/perf_allow_sample_leakage to
keep original system behavior.
Jin Yao (2):
perf/core: Use sysctl to turn on/off dropping leaked kernel samples
perf Documentation: Introduce the sysctl perf_allow_sample_leakage
kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++
tools/perf/Documentation/perf-record.txt | 14 ++++++++
2 files changed, 72 insertions(+)
--
2.7.4
When doing sampling, for example:
perf record -e cycles:u ...
On workloads that do a lot of kernel entry/exits we see kernel
samples, even though :u is specified. This is due to skid existing.
This might be a security issue because it can leak kernel addresses even
though kernel sampling support is disabled.
One patch "perf/core: Drop kernel samples even though :u is specified"
was posted in last year but it was reverted because it introduced a
regression issue that broke the rr-project, which used sampling
events to receive a signal on overflow. These signals were critical
to the correct operation of rr.
See '6a8a75f32357 ("Revert "perf/core: Drop kernel samples even
though :u is specified"")' for detail.
Now the idea is to use sysctl to control the dropping of leaked
kernel samples.
/sys/devices/cpu/perf_allow_sample_leakage:
0 - default, drop the leaked kernel samples.
1 - don't drop the leaked kernel samples.
For rr it can write 1 to /sys/devices/cpu/perf_allow_sample_leakage.
For example,
root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
0
root@skl:/tmp# perf record -e cycles:u ./div
root@skl:/tmp# perf report --stdio
........ ....... ............. ................
47.01% div div [.] main
20.74% div libc-2.23.so [.] __random_r
15.59% div libc-2.23.so [.] __random
8.68% div div [.] compute_flag
4.48% div libc-2.23.so [.] rand
3.50% div div [.] rand@plt
0.00% div ld-2.23.so [.] do_lookup_x
0.00% div ld-2.23.so [.] memcmp
0.00% div ld-2.23.so [.] _dl_start
0.00% div ld-2.23.so [.] _start
There is no kernel symbol reported.
root@skl:/tmp# echo 1 > /sys/devices/cpu/perf_allow_sample_leakage
root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
1
root@skl:/tmp# perf record -e cycles:u ./div
root@skl:/tmp# perf report --stdio
........ ....... ................ .............
47.53% div div [.] main
20.62% div libc-2.23.so [.] __random_r
15.32% div libc-2.23.so [.] __random
8.66% div div [.] compute_flag
4.53% div libc-2.23.so [.] rand
3.34% div div [.] rand@plt
0.00% div [kernel.vmlinux] [k] apic_timer_interrupt
0.00% div libc-2.23.so [.] intel_check_word
0.00% div ld-2.23.so [.] brk
0.00% div [kernel.vmlinux] [k] page_fault
0.00% div ld-2.23.so [.] _start
We can see the kernel symbols apic_timer_interrupt and page_fault.
Signed-off-by: Jin Yao <[email protected]>
---
kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80cca2b..7867541 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7721,6 +7721,28 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static int perf_allow_sample_leakage __read_mostly;
+
+static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
+
+ if (allow_leakage)
+ return true;
+
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ * To avoid leaking information to userspace, we must always
+ * reject kernel samples when exclude_kernel is set.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -7742,6 +7764,12 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
/*
+ * For security, drop the skid kernel samples if necessary.
+ */
+ if (!sample_is_allowed(event, regs))
+ return ret;
+
+ /*
* XXX event_limit might not quite work as expected on inherited
* events
*/
@@ -9500,9 +9528,39 @@ perf_event_mux_interval_ms_store(struct device *dev,
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static ssize_t
+perf_allow_sample_leakage_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", allow_leakage);
+}
+
+static ssize_t
+perf_allow_sample_leakage_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int allow_leakage, ret;
+
+ ret = kstrtoint(buf, 0, &allow_leakage);
+ if (ret)
+ return ret;
+
+ if (allow_leakage != 0 && allow_leakage != 1)
+ return -EINVAL;
+
+ WRITE_ONCE(perf_allow_sample_leakage, allow_leakage);
+
+ return count;
+}
+static DEVICE_ATTR_RW(perf_allow_sample_leakage);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
+ &dev_attr_perf_allow_sample_leakage.attr,
NULL,
};
ATTRIBUTE_GROUPS(pmu_dev);
--
2.7.4
Introduce a new sysctl /sys/devices/cpu/perf_allow_sample_leakage, which
turns on/off dropping leaked kernel samples.
Signed-off-by: Jin Yao <[email protected]>
---
tools/perf/Documentation/perf-record.txt | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 04168da..97fb0f8 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -93,6 +93,20 @@ OPTIONS
prevent the shell interpretation. You also need to use --group on
"perf report" to view group events together.
+ Note that if workload does a lot of kernel entry/exit we may see
+ kernel samples even if :u is specified. That is due to skid existing.
+ This might be a security issue because it can leak kernel address even
+ though kernel sampling support is disabled. We have a sysctl to turn
+ on/off the dropping of leaked kernel samples.
+
+ /sys/devices/cpu/perf_allow_sample_leakage
+
+ 0 - drop the leaked kernel samples, default option.
+ 1 - don't drop the leaked kernel samples.
+
+ For example, write 1 to perf_allow_sample_leakage
+ echo 1 > /sys/devices/cpu/perf_allow_sample_leakage
+
--filter=<filter>::
Event filter. This option should follow a event selector (-e) which
selects either tracepoint event(s) or a hardware trace PMU
--
2.7.4
I strongly object to this patch as written. As I said when I
originally reported[0] the regression introduced by the previous
version of this patch a year ago.
"It seems like this change should, at a bare minimum, be limited to
counters that actually perform sampling of register state when the
interrupt fires. In our case, with the retired conditional branches
counter restricted to counting userspace events only, it makes no
difference that the PMU interrupt happened to be delivered in the
kernel."
This means identifying which values of `perf_event_attr::sample_type`
are security concerns (presumably PERF_SAMPLE_IP is, and
PERF_SAMPLE_TIME is not, and someone needs to go through and decide on
all of them) and filtering on those values for this new behavior.
And because rr sets its sample_type to 0, once you do that, the sysctl
will not be necessary.
- Kyle
On Fri, Jun 15, 2018 at 3:03 AM, Jin Yao <[email protected]> wrote:
> On workloads that do a lot of kernel entry/exits we see kernel
> samples, even though :u is specified. This is due to skid existing.
>
> This might be a security issue because it can leak kernel addresses even
> though kernel sampling support is disabled.
>
> One patch "perf/core: Drop kernel samples even though :u is specified"
> was posted in last year but it was reverted because it introduced a
> regression issue that broke the rr-project.
>
> Now this patch set uses sysctl to control the dropping of leaked
> kernel samples.
>
> /sys/devices/cpu/perf_allow_sample_leakage:
>
> 0 - default, drop the leaked kernel samples.
> 1 - don't drop the leaked kernel samples.
>
> For rr it can write 1 to /sys/devices/cpu/perf_allow_sample_leakage to
> keep original system behavior.
>
> Jin Yao (2):
> perf/core: Use sysctl to turn on/off dropping leaked kernel samples
> perf Documentation: Introduce the sysctl perf_allow_sample_leakage
>
> kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++
> tools/perf/Documentation/perf-record.txt | 14 ++++++++
> 2 files changed, 72 insertions(+)
>
> --
> 2.7.4
>
[0] https://lkml.org/lkml/2017/6/27/1159
On 6/15/2018 11:35 AM, Kyle Huey wrote:
> I strongly object to this patch as written. As I said when I
> originally reported[0] the regression introduced by the previous
> version of this patch a year ago.
>
> "It seems like this change should, at a bare minimum, be limited to
> counters that actually perform sampling of register state when the
> interrupt fires. In our case, with the retired conditional branches
> counter restricted to counting userspace events only, it makes no
> difference that the PMU interrupt happened to be delivered in the
> kernel."
>
> This means identifying which values of `perf_event_attr::sample_type`
> are security concerns (presumably PERF_SAMPLE_IP is, and
> PERF_SAMPLE_TIME is not, and someone needs to go through and decide on
> all of them) and filtering on those values for this new behavior.
>
> And because rr sets its sample_type to 0, once you do that, the sysctl
> will not be necessary.
>
> - Kyle
>
Since rr sets sample_type to 0, the easiest way is to add checking like:
if (event->attr.sample_type) {
if (event->attr.exclude_kernel && !user_mode(regs))
return false;
}
So the rr doesn't need to be changed and for other use cases the leaked
kernel samples will be dropped.
But I don't like this is because:
1. It's too specific for rr case.
2. If we create a new sample_type, e.g. PERF_SAMPLE_ALLOW_LEAKAGE, the
code will be:
if !(event->attr.sample_type & PERF_SAMPLE_ALLOW_LEAKAGE) {
if (event->attr.exclude_kernel && !user_mode(regs))
return false;
}
But rr needs to add PERF_SAMPLE_ALLOW_LEAKAGE to sample_type since by
default the bit is not set.
3. Sysctl is a more flexible way. It provides us with an option when we
want to see if skid is existing, we can use sysctl to turn on that.
Thanks
Jin Yao
On Thu, Jun 14, 2018 at 7:10 PM Jin Yao <[email protected]> wrote:
>
> When doing sampling, for example:
>
> perf record -e cycles:u ...
>
> On workloads that do a lot of kernel entry/exits we see kernel
> samples, even though :u is specified. This is due to skid existing.
>
> This might be a security issue because it can leak kernel addresses even
> though kernel sampling support is disabled.
>
> One patch "perf/core: Drop kernel samples even though :u is specified"
> was posted in last year but it was reverted because it introduced a
> regression issue that broke the rr-project, which used sampling
> events to receive a signal on overflow. These signals were critical
> to the correct operation of rr.
>
> See '6a8a75f32357 ("Revert "perf/core: Drop kernel samples even
> though :u is specified"")' for detail.
>
> Now the idea is to use sysctl to control the dropping of leaked
> kernel samples.
>
> /sys/devices/cpu/perf_allow_sample_leakage:
>
> 0 - default, drop the leaked kernel samples.
> 1 - don't drop the leaked kernel samples.
>
> For rr it can write 1 to /sys/devices/cpu/perf_allow_sample_leakage.
>
> For example,
>
> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
> 0
> root@skl:/tmp# perf record -e cycles:u ./div
> root@skl:/tmp# perf report --stdio
>
> ........ ....... ............. ................
>
> 47.01% div div [.] main
> 20.74% div libc-2.23.so [.] __random_r
> 15.59% div libc-2.23.so [.] __random
> 8.68% div div [.] compute_flag
> 4.48% div libc-2.23.so [.] rand
> 3.50% div div [.] rand@plt
> 0.00% div ld-2.23.so [.] do_lookup_x
> 0.00% div ld-2.23.so [.] memcmp
> 0.00% div ld-2.23.so [.] _dl_start
> 0.00% div ld-2.23.so [.] _start
>
> There is no kernel symbol reported.
>
> root@skl:/tmp# echo 1 > /sys/devices/cpu/perf_allow_sample_leakage
> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
> 1
> root@skl:/tmp# perf record -e cycles:u ./div
> root@skl:/tmp# perf report --stdio
>
> ........ ....... ................ .............
>
> 47.53% div div [.] main
> 20.62% div libc-2.23.so [.] __random_r
> 15.32% div libc-2.23.so [.] __random
> 8.66% div div [.] compute_flag
> 4.53% div libc-2.23.so [.] rand
> 3.34% div div [.] rand@plt
> 0.00% div [kernel.vmlinux] [k] apic_timer_interrupt
> 0.00% div libc-2.23.so [.] intel_check_word
> 0.00% div ld-2.23.so [.] brk
> 0.00% div [kernel.vmlinux] [k] page_fault
> 0.00% div ld-2.23.so [.] _start
>
> We can see the kernel symbols apic_timer_interrupt and page_fault.
>
These kernel symbols do not match your description here. How much skid
do you think you have here?
You're saying you are at the user level, you get a counter overflow,
and the interrupted IP lands in the kernel
because you where there by the time the interrupt is delivered. How
many instructions does it take to get
from user land to apic_timer_interrupt() or page_fault()? These
functions are not right at the kernel entry,
I believe. So how did you get there, the skid must have been VERY big
or symbolization has a problem.
> Signed-off-by: Jin Yao <[email protected]>
> ---
> kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 58 insertions(+)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 80cca2b..7867541 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7721,6 +7721,28 @@ int perf_event_account_interrupt(struct perf_event *event)
> return __perf_event_account_interrupt(event, 1);
> }
>
> +static int perf_allow_sample_leakage __read_mostly;
> +
> +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
> +{
> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
> +
> + if (allow_leakage)
> + return true;
> +
> + /*
> + * Due to interrupt latency (AKA "skid"), we may enter the
> + * kernel before taking an overflow, even if the PMU is only
> + * counting user events.
> + * To avoid leaking information to userspace, we must always
> + * reject kernel samples when exclude_kernel is set.
> + */
> + if (event->attr.exclude_kernel && !user_mode(regs))
> + return false;
> +
> + return true;
> +}
> +
> /*
> * Generic event overflow handling, sampling.
> */
> @@ -7742,6 +7764,12 @@ static int __perf_event_overflow(struct perf_event *event,
> ret = __perf_event_account_interrupt(event, throttle);
>
> /*
> + * For security, drop the skid kernel samples if necessary.
> + */
> + if (!sample_is_allowed(event, regs))
> + return ret;
> +
> + /*
> * XXX event_limit might not quite work as expected on inherited
> * events
> */
> @@ -9500,9 +9528,39 @@ perf_event_mux_interval_ms_store(struct device *dev,
> }
> static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
>
> +static ssize_t
> +perf_allow_sample_leakage_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
> +
> + return snprintf(page, PAGE_SIZE-1, "%d\n", allow_leakage);
> +}
> +
> +static ssize_t
> +perf_allow_sample_leakage_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + int allow_leakage, ret;
> +
> + ret = kstrtoint(buf, 0, &allow_leakage);
> + if (ret)
> + return ret;
> +
> + if (allow_leakage != 0 && allow_leakage != 1)
> + return -EINVAL;
> +
> + WRITE_ONCE(perf_allow_sample_leakage, allow_leakage);
> +
> + return count;
> +}
> +static DEVICE_ATTR_RW(perf_allow_sample_leakage);
> +
> static struct attribute *pmu_dev_attrs[] = {
> &dev_attr_type.attr,
> &dev_attr_perf_event_mux_interval_ms.attr,
> + &dev_attr_perf_allow_sample_leakage.attr,
> NULL,
> };
> ATTRIBUTE_GROUPS(pmu_dev);
> --
> 2.7.4
>
On Thu, Jun 14, 2018 at 7:10 PM Jin Yao <[email protected]> wrote:
>
> When doing sampling, for example:
>
> perf record -e cycles:u ...
>
> On workloads that do a lot of kernel entry/exits we see kernel
> samples, even though :u is specified. This is due to skid existing.
>
> This might be a security issue because it can leak kernel addresses even
> though kernel sampling support is disabled.
>
> One patch "perf/core: Drop kernel samples even though :u is specified"
> was posted in last year but it was reverted because it introduced a
> regression issue that broke the rr-project, which used sampling
> events to receive a signal on overflow. These signals were critical
> to the correct operation of rr.
>
> See '6a8a75f32357 ("Revert "perf/core: Drop kernel samples even
> though :u is specified"")' for detail.
>
> Now the idea is to use sysctl to control the dropping of leaked
> kernel samples.
>
> /sys/devices/cpu/perf_allow_sample_leakage:
>
> 0 - default, drop the leaked kernel samples.
> 1 - don't drop the leaked kernel samples.
>
> For rr it can write 1 to /sys/devices/cpu/perf_allow_sample_leakage.
>
> For example,
>
> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
> 0
> root@skl:/tmp# perf record -e cycles:u ./div
> root@skl:/tmp# perf report --stdio
>
> ........ ....... ............. ................
>
> 47.01% div div [.] main
> 20.74% div libc-2.23.so [.] __random_r
> 15.59% div libc-2.23.so [.] __random
> 8.68% div div [.] compute_flag
> 4.48% div libc-2.23.so [.] rand
> 3.50% div div [.] rand@plt
> 0.00% div ld-2.23.so [.] do_lookup_x
> 0.00% div ld-2.23.so [.] memcmp
> 0.00% div ld-2.23.so [.] _dl_start
> 0.00% div ld-2.23.so [.] _start
>
> There is no kernel symbol reported.
>
> root@skl:/tmp# echo 1 > /sys/devices/cpu/perf_allow_sample_leakage
> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
> 1
> root@skl:/tmp# perf record -e cycles:u ./div
> root@skl:/tmp# perf report --stdio
>
> ........ ....... ................ .............
>
> 47.53% div div [.] main
> 20.62% div libc-2.23.so [.] __random_r
> 15.32% div libc-2.23.so [.] __random
> 8.66% div div [.] compute_flag
> 4.53% div libc-2.23.so [.] rand
> 3.34% div div [.] rand@plt
> 0.00% div [kernel.vmlinux] [k] apic_timer_interrupt
> 0.00% div libc-2.23.so [.] intel_check_word
> 0.00% div ld-2.23.so [.] brk
> 0.00% div [kernel.vmlinux] [k] page_fault
> 0.00% div ld-2.23.so [.] _start
>
> We can see the kernel symbols apic_timer_interrupt and page_fault.
>
> Signed-off-by: Jin Yao <[email protected]>
> ---
> kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 58 insertions(+)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 80cca2b..7867541 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7721,6 +7721,28 @@ int perf_event_account_interrupt(struct perf_event *event)
> return __perf_event_account_interrupt(event, 1);
> }
>
> +static int perf_allow_sample_leakage __read_mostly;
> +
> +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
> +{
> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
> +
> + if (allow_leakage)
> + return true;
> +
> + /*
> + * Due to interrupt latency (AKA "skid"), we may enter the
> + * kernel before taking an overflow, even if the PMU is only
> + * counting user events.
> + * To avoid leaking information to userspace, we must always
> + * reject kernel samples when exclude_kernel is set.
> + */
> + if (event->attr.exclude_kernel && !user_mode(regs))
> + return false;
> +
And how does that filter PEBS or LBR records?
> + return true;
> +}
> +
> /*
> * Generic event overflow handling, sampling.
> */
> @@ -7742,6 +7764,12 @@ static int __perf_event_overflow(struct perf_event *event,
> ret = __perf_event_account_interrupt(event, throttle);
>
> /*
> + * For security, drop the skid kernel samples if necessary.
> + */
> + if (!sample_is_allowed(event, regs))
> + return ret;
> +
> + /*
> * XXX event_limit might not quite work as expected on inherited
> * events
> */
> @@ -9500,9 +9528,39 @@ perf_event_mux_interval_ms_store(struct device *dev,
> }
> static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
>
> +static ssize_t
> +perf_allow_sample_leakage_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
> +
> + return snprintf(page, PAGE_SIZE-1, "%d\n", allow_leakage);
> +}
> +
> +static ssize_t
> +perf_allow_sample_leakage_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + int allow_leakage, ret;
> +
> + ret = kstrtoint(buf, 0, &allow_leakage);
> + if (ret)
> + return ret;
> +
> + if (allow_leakage != 0 && allow_leakage != 1)
> + return -EINVAL;
> +
> + WRITE_ONCE(perf_allow_sample_leakage, allow_leakage);
> +
> + return count;
> +}
> +static DEVICE_ATTR_RW(perf_allow_sample_leakage);
> +
> static struct attribute *pmu_dev_attrs[] = {
> &dev_attr_type.attr,
> &dev_attr_perf_event_mux_interval_ms.attr,
> + &dev_attr_perf_allow_sample_leakage.attr,
> NULL,
> };
> ATTRIBUTE_GROUPS(pmu_dev);
> --
> 2.7.4
>
On 6/15/2018 1:59 PM, Stephane Eranian wrote:
> On Thu, Jun 14, 2018 at 7:10 PM Jin Yao <[email protected]> wrote:
>>
>> When doing sampling, for example:
>>
>> perf record -e cycles:u ...
>>
>> On workloads that do a lot of kernel entry/exits we see kernel
>> samples, even though :u is specified. This is due to skid existing.
>>
>> This might be a security issue because it can leak kernel addresses even
>> though kernel sampling support is disabled.
>>
>> One patch "perf/core: Drop kernel samples even though :u is specified"
>> was posted in last year but it was reverted because it introduced a
>> regression issue that broke the rr-project, which used sampling
>> events to receive a signal on overflow. These signals were critical
>> to the correct operation of rr.
>>
>> See '6a8a75f32357 ("Revert "perf/core: Drop kernel samples even
>> though :u is specified"")' for detail.
>>
>> Now the idea is to use sysctl to control the dropping of leaked
>> kernel samples.
>>
>> /sys/devices/cpu/perf_allow_sample_leakage:
>>
>> 0 - default, drop the leaked kernel samples.
>> 1 - don't drop the leaked kernel samples.
>>
>> For rr it can write 1 to /sys/devices/cpu/perf_allow_sample_leakage.
>>
>> For example,
>>
>> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
>> 0
>> root@skl:/tmp# perf record -e cycles:u ./div
>> root@skl:/tmp# perf report --stdio
>>
>> ........ ....... ............. ................
>>
>> 47.01% div div [.] main
>> 20.74% div libc-2.23.so [.] __random_r
>> 15.59% div libc-2.23.so [.] __random
>> 8.68% div div [.] compute_flag
>> 4.48% div libc-2.23.so [.] rand
>> 3.50% div div [.] rand@plt
>> 0.00% div ld-2.23.so [.] do_lookup_x
>> 0.00% div ld-2.23.so [.] memcmp
>> 0.00% div ld-2.23.so [.] _dl_start
>> 0.00% div ld-2.23.so [.] _start
>>
>> There is no kernel symbol reported.
>>
>> root@skl:/tmp# echo 1 > /sys/devices/cpu/perf_allow_sample_leakage
>> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
>> 1
>> root@skl:/tmp# perf record -e cycles:u ./div
>> root@skl:/tmp# perf report --stdio
>>
>> ........ ....... ................ .............
>>
>> 47.53% div div [.] main
>> 20.62% div libc-2.23.so [.] __random_r
>> 15.32% div libc-2.23.so [.] __random
>> 8.66% div div [.] compute_flag
>> 4.53% div libc-2.23.so [.] rand
>> 3.34% div div [.] rand@plt
>> 0.00% div [kernel.vmlinux] [k] apic_timer_interrupt
>> 0.00% div libc-2.23.so [.] intel_check_word
>> 0.00% div ld-2.23.so [.] brk
>> 0.00% div [kernel.vmlinux] [k] page_fault
>> 0.00% div ld-2.23.so [.] _start
>>
>> We can see the kernel symbols apic_timer_interrupt and page_fault.
>>
> These kernel symbols do not match your description here. How much skid
> do you think you have here?
> You're saying you are at the user level, you get a counter overflow,
> and the interrupted IP lands in the kernel
> because you where there by the time the interrupt is delivered. How
> many instructions does it take to get
> from user land to apic_timer_interrupt() or page_fault()? These
> functions are not right at the kernel entry,
> I believe. So how did you get there, the skid must have been VERY big
> or symbolization has a problem.
>
I'm testing with the latest perf/core branch (4.17+). Again I test with
Linux's vmstat (not test with my application).
perf record -e cycles:u vmstat 1
perf script -F ip
7f84e2b0bc30
7f84e2b0bc30
7f84e2b0bc30
7f84e2b0bc30
ffffffffb7a01070
7f84e2b243f0
7f84e2b11891
7f84e2b27f5e
7f84e25a3b26
7f84e25680f5
cat /proc/kallsyms | grep page_fault
....
ffffffffb7a01070 T page_fault
ffffffffb7a010a0 T async_page_fault
....
So one sample (ip ffffffffb7a01070) hits on page_fault.
Maybe you can have a try too. :)
Thanks
Jin Yao
>> Signed-off-by: Jin Yao <[email protected]>
>> ---
>> kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 58 insertions(+)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 80cca2b..7867541 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -7721,6 +7721,28 @@ int perf_event_account_interrupt(struct perf_event *event)
>> return __perf_event_account_interrupt(event, 1);
>> }
>>
>> +static int perf_allow_sample_leakage __read_mostly;
>> +
>> +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
>> +{
>> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
>> +
>> + if (allow_leakage)
>> + return true;
>> +
>> + /*
>> + * Due to interrupt latency (AKA "skid"), we may enter the
>> + * kernel before taking an overflow, even if the PMU is only
>> + * counting user events.
>> + * To avoid leaking information to userspace, we must always
>> + * reject kernel samples when exclude_kernel is set.
>> + */
>> + if (event->attr.exclude_kernel && !user_mode(regs))
>> + return false;
>> +
>> + return true;
>> +}
>> +
>> /*
>> * Generic event overflow handling, sampling.
>> */
>> @@ -7742,6 +7764,12 @@ static int __perf_event_overflow(struct perf_event *event,
>> ret = __perf_event_account_interrupt(event, throttle);
>>
>> /*
>> + * For security, drop the skid kernel samples if necessary.
>> + */
>> + if (!sample_is_allowed(event, regs))
>> + return ret;
>> +
>> + /*
>> * XXX event_limit might not quite work as expected on inherited
>> * events
>> */
>> @@ -9500,9 +9528,39 @@ perf_event_mux_interval_ms_store(struct device *dev,
>> }
>> static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
>>
>> +static ssize_t
>> +perf_allow_sample_leakage_show(struct device *dev,
>> + struct device_attribute *attr, char *page)
>> +{
>> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
>> +
>> + return snprintf(page, PAGE_SIZE-1, "%d\n", allow_leakage);
>> +}
>> +
>> +static ssize_t
>> +perf_allow_sample_leakage_store(struct device *dev,
>> + struct device_attribute *attr,
>> + const char *buf, size_t count)
>> +{
>> + int allow_leakage, ret;
>> +
>> + ret = kstrtoint(buf, 0, &allow_leakage);
>> + if (ret)
>> + return ret;
>> +
>> + if (allow_leakage != 0 && allow_leakage != 1)
>> + return -EINVAL;
>> +
>> + WRITE_ONCE(perf_allow_sample_leakage, allow_leakage);
>> +
>> + return count;
>> +}
>> +static DEVICE_ATTR_RW(perf_allow_sample_leakage);
>> +
>> static struct attribute *pmu_dev_attrs[] = {
>> &dev_attr_type.attr,
>> &dev_attr_perf_event_mux_interval_ms.attr,
>> + &dev_attr_perf_allow_sample_leakage.attr,
>> NULL,
>> };
>> ATTRIBUTE_GROUPS(pmu_dev);
>> --
>> 2.7.4
>>
On Fri, Jun 15, 2018 at 06:03:21PM +0800, Jin Yao wrote:
> On workloads that do a lot of kernel entry/exits we see kernel
> samples, even though :u is specified. This is due to skid existing.
>
> This might be a security issue because it can leak kernel addresses even
> though kernel sampling support is disabled.
>
> One patch "perf/core: Drop kernel samples even though :u is specified"
> was posted in last year but it was reverted because it introduced a
> regression issue that broke the rr-project.
>
> Now this patch set uses sysctl to control the dropping of leaked
> kernel samples.
So what happened to the suggestion of keeping the samples but 0-stuffing
all the tricky bits?
On 6/15/2018 3:45 PM, Peter Zijlstra wrote:
> On Fri, Jun 15, 2018 at 06:03:21PM +0800, Jin Yao wrote:
>> On workloads that do a lot of kernel entry/exits we see kernel
>> samples, even though :u is specified. This is due to skid existing.
>>
>> This might be a security issue because it can leak kernel addresses even
>> though kernel sampling support is disabled.
>>
>> One patch "perf/core: Drop kernel samples even though :u is specified"
>> was posted in last year but it was reverted because it introduced a
>> regression issue that broke the rr-project.
>>
>> Now this patch set uses sysctl to control the dropping of leaked
>> kernel samples.
>
> So what happened to the suggestion of keeping the samples but 0-stuffing
> all the tricky bits?
>
Bring more overhead to kernel if we zero the bits considering the number
of leaked samples may be not too small?
And the skid information may be interesting (see example of hitting on
page_fault in previous mail). If we zero it, we will not know.
Thanks
Jin Yao
On Fri, Jun 15, 2018 at 04:01:45PM +0800, Jin, Yao wrote:
> Bring more overhead to kernel if we zero the bits considering the number of
> leaked samples may be not too small?
Keeping the samples at least allows you to know how many samples
happened and such things.
> And the skid information may be interesting (see example of hitting on
> page_fault in previous mail). If we zero it, we will not know.
If you throw them out the window you also don't know, do you?
On Thu, Jun 14, 2018 at 11:02:53PM -0700, Stephane Eranian wrote:
> On Thu, Jun 14, 2018 at 7:10 PM Jin Yao <[email protected]> wrote:
> > + /*
> > + * Due to interrupt latency (AKA "skid"), we may enter the
> > + * kernel before taking an overflow, even if the PMU is only
> > + * counting user events.
> > + * To avoid leaking information to userspace, we must always
> > + * reject kernel samples when exclude_kernel is set.
> > + */
> > + if (event->attr.exclude_kernel && !user_mode(regs))
> > + return false;
> > +
> And how does that filter PEBS or LBR records?
I suspect the user_mode() thing actually covers PEBS, but yes LBR might
need additional filtering.
On 6/15/2018 4:12 PM, Peter Zijlstra wrote:
> On Fri, Jun 15, 2018 at 04:01:45PM +0800, Jin, Yao wrote:
>
>> Bring more overhead to kernel if we zero the bits considering the number of
>> leaked samples may be not too small?
>
> Keeping the samples at least allows you to know how many samples
> happened and such things.
>
Yeah, agree, but a little bit overhead ...
>> And the skid information may be interesting (see example of hitting on
>> page_fault in previous mail). If we zero it, we will not know.
>
> If you throw them out the window you also don't know, do you?
>
Yes, default I can't know that window. I have to use sysctl to enable.
Thanks
Jin Yao
On Fri, Jun 15, 2018 at 06:03:22PM +0800, Jin Yao wrote:
> When doing sampling, for example:
>
> perf record -e cycles:u ...
>
> On workloads that do a lot of kernel entry/exits we see kernel
> samples, even though :u is specified. This is due to skid existing.
>
> This might be a security issue because it can leak kernel addresses even
> though kernel sampling support is disabled.
>
> One patch "perf/core: Drop kernel samples even though :u is specified"
> was posted in last year but it was reverted because it introduced a
> regression issue that broke the rr-project, which used sampling
> events to receive a signal on overflow. These signals were critical
> to the correct operation of rr.
>
> See '6a8a75f32357 ("Revert "perf/core: Drop kernel samples even
> though :u is specified"")' for detail.
>
> Now the idea is to use sysctl to control the dropping of leaked
> kernel samples.
>
> /sys/devices/cpu/perf_allow_sample_leakage:
>
> 0 - default, drop the leaked kernel samples.
> 1 - don't drop the leaked kernel samples.
Does this need to be conditional at all?
At least for sampling the GPRs, we could do something like below
unconditionally, which seems sufficient for my test cases.
Mark.
---->8----
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce359ad..79a21531d57c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6359,6 +6359,24 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
+static struct pt_regs *perf_get_sample_regs(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the kernel
+ * before taking an overflow, even if the PMU is only counting user
+ * events.
+ *
+ * If we're not counting kernel events, always use the user regs when
+ * sampling.
+ *
+ * TODO: how does this interact with guest sampling?
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return task_pt_regs(current);
+
+ return regs;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6366,6 +6384,8 @@ void perf_prepare_sample(struct perf_event_header *header,
{
u64 sample_type = event->attr.sample_type;
+ regs = perf_get_sample_regs(event, regs);
+
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size;
> On Thu, Jun 14, 2018 at 11:02:53PM -0700, Stephane Eranian wrote:
> > On Thu, Jun 14, 2018 at 7:10 PM Jin Yao <[email protected]> wrote:
> > > + /*
> > > + * Due to interrupt latency (AKA "skid"), we may enter the
> > > + * kernel before taking an overflow, even if the PMU is only
> > > + * counting user events.
> > > + * To avoid leaking information to userspace, we must always
> > > + * reject kernel samples when exclude_kernel is set.
> > > + */
> > > + if (event->attr.exclude_kernel && !user_mode(regs))
> > > + return false;
> > > +
> > And how does that filter PEBS or LBR records?
>
> I suspect the user_mode() thing actually covers PEBS, but yes LBR might need
> additional filtering.
I think the large PEBS still need to be specially handled.
Thanks,
Kan
Hi,
On Fri, Jun 15, 2018 at 1:12 AM Peter Zijlstra <[email protected]> wrote:
>
> On Fri, Jun 15, 2018 at 04:01:45PM +0800, Jin, Yao wrote:
>
> > Bring more overhead to kernel if we zero the bits considering the number of
> > leaked samples may be not too small?
>
> Keeping the samples at least allows you to know how many samples
> happened and such things.
>
Yes, important point. You cannot just drop a sample. It may break some
calculations
done like num_samples x fixed_periods = total number of events observed. Some
tools do this.
> > And the skid information may be interesting (see example of hitting on
> > page_fault in previous mail). If we zero it, we will not know.
>
> If you throw them out the window you also don't know, do you?
On Thu, Jun 14, 2018 at 10:11 PM, Jin, Yao <[email protected]> wrote:
>
>
> On 6/15/2018 11:35 AM, Kyle Huey wrote:
>>
>> I strongly object to this patch as written. As I said when I
>> originally reported[0] the regression introduced by the previous
>> version of this patch a year ago.
>>
>> "It seems like this change should, at a bare minimum, be limited to
>> counters that actually perform sampling of register state when the
>> interrupt fires. In our case, with the retired conditional branches
>> counter restricted to counting userspace events only, it makes no
>> difference that the PMU interrupt happened to be delivered in the
>> kernel."
>>
>> This means identifying which values of `perf_event_attr::sample_type`
>> are security concerns (presumably PERF_SAMPLE_IP is, and
>> PERF_SAMPLE_TIME is not, and someone needs to go through and decide on
>> all of them) and filtering on those values for this new behavior.
>>
>> And because rr sets its sample_type to 0, once you do that, the sysctl
>> will not be necessary.
>>
>> - Kyle
>>
>
> Since rr sets sample_type to 0, the easiest way is to add checking like:
>
> if (event->attr.sample_type) {
> if (event->attr.exclude_kernel && !user_mode(regs))
> return false;
> }
>
> So the rr doesn't need to be changed and for other use cases the leaked
> kernel samples will be dropped.
>
> But I don't like this is because:
>
> 1. It's too specific for rr case.
Keeping existing software working is the first rule of kernel development!
There is no disclosure of kernel space state in the way rr uses this
API, so there is no reason that this API should not keep working.
> 2. If we create a new sample_type, e.g. PERF_SAMPLE_ALLOW_LEAKAGE, the code
> will be:
>
> if !(event->attr.sample_type & PERF_SAMPLE_ALLOW_LEAKAGE) {
> if (event->attr.exclude_kernel && !user_mode(regs))
> return false;
> }
>
> But rr needs to add PERF_SAMPLE_ALLOW_LEAKAGE to sample_type since by
> default the bit is not set.
There's no reason to add a new PERF_SAMPLE flag. You need to audit the
*existing* PERF_SAMPLE flags and figure out which ones are problems,
and then do
if (event->attr.exclude_kernel && !user_mode(regs) &&
sampling_discloses_kernel_information(event->attr.sample_type)) {
return false;
}
> 3. Sysctl is a more flexible way. It provides us with an option when we want
> to see if skid is existing, we can use sysctl to turn on that.
If you want a sysctl for your own reasons that's fine. But we don't
want a sysctl. We want to work without any further configuration.
> Thanks
> Jin Yao
>
- Kyle
On Fri, Jun 15, 2018 at 10:16 AM, Kyle Huey <[email protected]> wrote:
>
> If you want a sysctl for your own reasons that's fine. But we don't
> want a sysctl. We want to work without any further configuration.
Also toggling a sysctl would require root privileges, but rr does not
currently need to run as root. Thus rr users would have to either
permanently change their system configuration (and every extra
configuration step is a pain), or run rr as root so rr can toggle the
sysctl itself. Running rr as root is highly undesirable.
Rob
--
Su ot deraeppa sah dna Rehtaf eht htiw saw hcihw, efil lanrete eht uoy
ot mialcorp ew dna, ti ot yfitset dna ti nees evah ew; deraeppa efil
eht. Efil fo Drow eht gninrecnoc mialcorp ew siht - dehcuot evah sdnah
ruo dna ta dekool evah ew hcihw, seye ruo htiw nees evah ew hcihw,
draeh evah ew hcihw, gninnigeb eht morf saw hcihw taht.
Hi All,
This patch raised many questions, I was prepared. :)
I'd like to try another proposal that it adds a special flag in the
returned perf_sample_data to indicate the perf binary that this sample
is a leaked sample.
For example, create a new PERF_SAMPLE_RETURN_LEAKAGE in
perf_event_sample_format.
In perf_prepare_sample(),
if (event->attr.exclude_kernel && !user_mode(regs))
data->type |= PERF_SAMPLE_RETURN_LEAKAGE;
Now all the samples are kept and the leaked kernel samples are tagged
with PERF_SAMPLE_RETURN_LEAKAGE.
In perf binary, it filters out the samples with
PERF_SAMPLE_RETURN_LEAKAGE. It needs perf binary modification but rr
doesn't need to be changed.
I don't 0-stuffing some fields because:
1. Keeping the skid info should allow us to look at that if we have
interesting later.
2. Not sure if 0-stuffing some fields has potential conflicts with some
applications.
Is this proposal reasonable?
Thanks
Jin Yao
On 6/16/2018 1:34 AM, Robert O'Callahan wrote:
> On Fri, Jun 15, 2018 at 10:16 AM, Kyle Huey <[email protected]> wrote:
>>
>> If you want a sysctl for your own reasons that's fine. But we don't
>> want a sysctl. We want to work without any further configuration.
>
> Also toggling a sysctl would require root privileges, but rr does not
> currently need to run as root. Thus rr users would have to either
> permanently change their system configuration (and every extra
> configuration step is a pain), or run rr as root so rr can toggle the
> sysctl itself. Running rr as root is highly undesirable.
>
> Rob
>
On Fri, Jun 15, 2018 at 5:50 PM, Jin, Yao <[email protected]> wrote:
> Hi All,
>
> This patch raised many questions, I was prepared. :)
>
> I'd like to try another proposal that it adds a special flag in the returned
> perf_sample_data to indicate the perf binary that this sample is a leaked
> sample.
>
> For example, create a new PERF_SAMPLE_RETURN_LEAKAGE in
> perf_event_sample_format.
>
> In perf_prepare_sample(),
>
> if (event->attr.exclude_kernel && !user_mode(regs))
> data->type |= PERF_SAMPLE_RETURN_LEAKAGE;
>
> Now all the samples are kept and the leaked kernel samples are tagged with
> PERF_SAMPLE_RETURN_LEAKAGE.
>
> In perf binary, it filters out the samples with PERF_SAMPLE_RETURN_LEAKAGE.
> It needs perf binary modification but rr doesn't need to be changed.
>
> I don't 0-stuffing some fields because:
>
> 1. Keeping the skid info should allow us to look at that if we have
> interesting later.
>
> 2. Not sure if 0-stuffing some fields has potential conflicts with some
> applications.
>
> Is this proposal reasonable?
>
> Thanks
> Jin Yao
>
>
> On 6/16/2018 1:34 AM, Robert O'Callahan wrote:
>>
>> On Fri, Jun 15, 2018 at 10:16 AM, Kyle Huey <[email protected]> wrote:
>>>
>>>
>>> If you want a sysctl for your own reasons that's fine. But we don't
>>> want a sysctl. We want to work without any further configuration.
>>
>>
>> Also toggling a sysctl would require root privileges, but rr does not
>> currently need to run as root. Thus rr users would have to either
>> permanently change their system configuration (and every extra
>> configuration step is a pain), or run rr as root so rr can toggle the
>> sysctl itself. Running rr as root is highly undesirable.
>>
>> Rob
>>
>
If the problem you're trying to fix is an inappropriate disclosure of
kernel-space information to user-space, how does filtering the samples
in user space solve anything?
- Kyle
On 6/16/2018 8:56 AM, Kyle Huey wrote:
> On Fri, Jun 15, 2018 at 5:50 PM, Jin, Yao <[email protected]> wrote:
>> Hi All,
>>
>> This patch raised many questions, I was prepared. :)
>>
>> I'd like to try another proposal that it adds a special flag in the returned
>> perf_sample_data to indicate the perf binary that this sample is a leaked
>> sample.
>>
>> For example, create a new PERF_SAMPLE_RETURN_LEAKAGE in
>> perf_event_sample_format.
>>
>> In perf_prepare_sample(),
>>
>> if (event->attr.exclude_kernel && !user_mode(regs))
>> data->type |= PERF_SAMPLE_RETURN_LEAKAGE;
>>
>> Now all the samples are kept and the leaked kernel samples are tagged with
>> PERF_SAMPLE_RETURN_LEAKAGE.
>>
>> In perf binary, it filters out the samples with PERF_SAMPLE_RETURN_LEAKAGE.
>> It needs perf binary modification but rr doesn't need to be changed.
>>
>> I don't 0-stuffing some fields because:
>>
>> 1. Keeping the skid info should allow us to look at that if we have
>> interesting later.
>>
>> 2. Not sure if 0-stuffing some fields has potential conflicts with some
>> applications.
>>
>> Is this proposal reasonable?
>>
>> Thanks
>> Jin Yao
>>
>>
>> On 6/16/2018 1:34 AM, Robert O'Callahan wrote:
>>>
>>> On Fri, Jun 15, 2018 at 10:16 AM, Kyle Huey <[email protected]> wrote:
>>>>
>>>>
>>>> If you want a sysctl for your own reasons that's fine. But we don't
>>>> want a sysctl. We want to work without any further configuration.
>>>
>>>
>>> Also toggling a sysctl would require root privileges, but rr does not
>>> currently need to run as root. Thus rr users would have to either
>>> permanently change their system configuration (and every extra
>>> configuration step is a pain), or run rr as root so rr can toggle the
>>> sysctl itself. Running rr as root is highly undesirable.
>>>
>>> Rob
>>>
>>
>
> If the problem you're trying to fix is an inappropriate disclosure of
> kernel-space information to user-space, how does filtering the samples
> in user space solve anything?
>
> - Kyle
>
In theory it is. But actually we just use perf tool to look at the
sampling result.
And suppose a case, if we want to estimate the skid window, we still
need the skid info.
Thanks
Jin Yao
On Fri, Jun 15, 2018 at 8:36 PM Mark Rutland <[email protected]> wrote:
>
> At least for sampling the GPRs, we could do something like below
> unconditionally, which seems sufficient for my test cases.
Ack.
The PEBS case may need checking, but maybe PEBS doesn't even have this issue?
Linus
On 6/15/2018 7:36 PM, Mark Rutland wrote:
> On Fri, Jun 15, 2018 at 06:03:22PM +0800, Jin Yao wrote:
>> When doing sampling, for example:
>>
>> perf record -e cycles:u ...
>>
>> On workloads that do a lot of kernel entry/exits we see kernel
>> samples, even though :u is specified. This is due to skid existing.
>>
>> This might be a security issue because it can leak kernel addresses even
>> though kernel sampling support is disabled.
>>
>> One patch "perf/core: Drop kernel samples even though :u is specified"
>> was posted in last year but it was reverted because it introduced a
>> regression issue that broke the rr-project, which used sampling
>> events to receive a signal on overflow. These signals were critical
>> to the correct operation of rr.
>>
>> See '6a8a75f32357 ("Revert "perf/core: Drop kernel samples even
>> though :u is specified"")' for detail.
>>
>> Now the idea is to use sysctl to control the dropping of leaked
>> kernel samples.
>>
>> /sys/devices/cpu/perf_allow_sample_leakage:
>>
>> 0 - default, drop the leaked kernel samples.
>> 1 - don't drop the leaked kernel samples.
>
> Does this need to be conditional at all?
>
> At least for sampling the GPRs, we could do something like below
> unconditionally, which seems sufficient for my test cases.
>
> Mark.
>
> ---->8----
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 67612ce359ad..79a21531d57c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6359,6 +6359,24 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
> return callchain ?: &__empty_callchain;
> }
>
> +static struct pt_regs *perf_get_sample_regs(struct perf_event *event, struct pt_regs *regs)
> +{
> + /*
> + * Due to interrupt latency (AKA "skid"), we may enter the kernel
> + * before taking an overflow, even if the PMU is only counting user
> + * events.
> + *
> + * If we're not counting kernel events, always use the user regs when
> + * sampling.
> + *
> + * TODO: how does this interact with guest sampling?
> + */
> + if (event->attr.exclude_kernel && !user_mode(regs))
> + return task_pt_regs(current);
> +
> + return regs;
> +}
> +
> void perf_prepare_sample(struct perf_event_header *header,
> struct perf_sample_data *data,
> struct perf_event *event,
> @@ -6366,6 +6384,8 @@ void perf_prepare_sample(struct perf_event_header *header,
> {
> u64 sample_type = event->attr.sample_type;
>
> + regs = perf_get_sample_regs(event, regs);
> +
> header->type = PERF_RECORD_SAMPLE;
> header->size = sizeof(*header) + event->header_size;
>
>
Hi Mark,
Thanks for providing the patch. I understand this approach.
In my opinion, the skid window is from counter overflow to interrupt
delivered. While if the skid window is too *big* (e.g. user -> kernel),
it should be not very useful. So personally, I'd prefer to drop the samples.
Thanks
Jin Yao
On Sat, Jun 16, 2018 at 10:27:27AM +0900, Linus Torvalds wrote:
> On Fri, Jun 15, 2018 at 8:36 PM Mark Rutland <[email protected]> wrote:
> >
> > At least for sampling the GPRs, we could do something like below
> > unconditionally, which seems sufficient for my test cases.
>
> Ack.
>
> The PEBS case may need checking, but maybe PEBS doesn't even have this issue?
Sadly PEBS is susceptible as well, but the proposed patch actually works
for that as well.
On Fri, Jun 15, 2018 at 01:31:34PM +0000, Liang, Kan wrote:
> > On Thu, Jun 14, 2018 at 11:02:53PM -0700, Stephane Eranian wrote:
> > > On Thu, Jun 14, 2018 at 7:10 PM Jin Yao <[email protected]> wrote:
> > > > + /*
> > > > + * Due to interrupt latency (AKA "skid"), we may enter the
> > > > + * kernel before taking an overflow, even if the PMU is only
> > > > + * counting user events.
> > > > + * To avoid leaking information to userspace, we must always
> > > > + * reject kernel samples when exclude_kernel is set.
> > > > + */
> > > > + if (event->attr.exclude_kernel && !user_mode(regs))
> > > > + return false;
> > > > +
> > > And how does that filter PEBS or LBR records?
> >
> > I suspect the user_mode() thing actually covers PEBS, but yes LBR might need
> > additional filtering.
>
> I think the large PEBS still need to be specially handled.
large pebs should be fine too, all pebs stuff eventually calls
perf_event_output() with a synthesized register set.
On Mon, Jun 18, 2018 at 02:55:32PM +0800, Jin, Yao wrote:
> Thanks for providing the patch. I understand this approach.
>
> In my opinion, the skid window is from counter overflow to interrupt
> delivered. While if the skid window is too *big* (e.g. user -> kernel), it
> should be not very useful. So personally, I'd prefer to drop the samples.
I really don't get your insitence on dropping the sample. Dropping
samples is bad. Furthermore, doing what Mark suggests actually improves
the result by reducing the skid, if the event happened before we entered
(as it damn well should) then the user regs, which point at the entry
site, are a better approximation than our in-kernel set.
So not only do you not loose the sample, you actually get a better
sample.
On 6/18/2018 6:45 PM, Peter Zijlstra wrote:
> On Mon, Jun 18, 2018 at 02:55:32PM +0800, Jin, Yao wrote:
>> Thanks for providing the patch. I understand this approach.
>>
>> In my opinion, the skid window is from counter overflow to interrupt
>> delivered. While if the skid window is too *big* (e.g. user -> kernel), it
>> should be not very useful. So personally, I'd prefer to drop the samples.
>
> I really don't get your insitence on dropping the sample. Dropping
> samples is bad. Furthermore, doing what Mark suggests actually improves
> the result by reducing the skid, if the event happened before we entered
> (as it damn well should) then the user regs, which point at the entry
> site, are a better approximation than our in-kernel set.
>
> So not only do you not loose the sample, you actually get a better
> sample.
>
OK, that's fine, thanks!
I guess Mark will post this patch, right?
Anyway looks we don't need following patch (0-stuffing sample->ip to
indicate perf tool that it is a leak sample), right?
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80cca2b..628b515 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6361,6 +6361,21 @@ perf_callchain(struct perf_event *event, struct
pt_regs *regs)
return callchain ?: &__empty_callchain;
}
+static bool sample_is_leaked(struct perf_event *event, struct pt_regs
*regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ * To avoid leaking information to userspace, we must always
+ * reject kernel samples when exclude_kernel is set.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return true;
+
+ return false;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6480,6 +6495,9 @@ void perf_prepare_sample(struct perf_event_header
*header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+ if (sample_is_leaked(event, regs))
+ data->ip = 0;
}
static void __always_inline
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index bfa60bc..1bfb697 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -404,6 +404,7 @@ struct events_stats {
u64 total_aux_lost;
u64 total_aux_partial;
u64 total_invalid_chains;
+ u64 total_dropped_samples;
u32 nr_events[PERF_RECORD_HEADER_MAX];
u32 nr_non_filtered_samples;
u32 nr_lost_warned;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 8b93693..ec923f1 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1269,6 +1269,12 @@ static int machines__deliver_event(struct
machines *machines,
++evlist->stats.nr_unprocessable_samples;
return 0;
}
+
+ if (sample->ip == 0) {
+ /* Drop the leaked kernel samples */
+ ++evlist->stats.total_dropped_samples;
+ return 0;
+ }
return perf_evlist__deliver_sample(evlist, tool, event,
sample, evsel, machine);
case PERF_RECORD_MMAP:
return tool->mmap(tool, event, sample, machine);
Thanks
Jin Yao
On Tue, Jun 19, 2018 at 09:39:02AM +0800, Jin, Yao wrote:
>
>
> On 6/18/2018 6:45 PM, Peter Zijlstra wrote:
> > On Mon, Jun 18, 2018 at 02:55:32PM +0800, Jin, Yao wrote:
> > > Thanks for providing the patch. I understand this approach.
> > >
> > > In my opinion, the skid window is from counter overflow to interrupt
> > > delivered. While if the skid window is too *big* (e.g. user -> kernel), it
> > > should be not very useful. So personally, I'd prefer to drop the samples.
> >
> > I really don't get your insitence on dropping the sample. Dropping
> > samples is bad. Furthermore, doing what Mark suggests actually improves
> > the result by reducing the skid, if the event happened before we entered
> > (as it damn well should) then the user regs, which point at the entry
> > site, are a better approximation than our in-kernel set.
> >
> > So not only do you not loose the sample, you actually get a better
> > sample.
> >
>
> OK, that's fine, thanks!
>
> I guess Mark will post this patch, right?
I'll try to spin something shortly -- I'm just figuring out how this should
work with guest sampling.
Thanks,
Mark.
On Fri, Jun 15, 2018 at 12:15 AM Jin, Yao <[email protected]> wrote:
>
>
>
> On 6/15/2018 1:59 PM, Stephane Eranian wrote:
> > On Thu, Jun 14, 2018 at 7:10 PM Jin Yao <[email protected]> wrote:
> >>
> >> When doing sampling, for example:
> >>
> >> perf record -e cycles:u ...
> >>
> >> On workloads that do a lot of kernel entry/exits we see kernel
> >> samples, even though :u is specified. This is due to skid existing.
> >>
> >> This might be a security issue because it can leak kernel addresses even
> >> though kernel sampling support is disabled.
> >>
> >> One patch "perf/core: Drop kernel samples even though :u is specified"
> >> was posted in last year but it was reverted because it introduced a
> >> regression issue that broke the rr-project, which used sampling
> >> events to receive a signal on overflow. These signals were critical
> >> to the correct operation of rr.
> >>
> >> See '6a8a75f32357 ("Revert "perf/core: Drop kernel samples even
> >> though :u is specified"")' for detail.
> >>
> >> Now the idea is to use sysctl to control the dropping of leaked
> >> kernel samples.
> >>
> >> /sys/devices/cpu/perf_allow_sample_leakage:
> >>
> >> 0 - default, drop the leaked kernel samples.
> >> 1 - don't drop the leaked kernel samples.
> >>
> >> For rr it can write 1 to /sys/devices/cpu/perf_allow_sample_leakage.
> >>
> >> For example,
> >>
> >> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
> >> 0
> >> root@skl:/tmp# perf record -e cycles:u ./div
> >> root@skl:/tmp# perf report --stdio
> >>
> >> ........ ....... ............. ................
> >>
> >> 47.01% div div [.] main
> >> 20.74% div libc-2.23.so [.] __random_r
> >> 15.59% div libc-2.23.so [.] __random
> >> 8.68% div div [.] compute_flag
> >> 4.48% div libc-2.23.so [.] rand
> >> 3.50% div div [.] rand@plt
> >> 0.00% div ld-2.23.so [.] do_lookup_x
> >> 0.00% div ld-2.23.so [.] memcmp
> >> 0.00% div ld-2.23.so [.] _dl_start
> >> 0.00% div ld-2.23.so [.] _start
> >>
> >> There is no kernel symbol reported.
> >>
> >> root@skl:/tmp# echo 1 > /sys/devices/cpu/perf_allow_sample_leakage
> >> root@skl:/tmp# cat /sys/devices/cpu/perf_allow_sample_leakage
> >> 1
> >> root@skl:/tmp# perf record -e cycles:u ./div
> >> root@skl:/tmp# perf report --stdio
> >>
> >> ........ ....... ................ .............
> >>
> >> 47.53% div div [.] main
> >> 20.62% div libc-2.23.so [.] __random_r
> >> 15.32% div libc-2.23.so [.] __random
> >> 8.66% div div [.] compute_flag
> >> 4.53% div libc-2.23.so [.] rand
> >> 3.34% div div [.] rand@plt
> >> 0.00% div [kernel.vmlinux] [k] apic_timer_interrupt
> >> 0.00% div libc-2.23.so [.] intel_check_word
> >> 0.00% div ld-2.23.so [.] brk
> >> 0.00% div [kernel.vmlinux] [k] page_fault
> >> 0.00% div ld-2.23.so [.] _start
> >>
> >> We can see the kernel symbols apic_timer_interrupt and page_fault.
> >>
> > These kernel symbols do not match your description here. How much skid
> > do you think you have here?
> > You're saying you are at the user level, you get a counter overflow,
> > and the interrupted IP lands in the kernel
> > because you where there by the time the interrupt is delivered. How
> > many instructions does it take to get
> > from user land to apic_timer_interrupt() or page_fault()? These
> > functions are not right at the kernel entry,
> > I believe. So how did you get there, the skid must have been VERY big
> > or symbolization has a problem.
> >
>
> I'm testing with the latest perf/core branch (4.17+). Again I test with
> Linux's vmstat (not test with my application).
>
> perf record -e cycles:u vmstat 1
> perf script -F ip
>
> 7f84e2b0bc30
> 7f84e2b0bc30
> 7f84e2b0bc30
> 7f84e2b0bc30
> ffffffffb7a01070
> 7f84e2b243f0
> 7f84e2b11891
> 7f84e2b27f5e
> 7f84e25a3b26
> 7f84e25680f5
>
> cat /proc/kallsyms | grep page_fault
> ....
> ffffffffb7a01070 T page_fault
> ffffffffb7a010a0 T async_page_fault
> ....
>
> So one sample (ip ffffffffb7a01070) hits on page_fault.
>
> Maybe you can have a try too. :)
>
Ok, I tried and checked! These symbols are all in entry_64.S, these
are the first instructions executed on the timer intr or fault.
So it looks normal that they show up due to the interrupt skid, even
if the skid is 1 cycle.
PEBS, especially when using precise=1 could also show these symbols.
>
> Thanks
> Jin Yao
>
> >> Signed-off-by: Jin Yao <[email protected]>
> >> ---
> >> kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> 1 file changed, 58 insertions(+)
> >>
> >> diff --git a/kernel/events/core.c b/kernel/events/core.c
> >> index 80cca2b..7867541 100644
> >> --- a/kernel/events/core.c
> >> +++ b/kernel/events/core.c
> >> @@ -7721,6 +7721,28 @@ int perf_event_account_interrupt(struct perf_event *event)
> >> return __perf_event_account_interrupt(event, 1);
> >> }
> >>
> >> +static int perf_allow_sample_leakage __read_mostly;
> >> +
> >> +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
> >> +{
> >> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
> >> +
> >> + if (allow_leakage)
> >> + return true;
> >> +
> >> + /*
> >> + * Due to interrupt latency (AKA "skid"), we may enter the
> >> + * kernel before taking an overflow, even if the PMU is only
> >> + * counting user events.
> >> + * To avoid leaking information to userspace, we must always
> >> + * reject kernel samples when exclude_kernel is set.
> >> + */
> >> + if (event->attr.exclude_kernel && !user_mode(regs))
> >> + return false;
> >> +
> >> + return true;
> >> +}
> >> +
> >> /*
> >> * Generic event overflow handling, sampling.
> >> */
> >> @@ -7742,6 +7764,12 @@ static int __perf_event_overflow(struct perf_event *event,
> >> ret = __perf_event_account_interrupt(event, throttle);
> >>
> >> /*
> >> + * For security, drop the skid kernel samples if necessary.
> >> + */
> >> + if (!sample_is_allowed(event, regs))
> >> + return ret;
> >> +
> >> + /*
> >> * XXX event_limit might not quite work as expected on inherited
> >> * events
> >> */
> >> @@ -9500,9 +9528,39 @@ perf_event_mux_interval_ms_store(struct device *dev,
> >> }
> >> static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
> >>
> >> +static ssize_t
> >> +perf_allow_sample_leakage_show(struct device *dev,
> >> + struct device_attribute *attr, char *page)
> >> +{
> >> + int allow_leakage = READ_ONCE(perf_allow_sample_leakage);
> >> +
> >> + return snprintf(page, PAGE_SIZE-1, "%d\n", allow_leakage);
> >> +}
> >> +
> >> +static ssize_t
> >> +perf_allow_sample_leakage_store(struct device *dev,
> >> + struct device_attribute *attr,
> >> + const char *buf, size_t count)
> >> +{
> >> + int allow_leakage, ret;
> >> +
> >> + ret = kstrtoint(buf, 0, &allow_leakage);
> >> + if (ret)
> >> + return ret;
> >> +
> >> + if (allow_leakage != 0 && allow_leakage != 1)
> >> + return -EINVAL;
> >> +
> >> + WRITE_ONCE(perf_allow_sample_leakage, allow_leakage);
> >> +
> >> + return count;
> >> +}
> >> +static DEVICE_ATTR_RW(perf_allow_sample_leakage);
> >> +
> >> static struct attribute *pmu_dev_attrs[] = {
> >> &dev_attr_type.attr,
> >> &dev_attr_perf_event_mux_interval_ms.attr,
> >> + &dev_attr_perf_allow_sample_leakage.attr,
> >> NULL,
> >> };
> >> ATTRIBUTE_GROUPS(pmu_dev);
> >> --
> >> 2.7.4
> >>