2014-10-14 22:57:47

by Andy Lutomirski

Subject: [RFC 0/5] CR4 handling improvements

Hi Peter, etc,

This little series tightens up rdpmc permissions. With it applied,
rdpmc can only be used if a perf_event is actually mmapped. For now,
this is only really useful for seccomp.

At some point this could be further tightened up to only allow rdpmc
if an actual self-monitoring perf event that is compatible with
rdpmc is mapped. (Is the current code there even correct? What
happens if you set up a perf counter targeting a different pid and
then try to rdpmc it? Do you end up reading the right value or does
perf event context switching mess you up?)
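
(For reference, the self-monitoring read in question follows the
documented perf_event_mmap_page protocol. A rough userspace sketch --
the rdpmc() and barrier() helpers below are illustrative, not part of
this series:)

#include <linux/perf_event.h>
#include <stdint.h>

#define barrier() asm volatile("" ::: "memory")

static inline uint64_t rdpmc(uint32_t counter)
{
        uint32_t lo, hi;

        asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
        return lo | ((uint64_t)hi << 32);
}

/* pc is the mmapped first page of a self-monitoring perf_event fd */
static uint64_t read_self_count(volatile struct perf_event_mmap_page *pc)
{
        uint32_t seq, idx;
        uint64_t count;

        do {
                seq = pc->lock;
                barrier();

                idx = pc->index;        /* 0 means rdpmc is not usable */
                count = pc->offset;
                if (pc->cap_user_rdpmc && idx)
                        count += rdpmc(idx - 1);
                /* (a complete reader also sign-extends via pc->pmc_width) */

                barrier();
        } while (pc->lock != seq);

        return count;
}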

This should add <50ns to context switches between rdpmc-capable and
rdpmc-non-capable mms. I suspect that this is well under 10%
overhead, given that perf already adds some context switch latency.

If needed, we could add a global switch that turns this behavior
off. We could also rethink the logic.

I think that patches 1-3 are a good idea regardless of any rdpmc changes.

(Please don't apply quite yet, because there'll be a merge conflict
with some changes that haven't landed yet. This is against 3.17.)

Andy Lutomirski (5):
x86: Clean up cr4 manipulation
x86: Store a per-cpu shadow copy of CR4
x86: Add a comment clarifying LDT context switching
perf: Add pmu callbacks to track event mapping and unmapping
x86,perf: Only allow rdpmc if a perf_event is mapped

arch/x86/include/asm/mmu.h | 2 +
arch/x86/include/asm/mmu_context.h | 30 +++++++++++++-
arch/x86/include/asm/processor.h | 33 ----------------
arch/x86/include/asm/tlbflush.h | 77 ++++++++++++++++++++++++++++++++----
arch/x86/include/asm/virtext.h | 3 +-
arch/x86/kernel/cpu/common.c | 17 +++++---
arch/x86/kernel/cpu/mcheck/mce.c | 3 +-
arch/x86/kernel/cpu/mcheck/p5.c | 3 +-
arch/x86/kernel/cpu/mcheck/winchip.c | 3 +-
arch/x86/kernel/cpu/perf_event.c | 61 +++++++++++++++++++++-------
arch/x86/kernel/head32.c | 1 +
arch/x86/kernel/head64.c | 2 +
arch/x86/kernel/i387.c | 3 +-
arch/x86/kernel/process.c | 5 ++-
arch/x86/kernel/xsave.c | 3 +-
arch/x86/kvm/vmx.c | 8 ++--
arch/x86/mm/init.c | 12 +++++-
arch/x86/mm/tlb.c | 3 --
arch/x86/xen/enlighten.c | 4 +-
include/linux/perf_event.h | 7 ++++
kernel/events/core.c | 9 +++++
21 files changed, 210 insertions(+), 79 deletions(-)

--
1.9.3


2014-10-14 22:57:54

by Andy Lutomirski

Subject: [RFC 1/5] x86: Clean up cr4 manipulation

CR4 manipulation was split, seemingly at random, between direct
writes (write_cr4) and a pair of helpers (set_in_cr4/clear_in_cr4).
Unfortunately, the set_in_cr4 and clear_in_cr4 helpers also update
the boot-time CR4 state (mmu_cr4_features and the trampoline copy),
which only a small subset of callers actually wanted.

This patch replaces every CR4 access that changes CR4 (i.e. in
functions that don't leave CR4 exactly the way they found it) with
the new helpers cr4_set, cr4_clear, and cr4_set_and_update_boot.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/processor.h | 33 --------------------------------
arch/x86/include/asm/tlbflush.h | 37 ++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/virtext.h | 3 ++-
arch/x86/kernel/cpu/common.c | 10 +++++-----
arch/x86/kernel/cpu/mcheck/mce.c | 3 ++-
arch/x86/kernel/cpu/mcheck/p5.c | 3 ++-
arch/x86/kernel/cpu/mcheck/winchip.c | 3 ++-
arch/x86/kernel/cpu/perf_event.c | 7 ++++---
arch/x86/kernel/i387.c | 3 ++-
arch/x86/kernel/process.c | 5 +++--
arch/x86/kernel/xsave.c | 3 ++-
arch/x86/kvm/vmx.c | 4 ++--
arch/x86/mm/init.c | 4 ++--
arch/x86/xen/enlighten.c | 4 ++--
14 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eb71ec794732..ddd8d13a010f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -578,39 +578,6 @@ static inline void load_sp0(struct tss_struct *tss,
#define set_iopl_mask native_set_iopl_mask
#endif /* CONFIG_PARAVIRT */

-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-extern u32 *trampoline_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features |= mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features &= ~mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
-}
-
typedef struct {
unsigned long seg;
} mm_segment_t;
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 04905bfc508b..95b672f8b493 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,6 +15,43 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif

+/* Set in this cpu's CR4. */
+static inline void cr4_set(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = read_cr4();
+ cr4 |= mask;
+ write_cr4(cr4);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+ write_cr4(cr4);
+}
+
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set(mask);
+}
+
static inline void __native_flush_tlb(void)
{
native_write_cr3(native_read_cr3());
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..7503a642294a 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@

#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/tlbflush.h>

/*
* VMX functions:
@@ -40,7 +41,7 @@ static inline int cpu_has_vmx(void)
static inline void cpu_vmxoff(void)
{
asm volatile (ASM_VMX_VMXOFF : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear(X86_CR4_VMXE);
}

static inline int cpu_vmx_enabled(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e4ab2b42bd6f..7d8400a4b192 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -276,7 +276,7 @@ __setup("nosmep", setup_disable_smep);
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ cr4_set(X86_CR4_SMEP);
}

static __init int setup_disable_smap(char *arg)
@@ -296,9 +296,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)

if (cpu_has(c, X86_FEATURE_SMAP)) {
#ifdef CONFIG_X86_SMAP
- set_in_cr4(X86_CR4_SMAP);
+ cr4_set(X86_CR4_SMAP);
#else
- clear_in_cr4(X86_CR4_SMAP);
+ cr4_clear(X86_CR4_SMAP);
#endif
}
}
@@ -1307,7 +1307,7 @@ void cpu_init(void)

pr_debug("Initializing CPU#%d\n", cpu);

- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

/*
* Initialize the per-CPU GDT with the boot GDT,
@@ -1392,7 +1392,7 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);

if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

load_current_idt();
switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bd9ccda8087f..d1b4903d82bb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
#include <linux/export.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -1455,7 +1456,7 @@ static void __mcheck_cpu_init_generic(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | m_fl, &all_banks);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set(X86_CR4_MCE);

rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..fed632fd79bb 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,7 @@
#include <linux/smp.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -59,7 +60,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
"Intel old style machine check architecture supported.\n");

/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
+ cr4_set(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..e4b151428915 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,6 +7,7 @@
#include <linux/types.h>

#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

@@ -31,7 +32,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);

- set_in_cr4(X86_CR4_MCE);
+ cr4_set(X86_CR4_MCE);

printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2879ecdaac43..2401593df4a3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
@@ -1326,7 +1327,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)

case CPU_STARTING:
if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1832,9 +1833,9 @@ static void change_rdpmc(void *info)
bool enable = !!(unsigned long)info;

if (enable)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set(X86_CR4_PCE);
else
- clear_in_cr4(X86_CR4_PCE);
+ cr4_clear(X86_CR4_PCE);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229f6161..dc369b52e949 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,6 +13,7 @@
#include <asm/sigcontext.h>
#include <asm/processor.h>
#include <asm/math_emu.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
@@ -180,7 +181,7 @@ void fpu_init(void)
if (cpu_has_xmm)
cr4_mask |= X86_CR4_OSXMMEXCPT;
if (cr4_mask)
- set_in_cr4(cr4_mask);
+ cr4_set(cr4_mask);

cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f804dc935d2a..d7e4c27b731f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>

/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -139,7 +140,7 @@ void flush_thread(void)

static void hard_disable_TSC(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ cr4_set(X86_CR4_TSD);
}

void disable_TSC(void)
@@ -156,7 +157,7 @@ void disable_TSC(void)

static void hard_enable_TSC(void)
{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
+ cr4_clear(X86_CR4_TSD);
}

static void enable_TSC(void)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 940b142cc11f..c4057fd56c38 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
+#include <asm/tlbflush.h>
#include <asm/xcr.h>

/*
@@ -452,7 +453,7 @@ static void prepare_fx_sw_frame(void)
*/
static inline void xstate_enable(void)
{
- set_in_cr4(X86_CR4_OSXSAVE);
+ cr4_set(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bfe11cf124a1..a48c26d01ab8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2737,7 +2737,7 @@ static int hardware_enable(void *garbage)
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+ cr4_set(X86_CR4_VMXE);

if (vmm_exclusive) {
kvm_cpu_vmxon(phys_addr);
@@ -2774,7 +2774,7 @@ static void hardware_disable(void *garbage)
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear(X86_CR4_VMXE);
}

static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 66dba36f2343..f64386652bd5 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -144,11 +144,11 @@ static void __init probe_page_size_mask(void)

/* Enable PSE if available */
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_and_update_boot(X86_CR4_PSE);

/* Enable PGE if available */
if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c0cb11fb5008..9acf0af34ed1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1482,10 +1482,10 @@ static void xen_pvh_set_cr_flags(int cpu)
* set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
*/
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_and_update_boot(X86_CR4_PSE);

if (cpu_has_pge)
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_and_update_boot(X86_CR4_PGE);
}

/*
--
1.9.3

2014-10-14 22:58:02

by Andy Lutomirski

Subject: [RFC 4/5] perf: Add pmu callbacks to track event mapping and unmapping

Signed-off-by: Andy Lutomirski <[email protected]>
---
include/linux/perf_event.h | 7 +++++++
kernel/events/core.c | 9 +++++++++
2 files changed, 16 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 707617a8c0f6..88069e319a3f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -206,6 +206,13 @@ struct pmu {
*/
int (*event_init) (struct perf_event *event);

+ /*
+ * Notification that the event was mapped or unmapped. Called
+ * in the context of the mapping task.
+ */
+ void (*event_mapped) (struct perf_event *event); /*optional*/
+ void (*event_unmapped) (struct perf_event *event); /*optional*/
+
#define PERF_EF_START 0x01 /* start the counter when adding */
#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 963bf139e2b2..dc73e9723748 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4008,6 +4008,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)

atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
}

/*
@@ -4027,6 +4030,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);

+ if (event->pmu->event_unmapped)
+ event->pmu->event_unmapped(event);
+
atomic_dec(&rb->mmap_count);

if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4228,6 +4234,9 @@ unlock:
vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;

+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+
return ret;
}

--
1.9.3

2014-10-14 22:57:58

by Andy Lutomirski

Subject: [RFC 3/5] x86: Add a comment clarifying LDT context switching

The code is correct, but only for a rather subtle reason. This
confused me for quite a while when I read switch_mm, so clarify the
code to avoid confusing other people, too.

TBH, I wouldn't be surprised if this code was only correct by
accident.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/mmu_context.h | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 166af2a8e865..04478103df37 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -53,7 +53,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));

- /* Load the LDT, if the LDT is different: */
+ /*
+ * Load the LDT, if the LDT is different.
+ *
+ * It's possible leave_mm(prev) has been called. If so,
+ * then prev->context.ldt could be out of sync with the
+ * LDT descriptor or the LDT register. This can only happen
+ * if prev->context.ldt is non-null, since we never free
+ * an LDT. But LDTs can't be shared across mms, so
+ * prev->context.ldt won't be equal to next->context.ldt.
+ */
if (unlikely(prev->context.ldt != next->context.ldt))
load_LDT_nolock(&next->context);
}
--
1.9.3

2014-10-14 22:58:08

by Andy Lutomirski

Subject: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

We currently allow any process to use rdpmc. This significantly
weakens the protection offered by PR_TSC_DISABLED, and it could be
helpful to users attempting to exploit timing attacks.

Since we can't enable access to individual counters, use a very
coarse heuristic to limit access to rdpmc: allow access only when
a perf_event is mmapped. This protects seccomp sandboxes.

There is plenty of room to further tighten these restrictions. For
example, on x86, *all* perf_event mappings set cap_user_rdpmc. This
should probably be changed to only apply to perf_events that are
accessible using rdpmc.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 19 ++++++++++++
arch/x86/kernel/cpu/perf_event.c | 60 +++++++++++++++++++++++++++++---------
3 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 876e74e8eec7..2e5e4925a63f 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -19,6 +19,8 @@ typedef struct {

struct mutex lock;
void __user *vdso;
+
+ atomic_t perf_mmap_count; /* number of mmapped perf_events */
} mm_context_t;

#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 04478103df37..10239af45a9d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -19,6 +19,21 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
}
#endif /* !CONFIG_PARAVIRT */

+#ifdef CONFIG_PERF_EVENTS
+extern struct static_key rdpmc_enabled;
+
+static inline void load_mm_cr4(struct mm_struct *mm)
+{
+ if (static_key_true(&rdpmc_enabled) &&
+ atomic_read(&mm->context.perf_mmap_count))
+ cr4_set(X86_CR4_PCE);
+ else
+ cr4_clear(X86_CR4_PCE);
+}
+#else
+static inline void load_mm_cr4(struct mm_struct *mm) {}
+#endif
+
/*
* Used for LDT copy/destruction.
*/
@@ -53,6 +68,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));

+ /* Load per-mm CR4 state */
+ load_mm_cr4(next);
+
/*
* Load the LDT, if the LDT is different.
*
@@ -86,6 +104,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
load_LDT_nolock(&next->context);
}
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2401593df4a3..7d42cf60f3b3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
@@ -44,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};

+struct static_key rdpmc_enabled = STATIC_KEY_INIT_TRUE;
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -133,6 +136,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);
+static DEFINE_MUTEX(rdpmc_enable_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

@@ -1326,8 +1330,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;

case CPU_STARTING:
- if (x86_pmu.attr_rdpmc)
- cr4_set(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1806,6 +1808,27 @@ static int x86_pmu_event_init(struct perf_event *event)
return err;
}

+static void refresh_pce(void *ignored)
+{
+ if (current->mm)
+ load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+ if (atomic_inc_return(&current->mm->context.perf_mmap_count) == 1)
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+ if (!current->mm)
+ return;
+
+ if (atomic_dec_and_test(&current->mm->context.perf_mmap_count))
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
static int x86_pmu_event_idx(struct perf_event *event)
{
int idx = event->hw.idx;
@@ -1828,16 +1851,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev,
return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

-static void change_rdpmc(void *info)
-{
- bool enable = !!(unsigned long)info;
-
- if (enable)
- cr4_set(X86_CR4_PCE);
- else
- cr4_clear(X86_CR4_PCE);
-}
-
static ssize_t set_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1852,10 +1865,26 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
if (x86_pmu.attr_rdpmc_broken)
return -ENOTSUPP;

+ mutex_lock(&rdpmc_enable_mutex);
if (!!val != !!x86_pmu.attr_rdpmc) {
- x86_pmu.attr_rdpmc = !!val;
- on_each_cpu(change_rdpmc, (void *)val, 1);
+ if (val) {
+ static_key_slow_inc(&rdpmc_enabled);
+ on_each_cpu(refresh_pce, NULL, 1);
+ smp_wmb();
+ x86_pmu.attr_rdpmc = 1;
+ } else {
+ /*
+ * This direction can race against existing
+ * rdpmc-capable mappings. Try our best regardless.
+ */
+ x86_pmu.attr_rdpmc = 0;
+ smp_wmb();
+ static_key_slow_dec(&rdpmc_enabled);
+ WARN_ON(static_key_true(&rdpmc_enabled));
+ on_each_cpu(refresh_pce, NULL, 1);
+ }
}
+ mutex_unlock(&rdpmc_enable_mutex);

return count;
}
@@ -1899,6 +1928,9 @@ static struct pmu pmu = {

.event_init = x86_pmu_event_init,

+ .event_mapped = x86_pmu_event_mapped,
+ .event_unmapped = x86_pmu_event_unmapped,
+
.add = x86_pmu_add,
.del = x86_pmu_del,
.start = x86_pmu_start,
--
1.9.3

2014-10-14 22:58:52

by Andy Lutomirski

Subject: [RFC 2/5] x86: Store a per-cpu shadow copy of CR4

Context switches and TLB flushes can change individual bits of CR4.
CR4 reads take several cycles, so store a shadow copy of CR4 in a
per-cpu variable.

To avoid wasting a cache line, I added the CR4 shadow to
cpu_tlbstate, which is already touched during context switches.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/tlbflush.h | 52 ++++++++++++++++++++++++++++++-----------
arch/x86/kernel/cpu/common.c | 7 ++++++
arch/x86/kernel/head32.c | 1 +
arch/x86/kernel/head64.c | 2 ++
arch/x86/kvm/vmx.c | 4 ++--
arch/x86/mm/init.c | 8 +++++++
arch/x86/mm/tlb.c | 3 ---
7 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 95b672f8b493..a04cad4bcbc3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,14 +15,37 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif

+struct tlb_state {
+#ifdef CONFIG_SMP
+ struct mm_struct *active_mm;
+ int state;
+#endif
+
+ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+ unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+ this_cpu_write(cpu_tlbstate.cr4, read_cr4());
+}
+
/* Set in this cpu's CR4. */
static inline void cr4_set(unsigned long mask)
{
unsigned long cr4;

- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if (!(cr4 & mask)) {
+ cr4 |= mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ write_cr4(cr4);
+ }
}

/* Clear in this cpu's CR4. */
@@ -30,9 +53,18 @@ static inline void cr4_clear(unsigned long mask)
{
unsigned long cr4;

- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if (cr4 & mask) {
+ cr4 &= ~mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ write_cr4(cr4);
+ }
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
}

/*
@@ -61,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
{
unsigned long cr4;

- cr4 = native_read_cr4();
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
@@ -221,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2

-struct tlb_state {
- struct mm_struct *active_mm;
- int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
static inline void reset_lazy_tlbstate(void)
{
this_cpu_write(cpu_tlbstate.state, 0);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7d8400a4b192..ec73485b00c5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
#include <asm/vsyscall.h>
@@ -1285,6 +1286,12 @@ void cpu_init(void)
int i;

/*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)

asmlinkage __visible void __init i386_start_kernel(void)
{
+ cr4_init_shadow();
sanitize_boot_params(&boot_params);

/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..3b241f0ca005 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -155,6 +155,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

+ cr4_init_shadow();
+
/* Kill off the identity-map trampoline */
reset_early_page_tables();

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a48c26d01ab8..c3927506e0f4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2710,7 +2710,7 @@ static int hardware_enable(void *garbage)
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;

- if (read_cr4() & X86_CR4_VMXE)
+ if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;

INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -4237,7 +4237,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
struct desc_ptr dt;

vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
+ vmcs_writel(HOST_CR4, cr4_read_shadow()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */

vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f64386652bd5..866244267192 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -687,3 +687,11 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}

+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+ .active_mm = &init_mm,
+ .state = 0,
+#endif
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ee61c36d64f8..3250f2371aea 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,9 +14,6 @@
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
- = { &init_mm, 0, };
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
--
1.9.3

2014-10-15 07:37:50

by Hillf Danton

Subject: Re: [RFC 2/5] x86: Store a per-cpu shadow copy of CR4

Hey Andy
>
> Context switches and TLB flushes can change individual bits of CR4.
> CR4 reads take several cycles, so store a shadow copy of CR4 in a
> per-cpu variable.
>
> To avoid wasting a cache line, I added the CR4 shadow to
> cpu_tlbstate, which is already touched during context switches.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/include/asm/tlbflush.h | 52
> ++++++++++++++++++++++++++++++-----------
> arch/x86/kernel/cpu/common.c | 7 ++++++
> arch/x86/kernel/head32.c | 1 +
> arch/x86/kernel/head64.c | 2 ++
> arch/x86/kvm/vmx.c | 4 ++--
> arch/x86/mm/init.c | 8 +++++++
> arch/x86/mm/tlb.c | 3 ---
> 7 files changed, 59 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/include/asm/tlbflush.h
> b/arch/x86/include/asm/tlbflush.h
> index 95b672f8b493..a04cad4bcbc3 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -15,14 +15,37 @@
> #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
> #endif
>
> +struct tlb_state {
> +#ifdef CONFIG_SMP
> + struct mm_struct *active_mm;
> + int state;
> +#endif
> +
> + /*
> + * Access to this CR4 shadow and to H/W CR4 is protected by
> + * disabling interrupts when modifying either one.
> + */
> + unsigned long cr4;
> +};
> +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
> +
> +/* Initialize cr4 shadow for this CPU. */
> +static inline void cr4_init_shadow(void)
> +{
> + this_cpu_write(cpu_tlbstate.cr4, read_cr4());
> +}
> +
> /* Set in this cpu's CR4. */
> static inline void cr4_set(unsigned long mask)
> {
> unsigned long cr4;
>
> - cr4 = read_cr4();
> - cr4 |= mask;
> - write_cr4(cr4);
> + cr4 = this_cpu_read(cpu_tlbstate.cr4);
> + if (!(cr4 & mask)) {

What if cr4 contains bit_A and mask contains bits A and B?

Hillf
> + cr4 |= mask;
> + this_cpu_write(cpu_tlbstate.cr4, cr4);
> + write_cr4(cr4);
> + }
> }
>
[...]

2014-10-15 14:43:44

by Andy Lutomirski

Subject: Re: [RFC 2/5] x86: Store a per-cpu shadow copy of CR4

On Oct 15, 2014 12:37 AM, "Hillf Danton" <[email protected]> wrote:
>
> Hey Andy
> >
> > Context switches and TLB flushes can change individual bits of CR4.
> > CR4 reads take several cycles, so store a shadow copy of CR4 in a
> > per-cpu variable.
> >
> > To avoid wasting a cache line, I added the CR4 shadow to
> > cpu_tlbstate, which is already touched during context switches.
> >
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> > arch/x86/include/asm/tlbflush.h | 52
> > ++++++++++++++++++++++++++++++-----------
> > arch/x86/kernel/cpu/common.c | 7 ++++++
> > arch/x86/kernel/head32.c | 1 +
> > arch/x86/kernel/head64.c | 2 ++
> > arch/x86/kvm/vmx.c | 4 ++--
> > arch/x86/mm/init.c | 8 +++++++
> > arch/x86/mm/tlb.c | 3 ---
> > 7 files changed, 59 insertions(+), 18 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/tlbflush.h
> > b/arch/x86/include/asm/tlbflush.h
> > index 95b672f8b493..a04cad4bcbc3 100644
> > --- a/arch/x86/include/asm/tlbflush.h
> > +++ b/arch/x86/include/asm/tlbflush.h
> > @@ -15,14 +15,37 @@
> > #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
> > #endif
> >
> > +struct tlb_state {
> > +#ifdef CONFIG_SMP
> > + struct mm_struct *active_mm;
> > + int state;
> > +#endif
> > +
> > + /*
> > + * Access to this CR4 shadow and to H/W CR4 is protected by
> > + * disabling interrupts when modifying either one.
> > + */
> > + unsigned long cr4;
> > +};
> > +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
> > +
> > +/* Initialize cr4 shadow for this CPU. */
> > +static inline void cr4_init_shadow(void)
> > +{
> > + this_cpu_write(cpu_tlbstate.cr4, read_cr4());
> > +}
> > +
> > /* Set in this cpu's CR4. */
> > static inline void cr4_set(unsigned long mask)
> > {
> > unsigned long cr4;
> >
> > - cr4 = read_cr4();
> > - cr4 |= mask;
> > - write_cr4(cr4);
> > + cr4 = this_cpu_read(cpu_tlbstate.cr4);
> > + if (!(cr4 & mask)) {
>
> What if cr4 contains bit_A and mask contains bits A and B?

A malfunction. Whoops :) Will fix.

--Andy

>
> Hillf
> > + cr4 |= mask;
> > + this_cpu_write(cpu_tlbstate.cr4, cr4);
> > + write_cr4(cr4);
> > + }
> > }
> >
> [...]

2014-10-16 08:16:46

by Peter Zijlstra

Subject: Re: [RFC 1/5] x86: Clean up cr4 manipulation

On Tue, Oct 14, 2014 at 03:57:35PM -0700, Andy Lutomirski wrote:
> +/* Set in this cpu's CR4. */
> +static inline void cr4_set(unsigned long mask)
> +{
> + unsigned long cr4;
> +
> + cr4 = read_cr4();
> + cr4 |= mask;
> + write_cr4(cr4);
> +}
> +
> +/* Clear in this cpu's CR4. */
> +static inline void cr4_clear(unsigned long mask)
> +{
> + unsigned long cr4;
> +
> + cr4 = read_cr4();
> + cr4 &= ~mask;
> + write_cr4(cr4);
> +}

I would have called these cr4_or() and cr4_nand(). When first reading
this I expected cr4_set() to be a pure write_cr4() and cr4_clear() to do
write_cr4(0) -- which obviously doesn't make too much sense.


cr4_{set,clear}_mask() might be clearer but is more typing, and the
logical ops as suggested should have an unambiguous meaning.

2014-10-16 08:26:33

by Peter Zijlstra

Subject: Re: [RFC 2/5] x86: Store a per-cpu shadow copy of CR4

On Tue, Oct 14, 2014 at 03:57:36PM -0700, Andy Lutomirski wrote:
> Context switches and TLB flushes can change individual bits of CR4.
> CR4 reads take several cycles, so store a shadow copy of CR4 in a
> per-cpu variable.
>
> To avoid wasting a cache line, I added the CR4 shadow to
> cpu_tlbstate, which is already touched during context switches.

I'm a little confused. We should be more specific, I suppose: context
switches don't always change the mm, but CR4 state is per task.

From a quick look, only switch_mm() pokes at the tlb state; switch_to()
does not.

2014-10-16 08:42:37

by Peter Zijlstra

Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Tue, Oct 14, 2014 at 03:57:39PM -0700, Andy Lutomirski wrote:
> We currently allow any process to use rdpmc. This significantly
> weakens the protection offered by PR_TSC_DISABLED, and it could be
> helpful to users attempting to exploit timing attacks.
>
> Since we can't enable access to individual counters, use a very
> coarse heuristic to limit access to rdpmc: allow access only when
> a perf_event is mmapped. This protects seccomp sandboxes.
>
> There is plenty of room to further tighten these restrictions. For
> example, on x86, *all* perf_event mappings set cap_user_rdpmc. This
> should probably be changed to only apply to perf_events that are
> accessible using rdpmc.

So I suppose this patch is a little over engineered,

> @@ -1852,10 +1865,26 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
> if (x86_pmu.attr_rdpmc_broken)
> return -ENOTSUPP;
>
> + mutex_lock(&rdpmc_enable_mutex);
> if (!!val != !!x86_pmu.attr_rdpmc) {
> - x86_pmu.attr_rdpmc = !!val;
> - on_each_cpu(change_rdpmc, (void *)val, 1);
> + if (val) {
> + static_key_slow_inc(&rdpmc_enabled);
> + on_each_cpu(refresh_pce, NULL, 1);
> + smp_wmb();
> + x86_pmu.attr_rdpmc = 1;
> + } else {
> + /*
> + * This direction can race against existing
> + * rdpmc-capable mappings. Try our best regardless.
> + */
> + x86_pmu.attr_rdpmc = 0;
> + smp_wmb();
> + static_key_slow_dec(&rdpmc_enabled);
> + WARN_ON(static_key_true(&rdpmc_enabled));
> + on_each_cpu(refresh_pce, NULL, 1);
> + }
> }
> + mutex_unlock(&rdpmc_enable_mutex);
>
> return count;
> }

why do you care about that rdpmc_enabled static key thing? Also you
should not expose static key control to userspace like this, they can
totally wreck the system. At the very least it should be
static_key_slow_dec_deferred() -- gawd I hate the static_key API.

2014-10-16 11:18:56

by Borislav Petkov

Subject: Re: [RFC 1/5] x86: Clean up cr4 manipulation

On Thu, Oct 16, 2014 at 10:16:33AM +0200, Peter Zijlstra wrote:
> On Tue, Oct 14, 2014 at 03:57:35PM -0700, Andy Lutomirski wrote:
> > +/* Set in this cpu's CR4. */
> > +static inline void cr4_set(unsigned long mask)
> > +{
> > + unsigned long cr4;
> > +
> > + cr4 = read_cr4();
> > + cr4 |= mask;
> > + write_cr4(cr4);
> > +}
> > +
> > +/* Clear in this cpu's CR4. */
> > +static inline void cr4_clear(unsigned long mask)
> > +{
> > + unsigned long cr4;
> > +
> > + cr4 = read_cr4();
> > + cr4 &= ~mask;
> > + write_cr4(cr4);
> > +}
>
> I would have called these cr4_or() and cr4_nand(). When first reading
> this I expected cr4_set() to be a pure write_cr4() and cr4_clear() to do
> write_cr4(0) -- which obviously doesn't make too much sense.
>
>
> cr4_{set,clear}_mask() might maybe be clearer but is more typing, and
> the logical ops as suggested should have unambiguous meaning.

Or maybe ...{set,clear}_bits() because we're setting/clearing a mask of
a bunch of bits...
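
Something like this, i.e. just a rename of the helpers quoted above
(sketch, same bodies):

/* Set bits in this cpu's CR4. */
static inline void cr4_set_bits(unsigned long mask)
{
        write_cr4(read_cr4() | mask);
}

/* Clear bits in this cpu's CR4. */
static inline void cr4_clear_bits(unsigned long mask)
{
        write_cr4(read_cr4() & ~mask);
}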

--
Regards/Gruss,
Boris.
--

2014-10-16 11:29:10

by Borislav Petkov

Subject: Re: [RFC 1/5] x86: Clean up cr4 manipulation

On Thu, Oct 16, 2014 at 01:18:58PM +0200, Borislav Petkov wrote:
> On Thu, Oct 16, 2014 at 10:16:33AM +0200, Peter Zijlstra wrote:
> > On Tue, Oct 14, 2014 at 03:57:35PM -0700, Andy Lutomirski wrote:
> > > +/* Set in this cpu's CR4. */
> > > +static inline void cr4_set(unsigned long mask)
> > > +{
> > > + unsigned long cr4;
> > > +
> > > + cr4 = read_cr4();
> > > + cr4 |= mask;
> > > + write_cr4(cr4);
> > > +}
> > > +
> > > +/* Clear in this cpu's CR4. */
> > > +static inline void cr4_clear(unsigned long mask)
> > > +{
> > > + unsigned long cr4;
> > > +
> > > + cr4 = read_cr4();
> > > + cr4 &= ~mask;
> > > + write_cr4(cr4);
> > > +}

Btw, on a related note, we probably should check whether the bits we
want to set/clear are already set/cleared and then save us the CR4
write.

It probably won't be the case all that much but still...

--
Regards/Gruss,
Boris.
--

2014-10-16 11:49:23

by Borislav Petkov

Subject: Re: [RFC 2/5] x86: Store a per-cpu shadow copy of CR4

On Tue, Oct 14, 2014 at 03:57:36PM -0700, Andy Lutomirski wrote:
> Context switches and TLB flushes can change individual bits of CR4.
> CR4 reads take several cycles, so store a shadow copy of CR4 in a
> per-cpu variable.
>
> To avoid wasting a cache line, I added the CR4 shadow to
> cpu_tlbstate, which is already touched during context switches.

So does this even show in any workloads as any improvement?

Also, what's the rule with reading the shadow CR4? kvm only? Because
svm_set_cr4() in svm.c reads the host CR4 too.

Should we make all code access the shadow CR4 maybe...

--
Regards/Gruss,
Boris.
--

2014-10-16 15:31:10

by Andy Lutomirski

Subject: Re: [RFC 2/5] x86: Store a per-cpu shadow copy of CR4

On Oct 16, 2014 4:49 AM, "Borislav Petkov" <[email protected]> wrote:
>
> On Tue, Oct 14, 2014 at 03:57:36PM -0700, Andy Lutomirski wrote:
> > Context switches and TLB flushes can change individual bits of CR4.
> > CR4 reads take several cycles, so store a shadow copy of CR4 in a
> > per-cpu variable.
> >
> > To avoid wasting a cache line, I added the CR4 shadow to
> > cpu_tlbstate, which is already touched during context switches.
>
> So does this even show in any workloads as any improvement?
>

Unclear. Assuming that a mov from cr4 is more expensive than a cache
miss (which may or may not be true), kernel TLB flushes will get
cheaper. The main reason I did this is to make switching TSD and PCE
a little cheaper, which might be worthwhile.

I think this may be a huge win on many workloads when running as an
SVM guest. IIUC SVM doesn't have accelerated guest CR4 access.

> Also, what's the rule with reading the shadow CR4? kvm only? Because
> svm_set_cr4() in svm.c reads the host CR4 too.

Whoops.

>
> Should we make all code access the shadow CR4 maybe...

That was the intent. In v2, I'll probably rename read_cr4 to
__read_cr4 to make it difficult to miss things.

>
> --
> Regards/Gruss,
> Boris.
> --

2014-10-16 15:33:21

by Andy Lutomirski

Subject: Re: [RFC 1/5] x86: Clean up cr4 manipulation

On Thu, Oct 16, 2014 at 4:29 AM, Borislav Petkov <[email protected]> wrote:
> On Thu, Oct 16, 2014 at 01:18:58PM +0200, Borislav Petkov wrote:
>> On Thu, Oct 16, 2014 at 10:16:33AM +0200, Peter Zijlstra wrote:
>> > On Tue, Oct 14, 2014 at 03:57:35PM -0700, Andy Lutomirski wrote:
>> > > +/* Set in this cpu's CR4. */
>> > > +static inline void cr4_set(unsigned long mask)
>> > > +{
>> > > + unsigned long cr4;
>> > > +
>> > > + cr4 = read_cr4();
>> > > + cr4 |= mask;
>> > > + write_cr4(cr4);
>> > > +}
>> > > +
>> > > +/* Clear in this cpu's CR4. */
>> > > +static inline void cr4_clear(unsigned long mask)
>> > > +{
>> > > + unsigned long cr4;
>> > > +
>> > > + cr4 = read_cr4();
>> > > + cr4 &= ~mask;
>> > > + write_cr4(cr4);
>> > > +}
>
> Btw, on a related note, we probably should check whether the bits we
> want to set/clear are already set/cleared and then save us the CR4
> write.
>
> It probably won't be the case all that much but still...

That's in patch 2. I can move it here, though.

>
> --
> Regards/Gruss,
> Boris.
> --



--
Andy Lutomirski
AMA Capital Management, LLC

2014-10-16 15:38:23

by Andy Lutomirski

Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Thu, Oct 16, 2014 at 1:42 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, Oct 14, 2014 at 03:57:39PM -0700, Andy Lutomirski wrote:
>> We currently allow any process to use rdpmc. This significantly
>> weakens the protection offered by PR_TSC_DISABLED, and it could be
>> helpful to users attempting to exploit timing attacks.
>>
>> Since we can't enable access to individual counters, use a very
>> coarse heuristic to limit access to rdpmc: allow access only when
>> a perf_event is mmapped. This protects seccomp sandboxes.
>>
>> There is plenty of room to further tighten these restrictions. For
>> example, on x86, *all* perf_event mappings set cap_user_rdpmc. This
>> should probably be changed to only apply to perf_events that are
>> accessible using rdpmc.
>
> So I suppose this patch is a little over engineered,

:) I won't argue.

>
>> @@ -1852,10 +1865,26 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
>> if (x86_pmu.attr_rdpmc_broken)
>> return -ENOTSUPP;
>>
>> + mutex_lock(&rdpmc_enable_mutex);
>> if (!!val != !!x86_pmu.attr_rdpmc) {
>> - x86_pmu.attr_rdpmc = !!val;
>> - on_each_cpu(change_rdpmc, (void *)val, 1);
>> + if (val) {
>> + static_key_slow_inc(&rdpmc_enabled);
>> + on_each_cpu(refresh_pce, NULL, 1);
>> + smp_wmb();
>> + x86_pmu.attr_rdpmc = 1;
>> + } else {
>> + /*
>> + * This direction can race against existing
>> + * rdpmc-capable mappings. Try our best regardless.
>> + */
>> + x86_pmu.attr_rdpmc = 0;
>> + smp_wmb();
>> + static_key_slow_dec(&rdpmc_enabled);
>> + WARN_ON(static_key_true(&rdpmc_enabled));
>> + on_each_cpu(refresh_pce, NULL, 1);
>> + }
>> }
>> + mutex_unlock(&rdpmc_enable_mutex);
>>
>> return count;
>> }
>
> why do you care about that rdpmc_enabled static key thing? Also you
> should not expose static key control to userspace like this, they can
> totally wreck the system. At the very least it should be
> static_key_slow_dec_deferred() -- gawd I hate the static_key API.

This particular control is only available to root, so I don't think it
matters too much. I did it this way to avoid hitting an extra dcache
line on every switch_mm.

A nicer solution might be to track whether rdpmc is allowed for each
perf_event and to count the number that allow rdpmc. That would cause
'echo 0 > rdpmc' to only work for new perf_events, but it fixes a
race.

Doing this will require passing more info to
arch_perf_update_userpage, I think. Should I do that? It'll probably
get a better result, but this patchset will get even longer and even
more overengineered.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

2014-10-16 15:47:41

by Borislav Petkov

Subject: Re: [RFC 1/5] x86: Clean up cr4 manipulation

On Thu, Oct 16, 2014 at 08:32:57AM -0700, Andy Lutomirski wrote:
> That's in patch 2. I can move it here, though.

Right, saw it after sending. And nah, don't bother.

--
Regards/Gruss,
Boris.
--

2014-10-16 15:49:12

by Borislav Petkov

Subject: Re: [RFC 3/5] x86: Add a comment clarifying LDT context switching

On Tue, Oct 14, 2014 at 03:57:37PM -0700, Andy Lutomirski wrote:
> The code is correct, but only for a rather subtle reason. This
> confused me for quite a while when I read switch_mm, so clarify the
> code to avoid confusing other people, too.
>
> TBH, I wouldn't be surprised if this code was only correct by
> accident.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/include/asm/mmu_context.h | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 166af2a8e865..04478103df37 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -53,7 +53,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> /* Stop flush ipis for the previous mm */
> cpumask_clear_cpu(cpu, mm_cpumask(prev));
>
> - /* Load the LDT, if the LDT is different: */
> + /*
> + * Load the LDT, if the LDT is different.
> + *
> + * It's possible leave_mm(prev) has been called. If so,
> + * then prev->context.ldt could be out of sync with the
> + * LDT descriptor or the LDT register. This can only happen

I'm staring at the code and trying to figure out where on the leave_mm()
path this could happen. Got any code pointers?

:-)

> + * if prev->context.ldt is non-null, since we never free
> + * an LDT. But LDTs can't be shared across mms, so
> + * prev->context.ldt won't be equal to next->context.ldt.
> + */
> if (unlikely(prev->context.ldt != next->context.ldt))
> load_LDT_nolock(&next->context);
> }
> --
> 1.9.3
>

--
Regards/Gruss,
Boris.
--

2014-10-16 15:57:54

by Borislav Petkov

Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Thu, Oct 16, 2014 at 10:42:27AM +0200, Peter Zijlstra wrote:
> gawd I hate the static_key API.

Yahaa, I like to twist my brain for fun just by staring at
sched_clock_stable() logic.

Oh, and it needs fixing btw because if you look at the asm that comes
out, after the jump labels have run, you get an unconditional jump to
the likely code doing RDTSC.

This should be avoided: we should have the likely code laid out first
with the NOP, and have the jump labels patch in the unconditional JMP
only on the small number of systems where we're unlikely to be using
the TSC... For that we'd need to pound on jump labels a bit more though.

--
Regards/Gruss,
Boris.
--

2014-10-16 16:22:06

by Andy Lutomirski

Subject: Re: [RFC 3/5] x86: Add a comment clarifying LDT context switching

On Thu, Oct 16, 2014 at 8:49 AM, Borislav Petkov <[email protected]> wrote:
> On Tue, Oct 14, 2014 at 03:57:37PM -0700, Andy Lutomirski wrote:
>> The code is correct, but only for a rather subtle reason. This
>> confused me for quite a while when I read switch_mm, so clarify the
>> code to avoid confusing other people, too.
>>
>> TBH, I wouldn't be surprised if this code was only correct by
>> accident.
>>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>> arch/x86/include/asm/mmu_context.h | 11 ++++++++++-
>> 1 file changed, 10 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>> index 166af2a8e865..04478103df37 100644
>> --- a/arch/x86/include/asm/mmu_context.h
>> +++ b/arch/x86/include/asm/mmu_context.h
>> @@ -53,7 +53,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
>> /* Stop flush ipis for the previous mm */
>> cpumask_clear_cpu(cpu, mm_cpumask(prev));
>>
>> - /* Load the LDT, if the LDT is different: */
>> + /*
>> + * Load the LDT, if the LDT is different.
>> + *
>> + * It's possible leave_mm(prev) has been called. If so,
>> + * then prev->context.ldt could be out of sync with the
>> + * LDT descriptor or the LDT register. This can only happen
>
> I'm staring at the code and trying to figure out where on the leave_mm()
> path this could happen. Got any code pointers?

I think it's the same as in the other case in switch_mm. leave_mm
does cpumask_clear_cpu(cpu, mm_cpumask(active_mm)), and, once that has
happened, modify_ldt won't send an IPI to this CPU. So, if leave_mm
runs, and then another CPU calls modify_ldt on the mm that is in lazy
mode here, it won't update our LDT register, so the LDT register and
prev->context.ldt might not match.

I think that, if that's the case, then prev->context.ldt can't be NULL
and can't have the same non-NULL value as next->context.ldt. I hope.

I'll try to make the comment clearer in v2.

--Andy

>
> :-)
>
>> + * if prev->context.ldt is non-null, since we never free
>> + * an LDT. But LDTs can't be shared across mms, so
>> + * prev->context.ldt won't be equal to next->context.ldt.
>> + */
>> if (unlikely(prev->context.ldt != next->context.ldt))
>> load_LDT_nolock(&next->context);
>> }
>> --
>> 1.9.3
>>
>
> --
> Regards/Gruss,
> Boris.
> --



--
Andy Lutomirski
AMA Capital Management, LLC

2014-10-17 00:01:19

by Andy Lutomirski

Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

The current cap_user_rdpmc code seems rather confused to me. On x86,
*all* events set cap_user_rdpmc if the global rdpmc control is set.
But only x86_pmu events define .event_idx, so amd uncore events won't
actually expose their rdpmc index to userspace.

Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
that gets set on all events created while rdpmc == 1, to change
x86_pmu_event_idx to do something like:

if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
return event->hw.event_base_rdpmc + 1;
else
return 0;

and to change arch_perf_update_userpage cap_user_rdpmc to match
PERF_X86_EVENT_RDPMC_PERMITTED?

Then we could ditch the static key and greatly simplify writes to the
rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.

This would be a user-visible change on AMD, and I can't test it.
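
Roughly the following sketch (the flag name is made up, and
arch_perf_update_userpage() would need the event passed in to set
cap_user_rdpmc this way; untested):

/* at event creation time, e.g. in x86_pmu_event_init(): */
if (x86_pmu.attr_rdpmc)
        event->hw.flags |= PERF_X86_EVENT_RDPMC_PERMITTED;

static int x86_pmu_event_idx(struct perf_event *event)
{
        if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED))
                return 0;

        return event->hw.event_base_rdpmc + 1;
}

/* and the userpage would report the same thing: */
userpg->cap_user_rdpmc =
        !!(event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED);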


On a semi-related note: would this all be nicer if there were vdso
function __u64 __vdso_perf_event__read_count(int fd, void *userpage)?
This is very easy to do nowadays. If we got *really* fancy, it would
be possible to have an rdpmc_safe in the vdso, which has some
benefits, although it would be a bit evil and wouldn't work if
userspace tracers like pin are in use.

--Andy

2014-10-19 20:23:42

by Andy Lutomirski

Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Thu, Oct 16, 2014 at 5:00 PM, Andy Lutomirski <[email protected]> wrote:
> The current cap_user_rdpmc code seems rather confused to me. On x86,
> *all* events set cap_user_rdpmc if the global rdpmc control is set.
> But only x86_pmu events define .event_idx, so amd uncore events won't
> actually expose their rdpmc index to userspace.
>
> Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
> that gets set on all events created while rdpmc == 1, to change
> x86_pmu_event_idx to do something like:
>
> if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
> return event->hw.event_base_rdpmc + 1;
> else
> return 0;
>
> and to change arch_perf_update_userpage cap_user_rdpmc to match
> PERF_X86_EVENT_RDPMC_PERMITTED?
>
> Then we could ditch the static key and greatly simplify writes to the
> rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
>
> This would be a user-visible change on AMD, and I can't test it.
>
>
> On a semi-related note: would this all be nicer if there were vdso
> function __u64 __vdso_perf_event__read_count(int fd, void *userpage)?
> This is very easy to do nowadays. If we got *really* fancy, it would
> be possible to have an rdpmc_safe in the vdso, which has some
> benefits, although it would be a bit evil and wouldn't work if
> userspace tracers like pin are in use.
>

Also, I don't understand the purpose of cap_user_time. Wouldn't it be
easier to just record the last CLOCK_MONOTONIC time and let the user
call __vdso_clock_gettime if they need an updated time?

--Andy

2014-10-19 21:33:48

by Peter Zijlstra

Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Sun, Oct 19, 2014 at 01:23:17PM -0700, Andy Lutomirski wrote:
> On Thu, Oct 16, 2014 at 5:00 PM, Andy Lutomirski <[email protected]> wrote:
> > The current cap_user_rdpmc code seems rather confused to me. On x86,
> > *all* events set cap_user_rdpmc if the global rdpmc control is set.
> > But only x86_pmu events define .event_idx, so amd uncore events won't
> > actually expose their rdpmc index to userspace.
> >
> > Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
> > that gets set on all events created while rdpmc == 1, to change
> > x86_pmu_event_idx to do something like:
> >
> > if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
> > return event->hw.event_base_rdpmc + 1;
> > else
> > return 0;
> >
> > and to change arch_perf_update_userpage cap_user_rdpmc to match
> > PERF_X86_EVENT_RDPMC_PERMITTED?
> >
> > Then we could ditch the static key and greatly simplify writes to the
> > rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
> >
> > This would be a user-visible change on AMD, and I can't test it.
> >
> >
> > On a semi-related note: would this all be nicer if there were vdso
> > function __u64 __vdso_perf_event__read_count(int fd, void *userpage)?
> > This is very easy to do nowadays. If we got *really* fancy, it would
> > be possible to have an rdpmc_safe in the vdso, which has some
> > benefits, although it would be a bit evil and wouldn't work if
> > userspace tracers like pin are in use.
> >
>
> Also, I don't understand the purpose of cap_user_time. Wouldn't it be
> easier to just record the last CLOCK_MONOTONIC time and let the user
> call __vdso_clock_gettime if they need an updated time?

Because perf doesn't use CLOCK_MONOTONIC. For performance reasons we
used the sched_clock stuff, which tries to make the best of the TSC
without falling back to the HPET and the like.

Not to mention that CLOCK_MONOTONIC was not available from NMI context
until very recently.

Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
from perf sample timestamps") seem to suggest people actually use TSC
for things as well.

Now we might change to using the new NMI-safe CLOCK_MONOTONIC (with a
fallback to the sched_clock stuff on time-challenged hardware) in
order to ease correlation with other trace thingies, but even then it
makes sense to have this: having it here and reading the TSC within
the seqcount loop ensures you get consistent data and touch fewer
cachelines for reading.
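
(For reference, roughly the documented pattern from the
perf_event_mmap_page comments -- the time fields are read under the
same seqcount as the counter, and rdtsc() here is illustrative:)

u32 seq, idx, time_mult, time_shift;
u64 count, enabled, running, cyc, time_offset, quot, rem, delta;

do {
        seq = pc->lock;
        barrier();

        enabled = pc->time_enabled;
        running = pc->time_running;

        if (pc->cap_user_time && enabled != running) {
                cyc = rdtsc();
                time_offset = pc->time_offset;
                time_mult   = pc->time_mult;
                time_shift  = pc->time_shift;
        }

        idx = pc->index;
        count = pc->offset;
        if (pc->cap_user_rdpmc && idx)
                count += rdpmc(idx - 1);

        barrier();
} while (pc->lock != seq);

/* extrapolate enabled/running to "now" without entering the kernel: */
quot = cyc >> time_shift;
rem = cyc & (((u64)1 << time_shift) - 1);
delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift);
enabled += delta;
if (idx)
        running += delta;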

2014-10-19 21:35:37

by Peter Zijlstra

Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Thu, Oct 16, 2014 at 05:00:56PM -0700, Andy Lutomirski wrote:
> The current cap_user_rdpmc code seems rather confused to me. On x86,
> *all* events set cap_user_rdpmc if the global rdpmc control is set.
> But only x86_pmu events define .event_idx, so amd uncore events won't
> actually expose their rdpmc index to userspace.
>
> Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
> that gets set on all events created while rdpmc == 1, to change
> x86_pmu_event_idx to do something like:
>
> if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
>         return event->hw.event_base_rdpmc + 1;
> else
>         return 0;
>
> and to change arch_perf_update_userpage cap_user_rdpmc to match
> PERF_X86_EVENT_RDPMC_PERMITTED?
>
> Then we could ditch the static key and greatly simplify writes to the
> rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
>
> This would be a user-visible change on AMD, and I can't test it.

I have AMD hardware to test this. But yes something like that seems
fine.

2014-10-19 22:06:08

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Oct 19, 2014 2:33 PM, "Peter Zijlstra" <[email protected]> wrote:
>
> On Sun, Oct 19, 2014 at 01:23:17PM -0700, Andy Lutomirski wrote:
> > On Thu, Oct 16, 2014 at 5:00 PM, Andy Lutomirski <[email protected]> wrote:
> > > The current cap_user_rdpmc code seems rather confused to me. On x86,
> > > *all* events set cap_user_rdpmc if the global rdpmc control is set.
> > > But only x86_pmu events define .event_idx, so amd uncore events won't
> > > actually expose their rdpmc index to userspace.
> > >
> > > Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
> > > that gets set on all events created while rdpmc == 1, to change
> > > x86_pmu_event_idx to do something like:
> > >
> > > if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
> > >         return event->hw.event_base_rdpmc + 1;
> > > else
> > >         return 0;
> > >
> > > and to change arch_perf_update_userpage cap_user_rdpmc to match
> > > PERF_X86_EVENT_RDPMC_PERMITTED?
> > >
> > > Then we could ditch the static key and greatly simplify writes to the
> > > rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
> > >
> > > This would be a user-visible change on AMD, and I can't test it.
> > >
> > >
> > > On a semi-related note: would this all be nicer if there were vdso
> > > function __u64 __vdso_perf_event__read_count(int fd, void *userpage)?
> > > This is very easy to do nowadays. If we got *really* fancy, it would
> > > be possible to have an rdpmc_safe in the vdso, which has some
> > > benefits, although it would be a bit evil and wouldn't work if
> > > userspace tracers like pin are in use.
> > >
> >
> > Also, I don't understand the purpose of cap_user_time. Wouldn't it be
> > easier to just record the last CLOCK_MONOTONIC time and let the user
> > call __vdso_clock_gettime if they need an updated time?
>
> Because perf doesn't use CLOCK_MONOTONIC. Due to performance
> considerations we used the sched_clock stuff, which tries its best to
> make the best of the TSC without reverting to HPET and the like.
>
> Not to mention that CLOCK_MONOTONIC was not available from NMI context
> until very recently.

I'm only talking about the userspace access to when an event was
enabled and how long it's been running. I think that's what the
cap_user_time stuff is for. I don't think those parameters are
touched from NMI, right?

Point taken about sched_clock, though.

>
> Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
> from perf sample timestamps") seem to suggest people actually use TSC
> for things as well.
>
> Now we might change to using the new NMI safe CLOCK_MONOTONIC (with a
> fallback to use the sched_clock stuff on time challenged hardware) in
> order to ease the correlation between other trace thingies, but even
> then it makes sense to have this, having it here and reading the TSC
> within the seqcount loop ensures you've got consistent data and touch
> less cachelines for reading.

True.

OTOH, people (i.e. I) have optimized the crap out of
__vdso_clock_gettime, and __vdso_perf_event_whatever could be
similarly optimized.

--Andy

2014-10-19 22:20:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Sun, Oct 19, 2014 at 03:05:42PM -0700, Andy Lutomirski wrote:
> On Oct 19, 2014 2:33 PM, "Peter Zijlstra" <[email protected]> wrote:

> > > Also, I don't understand the purpose of cap_user_time. Wouldn't it be
> > > easier to just record the last CLOCK_MONOTONIC time and let the user
> > > call __vdso_clock_gettime if they need an updated time?
> >
> > Because perf doesn't use CLOCK_MONOTONIC. Due to performance
> > considerations we used the sched_clock stuff, which tries its best to
> > make the best of the TSC without reverting to HPET and the like.
> >
> > Not to mention that CLOCK_MONOTONIC was not available from NMI context
> > until very recently.
>
> I'm only talking about the userspace access to when an event was
> enabled and how long it's been running. I think that's what the
> cap_user_time stuff is for. I don't think those parameters are
> touched from NMI, right?
>
> Point taken about sched_clock, though.

Well, mixing two time bases, one TSC based and one CLOCK_MONOTONIC is
just asking for trouble IMO ;-)

> > Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
> > from perf sample timestamps") seem to suggest people actually use TSC
> > for things as well.
> >
> > Now we might change to using the new NMI safe CLOCK_MONOTONIC (with a
> > fallback to use the sched_clock stuff on time challenged hardware) in
> > order to ease the correlation between other trace thingies, but even
> > then it makes sense to have this, having it here and reading the TSC
> > within the seqcount loop ensures you've got consistent data and touch
> > less cachelines for reading.
>
> True.
>
> OTOH, people (i.e. I) have optimized the crap out of
> __vdso_clock_gettime, and __vdso_perf_event_whatever could be
> similarly optimized.

Maybe, but at that point we commit to yet another ABI... I'd rather just
put a 'sane' implementation in a library or so.

2014-10-19 22:58:17

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Sun, Oct 19, 2014 at 3:20 PM, Peter Zijlstra <[email protected]> wrote:
> On Sun, Oct 19, 2014 at 03:05:42PM -0700, Andy Lutomirski wrote:
>> On Oct 19, 2014 2:33 PM, "Peter Zijlstra" <[email protected]> wrote:
>
>> > > Also, I don't understand the purpose of cap_user_time. Wouldn't it be
>> > > easier to just record the last CLOCK_MONOTONIC time and let the user
>> > > call __vdso_clock_gettime if they need an updated time?
>> >
>> > Because perf doesn't use CLOCK_MONOTONIC. Due to performance
>> > considerations we used the sched_clock stuff, which tries its best to
>> > make the best of the TSC without reverting to HPET and the like.
>> >
>> > Not to mention that CLOCK_MONOTONIC was not available from NMI context
>> > until very recently.
>>
>> I'm only talking about the userspace access to when an event was
>> enabled and how long it's been running. I think that's what the
>> cap_user_time stuff is for. I don't think those parameters are
>> touched from NMI, right?
>>
>> Point taken about sched_clock, though.
>
> Well, mixing two time bases, one TSC based and one CLOCK_MONOTONIC is
> just asking for trouble IMO ;-)

Fair enough.

>
>> > Also, things like c73deb6aecda ("perf/x86: Add ability to calculate TSC
>> > from perf sample timestamps") seem to suggest people actually use TSC
>> > for things as well.
>> >
>> > Now we might change to using the new NMI safe CLOCK_MONOTONIC (with a
>> > fallback to use the sched_clock stuff on time challenged hardware) in
>> > order to ease the correlation between other trace thingies, but even
>> > then it makes sense to have this, having it here and reading the TSC
>> > within the seqcount loop ensures you've got consistent data and touch
>> > less cachelines for reading.
>>
>> True.
>>
>> OTOH, people (i.e. I) have optimized the crap out of
>> __vdso_clock_gettime, and __vdso_perf_event_whatever could be
>> similarly optimized.
>
> Maybe, but at that point we commit to yet another ABI... I'd rather just
> put a 'sane' implementation in a library or so.

This cuts both ways, though. For vdso timekeeping, the underlying
data structure has changed repeatedly, sometimes to add features, and
sometimes for performance, and the vdso has done a good job insulating
userspace from it. (In fact, until 3.16, even the same exact kernel
version couldn't be relied on to have the same data structure with
different configs, and even now, no one really wants to teach user
libraries how to parse the pvclock data structures.)

I would certainly not suggest putting anything beyond the bare minimum
into the vdso.

FWIW, something should probably specify exactly when it's safe to try
a userspace rdpmc. I think that the answer is that, for a perf event
watching a pid, only that pid can do it (in particular, other threads
must not try). For a perf event monitoring a whole cpu, the answer is
less clear to me.

--Andy

2014-10-20 00:08:31

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Sun, Oct 19, 2014 at 2:35 PM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Oct 16, 2014 at 05:00:56PM -0700, Andy Lutomirski wrote:
>> The current cap_user_rdpmc code seems rather confused to me. On x86,
>> *all* events set cap_user_rdpmc if the global rdpmc control is set.
>> But only x86_pmu events define .event_idx, so amd uncore events won't
>> actually expose their rdpmc index to userspace.
>>
>> Would it make more sense to add a flag PERF_X86_EVENT_RDPMC_PERMITTED
>> that gets set on all events created while rdpmc == 1, to change
>> x86_pmu_event_idx to do something like:
>>
>> if (event->hw.flags & PERF_X86_EVENT_RDPMC_PERMITTED)
>>         return event->hw.event_base_rdpmc + 1;
>> else
>>         return 0;
>>
>> and to change arch_perf_update_userpage cap_user_rdpmc to match
>> PERF_X86_EVENT_RDPMC_PERMITTED?
>>
>> Then we could ditch the static key and greatly simplify writes to the
>> rdpmc flag by just counting PERF_X86_EVENT_RDPMC_PERMITTED events.
>>
>> This would be a user-visible change on AMD, and I can't test it.
>
> I have AMD hardware to test this. But yes something like that seems
> fine.

Before I totally screw this up: is .event_idx used for anything except
userspace rdpmc? There are a whole bunch of implementations of that
callback:

- perf_event_idx_default seems fishy
- power_pmu_event_idx seems even fishier
- cpumsf_pmu_event_idx is the same as power_pmu_event_idx.
- perf_swevent_event_idx returns 0.

etc.

x86 is the only implementation of arch_perf_update_userpage, which
makes me think that the .event_idx callback should just be removed and
that arch_perf_update_userpage should be responsible for filling it in
if needed.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

2014-10-20 08:33:36

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Sun, Oct 19, 2014 at 03:57:54PM -0700, Andy Lutomirski wrote:
> > Maybe, but at that point we commit to yet another ABI... I'd rather just
> > put a 'sane' implementation in a library or so.
>
> This cuts both ways, though. For vdso timekeeping, the underlying
> data structure has changed repeatedly, sometimes to add features, and
> sometimes for performance, and the vdso has done a good job insulating
> userspace from it. (In fact, until 3.16, even the same exact kernel
> version couldn't be relied on to have the same data structure with
> different configs, and even now, no one really wants to teach user
> libraries how to parse the pvclock data structures.)

Fair enough, but as it stands we've already committed to the data
structure exposed to userspace.

> I would certainly not suggest putting anything beyond the bare minimum
> into the vdso.

Depends on what you really want to do, I suppose. If you've got a pinned
event and know there cannot be multiplexing, skipping the time reads,
the multiplications and all that saves a ton of cycles. But in the
generic case I suppose you have to do all that.

> FWIW, something should probably specify exactly when it's safe to try
> a userspace rdpmc. I think that the answer is that, for a perf event
> watching a pid, only that pid can do it (in particular, other threads
> must not try). For a perf event monitoring a whole cpu, the answer is
> less clear to me.

This all was really only meant to be used for self-monitoring, i.e. where
an event is attached to the very same task; anything else and I'm fine
with disabling it.

2014-10-20 08:48:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Sun, Oct 19, 2014 at 05:08:08PM -0700, Andy Lutomirski wrote:
> Before I totally screw this up: is .event_idx used for anything except
> userspace rdpmc?

It should only be used for that.

> There are a whole bunch of implementations of that
> callback:
>
> - perf_event_idx_default seems fishy

I suppose I did that to encode the rule that 0 := disabled, figuring
that if there is no actual instruction to access the data, all of this
would be pointless anyhow.

> - power_pmu_event_idx seems even fishier

Agreed, I preserved behaviour in 35edc2a5095e ("perf, arch: Rework
perf_event_index()") and at that time asked about why this is. Nobody
replied, let's try again.

Michael, Anton?

> - cpumsf_pmu_event_idx is the same as power_pmu_event_idx.

Oh cute, let's ask the s390 people, do you guys have a userspace
instruction to read the actual counter value?

> - perf_swevent_event_idx returns 0.

Right, guaranteed no way to access any of that, maybe we should make
the default like that too.

> etc.
>
> x86 is the only implementation of arch_perf_update_userpage, which
> makes me think that the .event_idx callback should just be removed and
> that arch_perf_update_userpage should be responsible for filling it in
> if needed.

It's a per-pmu thing, with the x86 hardware pmu being the only one that's
actually known (to me) to be working.

All the other x86 pmus like the software, uncore, etc. don't support
this.

That said, there's definitely room for improvement here.

2014-10-20 09:24:23

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Mon, 20 Oct 2014 10:48:13 +0200
Peter Zijlstra <[email protected]> wrote:

> > - cpumsf_pmu_event_idx is the same as power_pmu_event_idx.
>
> Oh cute, let's ask the s390 people, do you guys have a userspace
> instruction to read the actual counter value?

The "extract cpu counter" ECCTR instruction can be authorized to
be used in user-space with a bit in CR0. With the current code
this bit is not set, the cpu measurement facility is not usable
in user space without a patch to the kernel.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2014-10-20 10:51:23

by Hendrik Brueckner

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Mon, Oct 20, 2014 at 10:48:13AM +0200, Peter Zijlstra wrote:
> On Sun, Oct 19, 2014 at 05:08:08PM -0700, Andy Lutomirski wrote:
> > There are a whole bunch of implementations of that
> > callback:
> >
> > - perf_event_idx_default seems fishy
>
> I suppose I did that to encode the rule that 0 := disabled, figuring
> that if there is no actual instruction to access the data, all of this
> would be pointless anyhow.

See comment on "perf_swevent_event_idx returns 0".

>
> > - cpumsf_pmu_event_idx is the same as power_pmu_event_idx.
>
> Oh cute, let's ask the s390 people, do you guys have a userspace
> instruction to read the actual counter value?

For the hardware sampling pmu (cpumsf) there is no instruction to read
sampling data in user space. Also the event->hw.index is always set
to zero anyway. I am going to remove this function and let the core
event code handle the proper default.

As Martin Schwidefsky pointed out in his mail, there is an instruction
to read some hardware counter data (different from the sampling data).
User space is prohibited from using this instruction. Instead, user
space can use another approach called runtime instrumentation for
self-monitoring.

>
> > - perf_swevent_event_idx returns 0.
>
> Right, guaranteed no way to access any of that, maybe we should make
> the default like that too.

I think it would make sense to return 0 as default in the
perf_event_idx_default() and let each PMU/arch that actually supports
reading PMCs from user space return the proper index. And according
to tools/perf/design.txt, index must be non-zero to trigger a user space
read.

Thanks and kind regards,
Hendrik

2014-10-20 16:49:52

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Mon, Oct 20, 2014 at 1:33 AM, Peter Zijlstra <[email protected]> wrote:
> On Sun, Oct 19, 2014 at 03:57:54PM -0700, Andy Lutomirski wrote:
>> > Maybe, but at that point we commit to yet another ABI... I'd rather just
>> > put a 'sane' implementation in a library or so.
>>
>> This cuts both ways, though. For vdso timekeeping, the underlying
>> data structure has changed repeatedly, sometimes to add features, and
>> sometimes for performance, and the vdso has done a good job insulating
>> userspace from it. (In fact, until 3.16, even the same exact kernel
>> version couldn't be relied on to have the same data structure with
>> different configs, and even now, no one really wants to teach user
>> libraries how to parse the pvclock data structures.)
>
> Fair enough, but as it stands we've already committed to the data
> structure exposed to userspace.

True.

OTOH, if a vdso function gets added, a few releases go by, and all the
userspace tools get updated, then the old data structure could be
dropped if needed by clearing cap_user_rdpmc.

Anyway, this is so far out of scope for the current project that I'm
going to ignore it.

>> FWIW, something should probably specify exactly when it's safe to try
>> a userspace rdpmc. I think that the answer is that, for a perf event
>> watching a pid, only that pid can do it (in particular, other threads
>> must not try). For a perf event monitoring a whole cpu, the answer is
>> less clear to me.
>
> This all was really only meant to be used for self-monitoring, i.e. where
> an event is attached to the very same task; anything else and I'm fine
> with disabling it.

Actually implementing this might be a touch awkward. I can check
whether an event has a task context that matches the creating task,
but that's not necessarily the same thing as the task that mmaps it.
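
For concreteness, the check described above would look something like the
sketch below; the helper name is made up, and, as noted, ctx->task is the
creating context's task, which is not necessarily the task doing the mmap.

/* Sketch only: is the current (mmapping) task the one the event's
 * context is attached to? */
static bool perf_event_is_self_monitoring(struct perf_event *event)
{
        return event->ctx && event->ctx->task == current;
}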

--Andy

2014-10-20 17:40:14

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Mon, Oct 20, 2014 at 9:49 AM, Andy Lutomirski <[email protected]> wrote:
> On Mon, Oct 20, 2014 at 1:33 AM, Peter Zijlstra <[email protected]> wrote:
>> On Sun, Oct 19, 2014 at 03:57:54PM -0700, Andy Lutomirski wrote:
>>> > Maybe, but at that point we commit to yet another ABI... I'd rather just
>>> > put a 'sane' implementation in a library or so.
>>>
>>> This cuts both ways, though. For vdso timekeeping, the underlying
>>> data structure has changed repeatedly, sometimes to add features, and
>>> sometimes for performance, and the vdso has done a good job insulating
>>> userspace from it. (In fact, until 3.16, even the same exact kernel
>>> version couldn't be relied on to have the same data structure with
>>> different configs, and even now, no one really wants to teach user
>>> libraries how to parse the pvclock data structures.)
>>
>> Fair enough, but as it stands we've already committed to the data
>> structure exposed to userspace.
>
> True.
>
> OTOH, if a vdso function gets added, a few releases go by, and all the
> userspace tools get updated, then the old data structure could be
> dropped if needed by clearing cap_user_rdpmc.
>
> Anyway, this is so far out of scope for the current project that I'm
> going to ignore it.

OK, I lied.

I haven't tested it, but it looks like any existing users of
cap_user_rdpmc may have serious issues. That flag is set to 1 for
essentially all perf_events on x86, even events that aren't part of
the x86_pmu. Since the default .event_idx callback doesn't return
zero, lots of other events will appear to be rdpmcable. This includes
the AMD uncore pmu, which looks like it actually supports rdpmc, but
hwc->idx seems to be missing an offset.

If this is the case, then user code can't reliably use the userpage
rdpmc mechanism, so maybe it should be deprecated (or at least get a
new flag bit).

--Andy

2014-10-21 04:12:13

by Vince Weaver

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Tue, 14 Oct 2014, Andy Lutomirski wrote:

> This little series tightens up rdpmc permissions. With it applied,
> rdpmc can only be used if a perf_event is actually mmapped. For now,
> this is only really useful for seccomp.

So just to be difficult...

I am aware of at least one group who is doing low-latency performance
measures using rdpmc on Linux.

They start the counters manually by poking the MSRs directly (bypassing
perf_event_open()).

They use rdpmc, grateful for the fact that currently CR4 is set up so they
can do this w/o patching the kernel.

These patches of course would break this use case...

Vince

2014-10-21 04:28:54

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Mon, Oct 20, 2014 at 9:06 PM, Vince Weaver <[email protected]> wrote:
> On Tue, 14 Oct 2014, Andy Lutomirski wrote:
>
>> This little series tightens up rdpmc permissions. With it applied,
>> rdpmc can only be used if a perf_event is actually mmapped. For now,
>> this is only really useful for seccomp.
>
> So just to be difficult...
>
> I am aware of at least one group who is doing low-latency performance
> measures using rdpmc on Linux.
>
> They start the counters manually by poking the MSRs directly (bypassing
> perf_event_open()).
>
> They use rdpmc, grateful for the fact that currently CR4 is set up so they
> can do this w/o patching the kernel.
>
> These patches of course would break this use case...

ISTM it would be a lot better to use the perf subsystem for this. You
can probably pin an event to a pmu. I don't know whether you can pin
an event systemwide. I also don't know whether pinning an event will
prevent perf from changing its value periodically, and it certainly
doesn't magically make wraparound issues go away.

It would be easy to add a new setting for rdpmc to deal with the PCE part.

E.g. rdpmc = 2 could allow all tasks to use rdpmc regardless of
whether they have an event mapped. (And I would have to re-add the
static key. Sigh.)

--Andy

P.S. Hey, Intel, you forgot to add rdpmcp :-/
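
For readers following along, the policy being proposed here works out to
roughly the sketch below at mm-switch time. The rdpmc_mode variable and the
per-mm perf_rdpmc_allowed counter (an atomic_t that would have to be added
to mm_context_t) are placeholder names for the mechanisms discussed in this
thread; cr4_set/cr4_clear are the helpers introduced in patch 1 of this
series.

/*
 * rdpmc_mode == 0: never allow userspace rdpmc
 * rdpmc_mode == 1: allow it only while this mm has a perf_event mapped
 * rdpmc_mode == 2: always allow it (the mode proposed above)
 */
static int rdpmc_mode = 1;      /* placeholder for the sysfs rdpmc knob */

static inline void update_rdpmc_cr4(struct mm_struct *next)
{
        if (rdpmc_mode == 2 ||
            (rdpmc_mode == 1 && next &&
             atomic_read(&next->context.perf_rdpmc_allowed)))
                cr4_set(X86_CR4_PCE);
        else
                cr4_clear(X86_CR4_PCE);
}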

2014-10-21 05:41:44

by Borislav Petkov

[permalink] [raw]
Subject: Re: [RFC 3/5] x86: Add a comment clarifying LDT context switching

On Thu, Oct 16, 2014 at 09:21:42AM -0700, Andy Lutomirski wrote:
> I think it's the same as in the other case in switch_mm. leave_mm does
> cpumask_clear_cpu(cpu, mm_cpumask(active_mm)), and, once that has
> happened, modify_ldt won't send an IPI to this CPU. So, if leave_mm
> runs, and then another CPU calls modify_ldt on the mm that is in lazy
> mode here, it won't update our LDT register, so the LDT register and
> prev->context.ldt might not match.

Ok, let me see if I can follow with an example:

We call leave_mm() on, say, cpu 3 and mm_cpumask(active_mm) has cpu 3 and
4 set. Then, on cpu 4 we call modify_ldt on that same mm and there in
alloc_ldt() we have this:

if (!cpumask_equal(mm_cpumask(current->mm),
                   cpumask_of(smp_processor_id())))
        smp_call_function(flush_ldt, current->mm, 1);

and since we've cleared cpu 3 from the cpumask, we don't flush_ldt()
on it and there you have the difference.

Am I close?

Thanks.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2014-10-21 05:44:42

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 3/5] x86: Add a comment clarifying LDT context switching

On Mon, Oct 20, 2014 at 10:41 PM, Borislav Petkov <[email protected]> wrote:
> On Thu, Oct 16, 2014 at 09:21:42AM -0700, Andy Lutomirski wrote:
>> I think it's the same as in the other case in switch_mm. leave_mm does
>> cpumask_clear_cpu(cpu, mm_cpumask(active_mm)), and, once that has
>> happened, modify_ldt won't send an IPI to this CPU. So, if leave_mm
>> runs, and then another CPU calls modify_ldt on the mm that is in lazy
>> mode here, it won't update our LDT register, so the LDT register and
>> prev->context.ldt might not match.
>
> Ok, let me see if I can follow with an example:
>
> We call leave_mm() on, say, cpu 3 and mm_cpumask(active_mm) has cpu 3 and
> 4 set. Then, on cpu 4 we call modify_ldt on that same mm and there in
> alloc_ldt() we have this:
>
> if (!cpumask_equal(mm_cpumask(current->mm),
>                    cpumask_of(smp_processor_id())))
>         smp_call_function(flush_ldt, current->mm, 1);
>
> and since we've cleared cpu 3 from the cpumask, we don't flush_ldt()
> on it and there you have the difference.
>
> Am I close?

You're exactly correct, or at least you seem to understand it the way I do :)

--Andy

2014-10-21 06:05:56

by Borislav Petkov

[permalink] [raw]
Subject: Re: [RFC 3/5] x86: Add a comment clarifying LDT context switching

On Mon, Oct 20, 2014 at 10:44:18PM -0700, Andy Lutomirski wrote:
> You're exactly correct, or at least you seem to understand it the way I do :)

Ok, cool.

Now, if I had more time, I'd take a guest and add some debugging
code to see when exactly that happens and how prev->context.ldt and
next->context.ldt look... Maybe later. :-)

Thanks.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2014-10-21 08:59:21

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Mon, Oct 20, 2014 at 10:39:48AM -0700, Andy Lutomirski wrote:
> On Mon, Oct 20, 2014 at 9:49 AM, Andy Lutomirski <[email protected]> wrote:
> > On Mon, Oct 20, 2014 at 1:33 AM, Peter Zijlstra <[email protected]> wrote:
> >> On Sun, Oct 19, 2014 at 03:57:54PM -0700, Andy Lutomirski wrote:
> >>> > Maybe, but at that point we commit to yet another ABI... I'd rather just
> >>> > put a 'sane' implementation in a library or so.
> >>>
> >>> This cuts both ways, though. For vdso timekeeping, the underlying
> >>> data structure has changed repeatedly, sometimes to add features, and
> >>> sometimes for performance, and the vdso has done a good job insulating
> >>> userspace from it. (In fact, until 3.16, even the same exact kernel
> >>> version couldn't be relied on to have the same data structure with
> >>> different configs, and even now, no one really wants to teach user
> >>> libraries how to parse the pvclock data structures.)
> >>
> >> Fair enough, but as it stands we've already committed to the data
> >> structure exposed to userspace.
> >
> > True.
> >
> > OTOH, if a vdso function gets added, a few releases go by, and all the
> > userspace tools get updated, then the old data structure could be
> > dropped if needed by clearing cap_user_rdpmc.
> >
> > Anyway, this is so far out of scope for the current project that I'm
> > going to ignore it.
>
> OK, I lied.
>
> I haven't tested it, but it looks like any existing users of
> cap_user_rdpmc may have serious issues. That flag is set to 1 for
> essentially all perf_events on x86, even events that aren't part of
> the x86_pmu. Since the default .event_idx callback doesn't return
> zero, lots of other events will appear to be rdpmcable. This includes
> the AMD uncore pmu, which looks like it actually supports rdpmc, but
> hwc->idx seems to be missing an offset.
>
> If this is the case, then user code can't reliably use the userpage
> rdpmc mechanism, so maybe it should be deprecated (or at least get a
> new flag bit).

Seeing how we've not actually had bugreports on this, I suspect we can
still fix that.

2014-10-21 09:14:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Mon, Oct 20, 2014 at 12:51:10PM +0200, Hendrik Brueckner wrote:
> I think it would make sense to return 0 as default in the
> perf_event_idx_default() and let each PMU/arch that actually supports
> reading PMCs from user space return the proper index. And according
> to tools/perf/design.txt, index must be non-zero to trigger a user space
> read.

OK, I've got something like the below. Michael/Anton, would you please
clarify the ppc book3s capabilities?

---
Subject: perf: Clean up pmu::event_idx
From: Peter Zijlstra <[email protected]>
Date: Tue Oct 21 11:10:21 CEST 2014

Andy reported that the current state of event_idx is rather confused.
So remove all but the x86_pmu implementation and change the default to
return 0 (the safe option).

Reported-by: Andy Lutomirski <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/powerpc/perf/hv-24x7.c | 6 ------
arch/powerpc/perf/hv-gpci.c | 6 ------
arch/s390/kernel/perf_cpum_sf.c | 6 ------
kernel/events/core.c | 15 +--------------
kernel/events/hw_breakpoint.c | 7 -------
5 files changed, 1 insertion(+), 39 deletions(-)

--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -417,11 +417,6 @@ static int h_24x7_event_add(struct perf_
return 0;
}

-static int h_24x7_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu h_24x7_pmu = {
.task_ctx_nr = perf_invalid_context,

@@ -433,7 +428,6 @@ static struct pmu h_24x7_pmu = {
.start = h_24x7_event_start,
.stop = h_24x7_event_stop,
.read = h_24x7_event_update,
- .event_idx = h_24x7_event_idx,
};

static int hv_24x7_init(void)
--- a/arch/powerpc/perf/hv-gpci.c
+++ b/arch/powerpc/perf/hv-gpci.c
@@ -246,11 +246,6 @@ static int h_gpci_event_init(struct perf
return 0;
}

-static int h_gpci_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu h_gpci_pmu = {
.task_ctx_nr = perf_invalid_context,

@@ -262,7 +257,6 @@ static struct pmu h_gpci_pmu = {
.start = h_gpci_event_start,
.stop = h_gpci_event_stop,
.read = h_gpci_event_update,
- .event_idx = h_gpci_event_idx,
};

static int hv_gpci_init(void)
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1411,11 +1411,6 @@ static void cpumsf_pmu_del(struct perf_e
perf_pmu_enable(event->pmu);
}

-static int cpumsf_pmu_event_idx(struct perf_event *event)
-{
- return event->hw.idx;
-}
-
CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF);
CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);

@@ -1458,7 +1453,6 @@ static struct pmu cpumf_sampling = {
.stop = cpumsf_pmu_stop,
.read = cpumsf_pmu_read,

- .event_idx = cpumsf_pmu_event_idx,
.attr_groups = cpumsf_pmu_attr_groups,
};

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf
return 0;
}

-static int perf_swevent_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,

@@ -6085,8 +6080,6 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};

#ifdef CONFIG_EVENT_TRACING
@@ -6204,8 +6197,6 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};

static inline void perf_tp_register(void)
@@ -6431,8 +6422,6 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};

/*
@@ -6511,8 +6500,6 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};

static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct p

static int perf_event_idx_default(struct perf_event *event)
{
- return event->hw.idx + 1;
+ return 0;
}

/*
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct pe
bp->hw.state = PERF_HES_STOPPED;
}

-static int hw_breakpoint_event_idx(struct perf_event *bp)
-{
- return 0;
-}
-
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */

@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
-
- .event_idx = hw_breakpoint_event_idx,
};

int __init init_hw_breakpoint(void)

2014-10-21 14:59:48

by Vince Weaver

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Mon, 20 Oct 2014, Andy Lutomirski wrote:

> ISTM it would be a lot better to use the perf subsystem for this. You
> can probably pin an event to a pmu.

No, you cannot pin an event to a counter with perf_event.
That's one of the big differences between perf_event and, say, perfmon2.

With perf_event the kernel controls which events go in which counters and
the user has no say. That's part of why you need to check the mmap page
every time you want to use rdpmc because there's no other way of knowing
which counter to read to get the event you want.

perf_event is also fairly high overhead for setting up and starting
events, and mildly high overhead when doing a proper rdpmc call (due to
the required looking at mmap, and the fact that you need to do two rdpmc
calls before/after to get your value). This is why people really worried
about low-latency measurements bypass as much of perf_event as possible.

Vince
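
For reference, the read sequence Vince describes boils down to the
documented self-monitoring loop: snapshot the seqlock word in the mmap
page, read the index and offset, do the rdpmc, and retry if the seqlock
changed. A minimal userspace sketch (sign extension by pmc_width and the
time scaling are omitted; the helper names are only illustrative):

#include <linux/perf_event.h>
#include <stdint.h>

static inline uint64_t rdpmc(uint32_t counter)
{
        uint32_t low, high;

        asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
        return low | ((uint64_t)high << 32);
}

/* pc points at the event's mmapped struct perf_event_mmap_page */
static uint64_t read_self_count(volatile struct perf_event_mmap_page *pc)
{
        uint64_t count;
        uint32_t seq, idx;

        do {
                seq = pc->lock;
                asm volatile("" ::: "memory");          /* barrier */

                idx = pc->index;        /* 0 means rdpmc is not usable */
                count = pc->offset;
                if (pc->cap_user_rdpmc && idx)
                        count += rdpmc(idx - 1);

                asm volatile("" ::: "memory");          /* barrier */
        } while (pc->lock != seq);

        return count;
}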

2014-10-21 15:53:23

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 5/5] x86,perf: Only allow rdpmc if a perf_event is mapped

On Tue, Oct 21, 2014 at 2:14 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, Oct 20, 2014 at 12:51:10PM +0200, Hendrik Brueckner wrote:
>> I think it would make sense to return 0 as default in the
>> perf_event_idx_default() and let each PMU/arch that actually supports
>> reading PMCs from user space return the proper index. And according
>> to tools/perf/design.txt, index must be non-zero to trigger a user space
>> read.
>
> OK, I've got something like the below. Michael/Anton, would you please
> clarify the ppc book3s capabilities?
>
> ---
> Subject: perf: Clean up pmu::event_idx
> From: Peter Zijlstra <[email protected]>
> Date: Tue Oct 21 11:10:21 CEST 2014
>
> Andy reported that the current state of event_idx is rather confused.
> So remove all but the x86_pmu implementation and change the default to
> return 0 (the safe option).

I wrote essentially the same patch yesterday, so looks good to me :)

>
> Reported-by: Andy Lutomirski <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/powerpc/perf/hv-24x7.c | 6 ------
> arch/powerpc/perf/hv-gpci.c | 6 ------
> arch/s390/kernel/perf_cpum_sf.c | 6 ------
> kernel/events/core.c | 15 +--------------
> kernel/events/hw_breakpoint.c | 7 -------
> 5 files changed, 1 insertion(+), 39 deletions(-)
>
> --- a/arch/powerpc/perf/hv-24x7.c
> +++ b/arch/powerpc/perf/hv-24x7.c
> @@ -417,11 +417,6 @@ static int h_24x7_event_add(struct perf_
> return 0;
> }
>
> -static int h_24x7_event_idx(struct perf_event *event)
> -{
> - return 0;
> -}
> -
> static struct pmu h_24x7_pmu = {
> .task_ctx_nr = perf_invalid_context,
>
> @@ -433,7 +428,6 @@ static struct pmu h_24x7_pmu = {
> .start = h_24x7_event_start,
> .stop = h_24x7_event_stop,
> .read = h_24x7_event_update,
> - .event_idx = h_24x7_event_idx,
> };
>
> static int hv_24x7_init(void)
> --- a/arch/powerpc/perf/hv-gpci.c
> +++ b/arch/powerpc/perf/hv-gpci.c
> @@ -246,11 +246,6 @@ static int h_gpci_event_init(struct perf
> return 0;
> }
>
> -static int h_gpci_event_idx(struct perf_event *event)
> -{
> - return 0;
> -}
> -
> static struct pmu h_gpci_pmu = {
> .task_ctx_nr = perf_invalid_context,
>
> @@ -262,7 +257,6 @@ static struct pmu h_gpci_pmu = {
> .start = h_gpci_event_start,
> .stop = h_gpci_event_stop,
> .read = h_gpci_event_update,
> - .event_idx = h_gpci_event_idx,
> };
>
> static int hv_gpci_init(void)
> --- a/arch/s390/kernel/perf_cpum_sf.c
> +++ b/arch/s390/kernel/perf_cpum_sf.c
> @@ -1411,11 +1411,6 @@ static void cpumsf_pmu_del(struct perf_e
> perf_pmu_enable(event->pmu);
> }
>
> -static int cpumsf_pmu_event_idx(struct perf_event *event)
> -{
> - return event->hw.idx;
> -}
> -
> CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF);
> CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);
>
> @@ -1458,7 +1453,6 @@ static struct pmu cpumf_sampling = {
> .stop = cpumsf_pmu_stop,
> .read = cpumsf_pmu_read,
>
> - .event_idx = cpumsf_pmu_event_idx,
> .attr_groups = cpumsf_pmu_attr_groups,
> };
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf
> return 0;
> }
>
> -static int perf_swevent_event_idx(struct perf_event *event)
> -{
> - return 0;
> -}
> -
> static struct pmu perf_swevent = {
> .task_ctx_nr = perf_sw_context,
>
> @@ -6085,8 +6080,6 @@ static struct pmu perf_swevent = {
> .start = perf_swevent_start,
> .stop = perf_swevent_stop,
> .read = perf_swevent_read,
> -
> - .event_idx = perf_swevent_event_idx,
> };
>
> #ifdef CONFIG_EVENT_TRACING
> @@ -6204,8 +6197,6 @@ static struct pmu perf_tracepoint = {
> .start = perf_swevent_start,
> .stop = perf_swevent_stop,
> .read = perf_swevent_read,
> -
> - .event_idx = perf_swevent_event_idx,
> };
>
> static inline void perf_tp_register(void)
> @@ -6431,8 +6422,6 @@ static struct pmu perf_cpu_clock = {
> .start = cpu_clock_event_start,
> .stop = cpu_clock_event_stop,
> .read = cpu_clock_event_read,
> -
> - .event_idx = perf_swevent_event_idx,
> };
>
> /*
> @@ -6511,8 +6500,6 @@ static struct pmu perf_task_clock = {
> .start = task_clock_event_start,
> .stop = task_clock_event_stop,
> .read = task_clock_event_read,
> -
> - .event_idx = perf_swevent_event_idx,
> };
>
> static void perf_pmu_nop_void(struct pmu *pmu)
> @@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct p
>
> static int perf_event_idx_default(struct perf_event *event)
> {
> - return event->hw.idx + 1;
> + return 0;
> }
>
> /*
> --- a/kernel/events/hw_breakpoint.c
> +++ b/kernel/events/hw_breakpoint.c
> @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct pe
> bp->hw.state = PERF_HES_STOPPED;
> }
>
> -static int hw_breakpoint_event_idx(struct perf_event *bp)
> -{
> - return 0;
> -}
> -
> static struct pmu perf_breakpoint = {
> .task_ctx_nr = perf_sw_context, /* could eventually get its own */
>
> @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
> .start = hw_breakpoint_start,
> .stop = hw_breakpoint_stop,
> .read = hw_breakpoint_pmu_read,
> -
> - .event_idx = hw_breakpoint_event_idx,
> };
>
> int __init init_hw_breakpoint(void)



--
Andy Lutomirski
AMA Capital Management, LLC

2014-10-21 16:04:29

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Tue, Oct 21, 2014 at 11:00:26AM -0400, Vince Weaver wrote:
> On Mon, 20 Oct 2014, Andy Lutomirski wrote:
>
> > ISTM it would be a lot better to use the perf subsystem for this. You
> > can probably pin an event to a pmu.
>
> No, you cannot pin an event to a counter with perf_event.
> That's one of the big differences between perf_event and, say, perfmon2.
>
> With perf_event the kernel controls which events go in which counters and
> the user has no say. That's part of why you need to check the mmap page
> every time you want to use rdpmc because there's no other way of knowing
> which counter to read to get the event you want.
>
> perf_event is also fairly high overhead for setting up and starting
> events,

Which you only do once at the start, so is that really a problem?

> and mildly high overhead when doing a proper rdpmc call (due to
> the required looking at mmap, and the fact that you need to do two rdpmc
> calls before/after to get your value). This is why people really worried
> about low-latency measurements bypass as much of perf_event as possible.

I still don't get that argument, 2 rdpmc's is cheaper than doing wrmsr,
not to mention doing wrmsr through a syscall. And looking at that mmap
page is 1 cacheline. Is that cacheline read (assuming you miss) the real
problem?

2014-10-21 17:05:07

by Vince Weaver

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Tue, 21 Oct 2014, Peter Zijlstra wrote:

> > perf_event is also fairly high overhead for setting up and starting
> > events,
>
> Which you only do once at the start, so is that really a problem?

There are various reasons why you might want to start events at times
other than the beginning of the program. Some people don't like kernel
multiplexing so they start/stop manually if they want to switch eventsets.

But no, I suppose you could ask anyone wanting to use rdpmc to open some
sort of dummy event at startup just to get cr4 enabled.
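
That workaround would amount to something like the sketch below, assuming
the proposed patches treat any mmap of a perf_event as sufficient; the
function name is illustrative, the event choice is arbitrary, and error
handling is omitted.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Open any self-monitoring event and map its metadata page so that
 * CR4.PCE stays set for this mm; keep both alive while using rdpmc. */
static int enable_rdpmc_for_this_mm(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.exclude_kernel = 1;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);

        return fd;
}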

> I still don't get that argument, 2 rdpmc's is cheaper than doing wrmsr,
> not to mention doing wrmsr through a syscall. And looking at that mmap
> page is 1 cacheline. Is that cacheline read (assuming you miss) the real
> problem?

Well at least by default the first read of the mmap page causes a
pagefault which adds a few thousand cycles of latency. Though you can
somewhat get around this by prefaulting it in at some point.

Anyway I'm just reporting numbers I get when measuring the overhead of
the old perfctr interface vs perf_event on typical PAPI workloads. It's
true you can re-arrange calls and such so that perf_event behaves better
but that involves redoing a lot of existing code.

I do appreciate the trouble you've gone through keeping self-monitoring
working considering the fact that I'm the only user admitting to using it.

Adding perf_event rdpmc support to PAPI has been stalled for a while due
to various reasons. So that's why I haven't been finding the various bugs
that have been turning up. The PAPI perf_event component really needs a
complete from-scratch re-write, but that's made tricky because we have to
be backwards compatible and workaround all the pre-2.6.36 perf_event bugs.
You wouldn't think anyone would care, but the most vocal users are all
RHEL 6 users running the monstrosity of a 2.6.32 kernel that is patched
full of all kinds of crazy back-ported perf_event patches, and that is
always breaking PAPI in fun and exciting ways.

Vince

2014-10-23 11:42:42

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Tue, Oct 21, 2014 at 01:05:49PM -0400, Vince Weaver wrote:
> On Tue, 21 Oct 2014, Peter Zijlstra wrote:
>
> > > perf_event is also fairly high overhead for setting up and starting
> > > events,
> >
> > Which you only do once at the start, so is that really a problem?
>
> There are various reasons why you might want to start events at times
> other than the beginning of the program. Some people don't like kernel
> multiplexing so they start/stop manually if they want to switch eventsets.

I suppose you could pre-create all events and use ioctl()s to start/stop
them where/when desired; this should be faster, I think. But yes, this is
not a use-case I've thought much about.

> But no, I suppose you could ask anyone wanting to use rdpmc to open some
> sort of dummy event at startup just to get cr4 enabled.

That's one work-around :-)

> > I still don't get that argument, 2 rdpmc's is cheaper than doing wrmsr,
> > not to mention doing wrmsr through a syscall. And looking at that mmap
> > page is 1 cacheline. Is that cacheline read (assuming you miss) the real
> > problem?
>
> Well at least by default the first read of the mmap page causes a
> pagefault which adds a few thousand cycles of latency. Though you can
> somewhat get around this by prefaulting it in at some point.

MAP_POPULATE is your friend there, but yes manually prefaulting is
perfectly fine too, and the HPC people are quite familiar with the
concept; they do it for a lot of things.
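
For reference, the two options being compared look roughly like this; fd
is an already-open perf_event descriptor and the function name is only
illustrative.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>

/* Map the perf user page, either asking the kernel to populate it at
 * mmap time or touching it once by hand before the measured region. */
static void *map_perf_page(int fd, int populate)
{
        void *p = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ,
                       MAP_SHARED | (populate ? MAP_POPULATE : 0), fd, 0);

        if (p != MAP_FAILED && !populate)
                (void)*(volatile unsigned char *)p;     /* manual prefault */

        return p;
}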

> Anyway I'm just reporting numbers I get when measuring the overhead of
> the old perfctr interface vs perf_event on typical PAPI workloads. It's
> true you can re-arrange calls and such so that perf_event behaves better
> but that involves redoing a lot of existing code.

OK agreed, having to change existing code is often subject to various
forms of inertia/resistance. And yes I cannot deny that some of the
features perf has come at the expense of various overheads, however hard
we're trying to keep costs down.

> I do appreciate the trouble you've gone through keeping self-monitoring
> working considering the fact that I'm the only user admitting to using it.

I have some code somewhere that uses it too, I've tried pushing it off
to other people but so far there are no takers :-)

2014-10-24 12:40:28

by Vince Weaver

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Thu, 23 Oct 2014, Peter Zijlstra wrote:

> On Tue, Oct 21, 2014 at 01:05:49PM -0400, Vince Weaver wrote:

> > There are various reasons why you might want to start events at times
> > other than the beginning of the program. Some people don't like kernel
> > multiplexing so they start/stop manually if they want to switch eventsets.
>
> I suppose you could pre-create all events and use ioctl()s to start/stop
> them where/when desired; this should be faster, I think. But yes, this is
> not a use-case I've thought much about.

The scheduling step is most of what makes the perf_event start call have
high overhead. The other annoyance is the fact that due to the NMI
watchdog you can successfully perf_event_open() an event group but still
have it fail at start time, so a lot of code ends up doing
extraneous open/start/close calls to make sure the events really fit.

> MAP_POPULATE is your friend there, but yes manually prefaulting is
> perfectly fine too, and the HPC people are quite familiar with the
> concept, they do it for a lot of things.

MAP_POPULATE actually has noticeably more overhead than manually
prefaulting. It's on my todo list to drop ftrace on there and find out
why, but I've been stuck chasing kernel-crashing fuzzer bugs instead in my
spare time.

perfctr and possibly perfmon2 would automatically pre-fault the mmap page
for you in the kernel, so there was no need for the user to do it.


In any case I wasn't really trying to make trouble here, it's just I came
across the people using rdpmc w/o perf_event just the other day (on USENET
of all places). They were so happy it worked w/o patches now, that I felt
bad breaking it to them that there were patches floating around that were
going to make their usecase not work anymore.

I guess like all things though, you can't have anything fun and useful in
the kernel without the security people taking it away.

Vince

2014-10-24 22:14:57

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC 0/5] CR4 handling improvements

On Fri, Oct 24, 2014 at 5:41 AM, Vince Weaver <[email protected]> wrote:
> On Thu, 23 Oct 2014, Peter Zijlstra wrote:
>
>> On Tue, Oct 21, 2014 at 01:05:49PM -0400, Vince Weaver wrote:
>
>> > There are various reasons why you might want to start events at times
>> > other than the beginning of the program. Some people don't like kernel
>> > multiplexing so they start/stop manually if they want to switch eventsets.
>>
>> I suppose you could pre-create all events and use ioctl()s to start/stop
>> them where/when desired, this should be faster I think. But yes, this is
>> not a use-case I've though much about.
>
> The scheduling step is most of what makes the perf_event start call have
> high overhead. The other annoyance is the fact that due to the NMI
> watchdog you can successfully perf_event_open() an event group but still
> have it fail at start time, so a lot of code ends up doing
> extraneous open/start/close calls to make sure the events really fit.
>
>> MAP_POPULATE is your friend there, but yes manually prefaulting is
>> perfectly fine too, and the HPC people are quite familiar with the
>> concept, they do it for a lot of things.
>
> MAP_POPULATE actually has noticeably more overhead than manually
> prefaulting. It's on my todo list to drop ftrace on there and find out
> why, but I've been stuck chasing kernel-crashing fuzzer bugs instead in my
> spare time.

Have you checked recently? IIRC Michael Lespinasse put some effort
into improving MAP_POPULATE recently.

>
> perfctr and possibly perfmon2 would automatically pre-fault the mmap page
> for you in the kernel, so there was no need for the user to do it.
>
>
> In any case I wasn't really trying to make trouble here, it's just I came
> across the people using rdpmc w/o perf_event just the other day (on USENET
> of all places). They were so happy it worked w/o patches now, that I felt
> bad breaking it to them that there were patches floating around that were
> going to make their usecase not work anymore.
>
> I guess like all things though, you can't have anything fun and useful in
> the kernel without the security people taking it away.

I'm sympathetic enough to this use case that I don't really mind
adding a bit of code to support rdpmc=2 meaning that rdpmc is always
allowed. Switching in and out of rdpmc=2 mode will be expensive
(static key and IPI to all CPUs). PeterZ, is that OK with you?
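
That global switch would amount to broadcasting a CR4.PCE update, roughly
as sketched below; the function names and the way the mode is plumbed in
are placeholders, and cr4_set/cr4_clear are the helpers from patch 1 of
this series.

static void refresh_pce(void *allow)
{
        if (allow)
                cr4_set(X86_CR4_PCE);
        else
                cr4_clear(X86_CR4_PCE);
}

static void set_rdpmc_globally_allowed(bool allow)
{
        on_each_cpu(refresh_pce, (void *)(unsigned long)allow, 1);
}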

--Andy

>
> Vince



--
Andy Lutomirski
AMA Capital Management, LLC