2014-04-15 13:59:15

by Rui Wang

[permalink] [raw]
Subject: [PATCH v3 0/5] I/O Hook: Trace h/w access and emulate h/w events

Hi All,
This is the 3rd version of I/O Hook, a patch set aimed at intercepting
h/w access by the OS. Some examples of how it can be used:
1) To emulate h/w events (e.g. hotplug)
2) To inject h/w errors to the kernel
3) To trace h/w access by the OS for performance tuning or debugging.
Details of the examples can be found in Documentation/PCI/iohook.txt.

A set of user space tools using I/O Hook for various use cases will be hosted
on https://github.com/iohook. It initially contains inject-aer which takes the
trace event output of /sys/kernel/debug/tracing/events/ras/aer_event/ and
regenerates the same PCIE AER event so that PCIE AER can be easily tested.

Changes since v2:
- Added the hook for MSRs (can specify a cpu number)
- Can trigger IPI by a vector (so that exceptions can be emulated)
- Defined iohook_event as the trace event to trace h/w access. A new
attribute 'tc' can be defined for any Register Override for tracing
purpose.

Rui Wang (5):
I/O Hook: core functions and Register Override
I/O Hook: Help functions to manage the hook
I/O Hook: sysfs interface to emulate h/w events
I/O Hook: Override MSRs while triggering MCEs
IO Hook: Tracing hw access

Documentation/PCI/iohook.txt | 353 ++++++++++
arch/Kconfig | 10 +
arch/x86/boot/compressed/Makefile | 1 +
arch/x86/include/asm/io.h | 57 ++-
arch/x86/include/asm/msr.h | 100 +++-
arch/x86/lib/msr-smp.c | 31 +
arch/x86/vdso/Makefile | 2 +
drivers/misc/Makefile | 1 +
drivers/misc/iohook/Makefile | 1 +
drivers/misc/iohook/iohook.c | 1367 +++++++++++++++++++++++++++++++++++++
drivers/misc/iohook/iohook.h | 6 +
drivers/pci/access.c | 66 ++
include/linux/reg_ovrd.h | 55 ++
include/trace/events/iohook.h | 58 ++
14 files changed, 2101 insertions(+), 7 deletions(-)
create mode 100644 Documentation/PCI/iohook.txt
create mode 100644 drivers/misc/iohook/Makefile
create mode 100644 drivers/misc/iohook/iohook.c
create mode 100644 drivers/misc/iohook/iohook.h
create mode 100644 include/linux/reg_ovrd.h
create mode 100644 include/trace/events/iohook.h

--
1.7.5.4


2014-04-15 13:59:25

by Rui Wang

[permalink] [raw]
Subject: [PATCH v3 1/5] I/O Hook: core functions and Register Override

This is the 3rd version of I/O Hook, a patch set aimed at intercepting
h/w access by the OS. Some examples of how it can be used:
1) To emulate h/w events (e.g. hotplug)
2) To inject h/w errors to the kernel
3) To trace h/w access by the OS for performance tuning or debugging
Details of the examples can be found in Documentation/PCI/iohook.txt.

A set of user space tools using I/O Hook for various use cases will be hosted
on https://github.com/iohook. It initially contains inject-aer which takes the
trace event output of /sys/kernel/debug/tracing/events/ras/aer_event/ and
regenerates the same PCIE AER event so that PCIE AER can be easily tested.

This patch provides a hook in the core h/w access functions in the kernel.
It also introduces Register Override, which is a set of bits defined in RAM
to override the real value of a h/w register. With the hook in place, access
to h/w registers can be redirected to Register Overrides with user-defined
values, so that h/w states can be emulated easily.

A Register Override can be defined in whatever bit-width, identified by its
address, bitmask, initial value and attributes like read-only, read-write,
write-clear, etc., similar to how a hardware register behaves when accessed.

Jump Label is used, so when the hook is disabled (by default), this adds
only a NOP to the core functions, with zero performance penalty.

This patch is the first step towards the goal of emulating h/w events.

Signed-off-by: Rui Wang <[email protected]>
---
arch/Kconfig | 10 +
arch/x86/boot/compressed/Makefile | 1 +
arch/x86/include/asm/io.h | 57 +++++-
arch/x86/vdso/Makefile | 2 +
drivers/misc/Makefile | 1 +
drivers/misc/iohook/Makefile | 1 +
drivers/misc/iohook/iohook.c | 392 +++++++++++++++++++++++++++++++++++++
drivers/misc/iohook/iohook.h | 6 +
drivers/pci/access.c | 66 ++++++
include/linux/reg_ovrd.h | 54 +++++
10 files changed, 588 insertions(+), 2 deletions(-)
create mode 100644 drivers/misc/iohook/Makefile
create mode 100644 drivers/misc/iohook/iohook.c
create mode 100644 drivers/misc/iohook/iohook.h
create mode 100644 include/linux/reg_ovrd.h

diff --git a/arch/Kconfig b/arch/Kconfig
index 97ff872..55b224a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -46,6 +46,16 @@ config KPROBES
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".

+config IO_HOOK
+ bool "Method for emulating hardware events"
+ default n
+ depends on PCI
+ help
+ I/O Hook is a mechanism to intercept i/o register access functions
+ in the kernel. By overriding h/w register bits with user-defined
+ bits in RAM called Register Override, it is possible to emulate
+ h/w states without modifying the driver specific to that hardware.
+
config JUMP_LABEL
bool "Optimize very unlikely/likely branches"
depends on HAVE_ARCH_JUMP_LABEL
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0fcd913..ea53270 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -10,6 +10,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+KBUILD_CFLAGS += -DNO_IO_HOOK
cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small
KBUILD_CFLAGS += $(cflags-y)
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index b8237d8..e3db0ab 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -41,14 +41,41 @@
#include <asm/page.h>
#include <asm/early_ioremap.h>

+#if !defined(NO_IO_HOOK) && defined(CONFIG_IO_HOOK)
+#include <linux/jump_label.h>
+#include <linux/reg_ovrd.h>
+
+#define mem_read_ovrd(type, addr) \
+{ \
+ type val;\
+ if (static_key_false(&ovrdhw_enabled) \
+ && !read_ovrd_common(OVRD_SPACE_MEM, (u64)addr, \
+ sizeof(type), &val, NULL)) \
+ return val; \
+}
+
+#define mem_write_ovrd(type, addr, val) \
+{ \
+ if (static_key_false(&ovrdhw_enabled) \
+ && !write_ovrd_common(OVRD_SPACE_MEM, (u64)addr,\
+ sizeof(type), &val, NULL)) \
+ return; \
+}
+#else /* CONFIG_IO_HOOK */
+#define mem_read_ovrd(type, addr)
+#define mem_write_ovrd(type, addr, val)
+#endif /* CONFIG_IO_HOOK */
+
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
-{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+{ type ret; mem_read_ovrd(type, addr); \
+asm volatile("mov" size " %1,%0" : reg(ret) \
:"m" (*(volatile type __force *)addr) barrier); return ret; }

#define build_mmio_write(name, size, type, reg, barrier) \
static inline void name(type val, volatile void __iomem *addr) \
-{ asm volatile("mov" size " %0,%1": :reg (val), \
+{ mem_write_ovrd(type, addr, val); \
+asm volatile("mov" size " %0,%1" : : reg(val), \
"m" (*(volatile type __force *)addr) barrier); }

build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
@@ -266,9 +293,34 @@ static inline void slow_down_io(void)

#endif

+#if !defined(NO_IO_HOOK) && defined(CONFIG_IO_HOOK)
+
+#define io_write_ovrd(type, value, port) \
+{ \
+ if (static_key_false(&ovrdhw_enabled) \
+ && !write_ovrd_common(OVRD_SPACE_IO, (u64)port, \
+ sizeof(type), &value, NULL)) \
+ return; \
+}
+
+#define io_read_ovrd(type, port) \
+{ \
+ type val; \
+ if (static_key_false(&ovrdhw_enabled) \
+ && !read_ovrd_common(OVRD_SPACE_IO, (u64)port, \
+ sizeof(type), &val, NULL)) \
+ return val; \
+}
+
+#else
+#define io_write_ovrd(type, value, port)
+#define io_read_ovrd(type, port)
+#endif
+
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
+ io_write_ovrd(type, value, port); \
asm volatile("out" #bwl " %" #bw "0, %w1" \
: : "a"(value), "Nd"(port)); \
} \
@@ -276,6 +328,7 @@ static inline void out##bwl(unsigned type value, int port) \
static inline unsigned type in##bwl(int port) \
{ \
unsigned type value; \
+ io_read_ovrd(type, port); \
asm volatile("in" #bwl " %w1, %" #bw "0" \
: "=a"(value) : "Nd"(port)); \
return value; \
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index c580d12..521405d 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -4,6 +4,8 @@

KBUILD_CFLAGS += $(DISABLE_LTO)

+KBUILD_CFLAGS += -DNO_IO_HOOK
+
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 7eb4b69..baaa135 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -49,6 +49,7 @@ obj-y += carma/
obj-$(CONFIG_USB_SWITCH_FSA9480) += fsa9480.o
obj-$(CONFIG_ALTERA_STAPL) +=altera-stapl/
obj-$(CONFIG_INTEL_MEI) += mei/
+obj-$(CONFIG_IO_HOOK) += iohook/
obj-$(CONFIG_VMWARE_VMCI) += vmw_vmci/
obj-$(CONFIG_LATTICE_ECP3_CONFIG) += lattice-ecp3-config.o
obj-$(CONFIG_SRAM) += sram.o
diff --git a/drivers/misc/iohook/Makefile b/drivers/misc/iohook/Makefile
new file mode 100644
index 0000000..80e2a7d
--- /dev/null
+++ b/drivers/misc/iohook/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IO_HOOK) += iohook.o
diff --git a/drivers/misc/iohook/iohook.c b/drivers/misc/iohook/iohook.c
new file mode 100644
index 0000000..e6a626f
--- /dev/null
+++ b/drivers/misc/iohook/iohook.c
@@ -0,0 +1,392 @@
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/reg_ovrd.h>
+#include <linux/pci.h>
+#include "iohook.h"
+
+static DEFINE_RAW_SPINLOCK(io_hook_lock);
+
+LIST_HEAD(ovrd_io_reg_map);
+LIST_HEAD(ovrd_mem_reg_map);
+LIST_HEAD(ovrd_pci_conf_reg_map);
+
+struct static_key ovrdhw_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(ovrdhw_enabled);
+
+/*
+ * mem_read - load a value directly from memory, without going through the
+ * hooked read accessors
+ * @address: kernel virtual address to read from
+ * @len: access width in bytes; must be 1, 2, 4 or 8
+ * @data: buffer that receives the @len-byte value
+ *
+ * The load is done through a volatile-qualified pointer so the compiler
+ * emits it as written; this routine is used to read device registers.
+ *
+ * Returns 0 on success, -EINVAL if @len is not a supported width.
+ */
+static int mem_read(u64 address, int len, void *data)
+{
+	const volatile void *src = (const volatile void *)(unsigned long)address;
+
+	switch (len) {
+	case 1:
+		*(u8 *)data = *(const volatile u8 *)src;
+		return 0;
+	case 2:
+		*(u16 *)data = *(const volatile u16 *)src;
+		return 0;
+	case 4:
+		*(u32 *)data = *(const volatile u32 *)src;
+		return 0;
+	case 8:
+		*(u64 *)data = *(const volatile u64 *)src;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * mem_write - store a value directly to memory, without going through the
+ * hooked write accessors
+ * @address: kernel virtual address to write to
+ * @len: access width in bytes; must be 1, 2, 4 or 8
+ * @value: buffer holding the @len-byte value to store
+ *
+ * The store is done through a volatile-qualified pointer so the compiler
+ * emits it as written; this routine is used to write device registers.
+ *
+ * Returns 0 on success, -EINVAL if @len is not a supported width.
+ */
+static int mem_write(u64 address, int len, void *value)
+{
+	volatile void *dst = (volatile void *)(unsigned long)address;
+
+	switch (len) {
+	case 1:
+		*(volatile u8 *)dst = *(u8 *)value;
+		return 0;
+	case 2:
+		*(volatile u16 *)dst = *(u16 *)value;
+		return 0;
+	case 4:
+		*(volatile u32 *)dst = *(u32 *)value;
+		return 0;
+	case 8:
+		*(volatile u64 *)dst = *(u64 *)value;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_X86
+
+/*
+ * io_read - issue an IN instruction of @len bytes (1, 2 or 4) on the port
+ * held in the low 16 bits of @address and store the result at @data.
+ * Returns 0, or -EINVAL for any other width.
+ */
+static int io_read(u64 address, int len, void *data)
+{
+	u16 port = (u16)address;
+
+	if (len == 1) {
+		u8 bvalue;
+
+		asm volatile ("inb %w1, %b0" : "=a"(bvalue) : "Nd"(port));
+		*(u8 *)data = bvalue;
+		return 0;
+	}
+
+	if (len == 2) {
+		u16 wvalue;
+
+		asm volatile ("inw %w1, %w0" : "=a"(wvalue) : "Nd"(port));
+		*(u16 *)data = wvalue;
+		return 0;
+	}
+
+	if (len == 4) {
+		u32 lvalue;
+
+		asm volatile ("inl %w1, %0" : "=a"(lvalue) : "Nd"(port));
+		*(u32 *)data = lvalue;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * io_write - issue an OUT instruction of @len bytes (1, 2 or 4) on the port
+ * held in the low 16 bits of @address, taking the value from @data.
+ * Returns 0, or -EINVAL for any other width.
+ */
+static int io_write(u64 address, int len, void *data)
+{
+	u16 port = (u16)address;
+
+	if (len == 1) {
+		u8 bvalue = *(u8 *)data;
+
+		asm volatile ("outb %b0, %w1" : : "a"(bvalue), "Nd"(port));
+		return 0;
+	}
+
+	if (len == 2) {
+		u16 wvalue = *(u16 *)data;
+
+		asm volatile ("outw %w0, %w1" : : "a"(wvalue), "Nd"(port));
+		return 0;
+	}
+
+	if (len == 4) {
+		u32 lvalue = *(u32 *)data;
+
+		asm volatile ("outl %0, %w1" : : "a"(lvalue), "Nd"(port));
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#else
+
+/* Without CONFIG_X86 every port access is rejected with -EINVAL. */
+static int io_read(u64 address, int len, void *data)
+{
+	return -EINVAL;
+}
+
+static int io_write(u64 address, int len, void *data)
+{
+	return -EINVAL;
+}
+
+#endif /* CONFIG_X86 */
+
+/* Convert a virtual address to a physical one via slow_virt_to_phys(). */
+static u64 v2p(u64 vaddr)
+{
+	void *kaddr = (void *)vaddr;
+
+	return slow_virt_to_phys(kaddr);
+}
+
+/*
+ * Shift 'value' by 'i' bytes: left if i >= 0, otherwise right.
+ *
+ * 'value' is widened to u64 before shifting so that an int constant such as
+ * 0xff can be moved into bytes 4..7 of a 64-bit register without overflowing
+ * int, and 'i' is parenthesized so that expressions are negated as a whole.
+ */
+#define BYTE_SHIFT(value, i) \
+	((i) >= 0 ? (u64)(value) << ((i) * 8) : (u64)(value) >> (-(i) * 8))
+
+/**
+ * read_ovrd_common - try to satisfy a hooked read from the Register Overrides
+ * @spaceid: OVRD_SPACE_MEM, OVRD_SPACE_IO or OVRD_SPACE_PCICONF
+ * @address: virtual address (MEM, translated to physical here), port number
+ *           (IO) or PCI_ENCODE_ADDR() value (PCICONF) being read
+ * @len: access width in bytes; 1, 2, 4 or 8
+ * @value: receives the result, stored as a @len-byte quantity
+ * @bus: struct pci_bus of the access for OVRD_SPACE_PCICONF
+ *
+ * For every Register Override overlapping [@address, @address + @len) the
+ * overridden bits are merged over the real hardware value (the hardware is
+ * only read when the override does not cover the whole access) and the
+ * result is stored in *@value. OVRD_RC overrides have the bits that were
+ * read cleared.
+ *
+ * Returns 0 if at least one override matched and *@value was written,
+ * -EINVAL otherwise; the caller then performs the real hardware access.
+ */
+int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
+{
+	struct list_head *ovrd_list;
+	struct reg_ovrd *ovrd_reg;
+	struct pci_bus *pcib;
+	unsigned long lock_flags = 0, flags = 0;
+	u64 faddress, vaddr = 0;
+	u64 data, bit_mask, attrib, val, full_mask;
+	unsigned int devfn = 0, pos = 0;
+	int i, flength, res, ret;
+	u32 cfg;
+
+	ret = -EINVAL;
+
+	/* Reject unsupported widths before any override state is modified. */
+	if (len != 1 && len != 2 && len != 4 && len != 8)
+		return ret;
+
+	if (spaceid == OVRD_SPACE_MEM) {
+		/* in the case of memory, 'address' is virtual */
+		vaddr = address;
+		address = v2p(address);
+		ovrd_list = &ovrd_mem_reg_map;
+	} else if (spaceid == OVRD_SPACE_IO) {
+		ovrd_list = &ovrd_io_reg_map;
+	} else if (spaceid == OVRD_SPACE_PCICONF) {
+		devfn = PCI_DECODE_DEVFN(address);
+		pos = PCI_DECODE_POS(address);
+		ovrd_list = &ovrd_pci_conf_reg_map;
+	} else {
+		return ret;
+	}
+
+	raw_spin_lock_irqsave(&io_hook_lock, lock_flags);
+	list_for_each_entry(ovrd_reg, ovrd_list, node) {
+
+		faddress = ovrd_reg->address;
+		flength = ovrd_reg->length;
+		val = ovrd_reg->val;
+		bit_mask = ovrd_reg->bit_mask;
+		attrib = ovrd_reg->attrib;
+
+		if (address >= faddress + flength ||
+		    address + len <= faddress) {
+			/* no overlap, skip */
+			continue;
+		}
+
+		/*
+		 * NOTE(review): as before, the list lock is dropped while the
+		 * matching entry is processed and re-taken before the walk
+		 * continues. Presumably this is because the hardware access
+		 * below may itself go through hooked accessors - confirm that
+		 * entries cannot be removed concurrently.
+		 */
+		raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+
+		/* at least one byte falls into the overridden range */
+		data = 0;
+		ret = 0;
+
+		/*
+		 * Mask with every bit of a 'flength'-byte register set. Built
+		 * in 64 bits; a plain "1 << flength*8" overflows int as soon
+		 * as flength reaches 4.
+		 */
+		full_mask = flength >= 8 ? ~0ULL : (1ULL << (flength * 8)) - 1;
+
+		/*
+		 * Hardware is consulted unless the whole access lies inside
+		 * the override and every bit of it is overridden. Overrides
+		 * longer than 8 bytes ignore bit_mask and replace whole bytes.
+		 */
+		if (!(address >= faddress &&
+		      address + len <= faddress + flength &&
+		      (flength > 8 || bit_mask == full_mask))) {
+			/* partially overridden. Read from HW for real bits */
+			if (spaceid == OVRD_SPACE_MEM) {
+				res = mem_read(vaddr, len, &data);
+			} else if (spaceid == OVRD_SPACE_IO) {
+				res = io_read(address, len, &data);
+			} else {
+				/* OVRD_SPACE_PCICONF; spaceid was validated */
+				cfg = 0;
+				raw_spin_lock_irqsave(&pci_lock, flags);
+				pcib = (struct pci_bus *)bus;
+				res = pcib->ops->read(pcib, devfn, pos, len,
+						      &cfg);
+				raw_spin_unlock_irqrestore(&pci_lock, flags);
+				data = cfg;
+			}
+
+			if (res) {
+				/* failed to read from HW, clear the result */
+				data = 0;
+			}
+		}
+
+		for (i = 0; i < len; i++) {
+			u64 byte_mask;
+			int j, k;
+
+			if (address + i < faddress ||
+			    address + i >= faddress + flength)
+				continue;
+
+			if (flength > 8) {
+				/*
+				 * A range of identical read-only bytes: the
+				 * override replaces the byte, so drop whatever
+				 * the hardware returned for it first.
+				 */
+				data &= ~((u64)0xff << (i * 8));
+				data |= (val & 0xff) << (i * 8);
+				continue;
+			}
+
+			/* <= 8 bytes, use bit_mask */
+			j = address + i - faddress; /* byte index in register */
+			k = faddress - address; /* register offset in access */
+			byte_mask = bit_mask & BYTE_SHIFT((u64)0xff, j);
+			data &= ~BYTE_SHIFT(byte_mask, k);
+			data |= BYTE_SHIFT(val & byte_mask, k);
+			if (attrib == OVRD_RC)
+				ovrd_reg->val &= ~byte_mask;
+		}
+
+		switch (len) {
+		case 1:
+			*(u8 *)value = (u8)data;
+			break;
+		case 2:
+			*(u16 *)value = (u16)data;
+			break;
+		case 4:
+			*(u32 *)value = (u32)data;
+			break;
+		case 8:
+			*(u64 *)value = data;
+			break;
+		}
+
+		raw_spin_lock_irqsave(&io_hook_lock, lock_flags);
+	}
+
+	raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(read_ovrd_common);
+
+/**
+ * write_ovrd_common - try to absorb a hooked write into the Register Overrides
+ * @spaceid: OVRD_SPACE_MEM, OVRD_SPACE_IO or OVRD_SPACE_PCICONF
+ * @address: virtual address (MEM, translated to physical here), port number
+ *           (IO) or PCI_ENCODE_ADDR() value (PCICONF) being written
+ * @len: access width in bytes; 1, 2, 4 or 8
+ * @data: points to the @len-byte value being written
+ * @bus: struct pci_bus of the access for OVRD_SPACE_PCICONF
+ *
+ * For every Register Override overlapping [@address, @address + @len) the
+ * written bits update the override according to its attribute (OVRD_RW
+ * stores them, OVRD_WC clears the bits written as 1, others are left
+ * untouched). If the override does not cover the whole access the value is
+ * also written to the hardware.
+ *
+ * Returns 0 if at least one override matched, -EINVAL otherwise; the caller
+ * then performs the real hardware access.
+ */
+int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
+{
+	struct list_head *ovrd_list;
+	struct reg_ovrd *ovrd_reg;
+	struct pci_bus *pcib;
+	unsigned long lock_flags = 0, flags = 0;
+	u64 faddress, vaddr = 0;
+	u64 bit_mask, attrib, full_mask;
+	unsigned int devfn = 0, pos = 0;
+	int i, flength, ret;
+	u64 value;
+
+	ret = -EINVAL;
+
+	/*
+	 * Load exactly @len bytes: callers pass the address of an object of
+	 * the access width, so a blanket 64-bit load would read past it.
+	 */
+	switch (len) {
+	case 1:
+		value = *(u8 *)data;
+		break;
+	case 2:
+		value = *(u16 *)data;
+		break;
+	case 4:
+		value = *(u32 *)data;
+		break;
+	case 8:
+		value = *(u64 *)data;
+		break;
+	default:
+		return ret;
+	}
+
+	if (spaceid == OVRD_SPACE_MEM) {
+		/* in the case of memory, 'address' is virtual */
+		vaddr = address;
+		address = v2p(address);
+		ovrd_list = &ovrd_mem_reg_map;
+	} else if (spaceid == OVRD_SPACE_IO) {
+		ovrd_list = &ovrd_io_reg_map;
+	} else if (spaceid == OVRD_SPACE_PCICONF) {
+		devfn = PCI_DECODE_DEVFN(address);
+		pos = PCI_DECODE_POS(address);
+		ovrd_list = &ovrd_pci_conf_reg_map;
+	} else {
+		return ret;
+	}
+
+	raw_spin_lock_irqsave(&io_hook_lock, lock_flags);
+	list_for_each_entry(ovrd_reg, ovrd_list, node) {
+
+		faddress = ovrd_reg->address;
+		flength = ovrd_reg->length;
+		bit_mask = ovrd_reg->bit_mask;
+		attrib = ovrd_reg->attrib;
+
+		if (address >= faddress + flength ||
+		    address + len <= faddress) {
+			/* no overlap, skip */
+			continue;
+		}
+
+		ret = 0;
+
+		/*
+		 * Mask with every bit of a 'flength'-byte register set. Built
+		 * in 64 bits; a plain "1 << flength*8" overflows int as soon
+		 * as flength reaches 4.
+		 */
+		full_mask = flength >= 8 ? ~0ULL : (1ULL << (flength * 8)) - 1;
+
+		if (!(address >= faddress &&
+		      address + len <= faddress + flength &&
+		      (flength > 8 || bit_mask == full_mask))) {
+			/* partially overridden. write to HW for real bits */
+			if (spaceid == OVRD_SPACE_MEM) {
+				mem_write(vaddr, len, data);
+			} else if (spaceid == OVRD_SPACE_IO) {
+				io_write(address, len, data);
+			} else {
+				/*
+				 * OVRD_SPACE_PCICONF; spaceid was validated.
+				 * NOTE(review): as before, the list lock is
+				 * dropped around the config write and re-taken
+				 * mid-walk - confirm entries cannot be removed
+				 * concurrently.
+				 */
+				raw_spin_unlock_irqrestore(&io_hook_lock,
+							   lock_flags);
+				raw_spin_lock_irqsave(&pci_lock, flags);
+				pcib = (struct pci_bus *)bus;
+				pcib->ops->write(pcib, devfn, pos, len,
+						 (u32)value);
+				raw_spin_unlock_irqrestore(&pci_lock, flags);
+				raw_spin_lock_irqsave(&io_hook_lock,
+						      lock_flags);
+			}
+		}
+
+		/* if flength > 8, must be OVRD_RO: nothing to update */
+		if (flength > 8)
+			continue;
+
+		for (i = 0; i < len; i++) {
+			u64 byte_mask, reg_bits;
+			int j, shift;
+
+			if (address + i < faddress ||
+			    address + i >= faddress + flength)
+				continue;
+
+			/* <= 8 bytes, use bit_mask */
+			j = address + i - faddress; /* byte index in register */
+
+			/*
+			 * 'value' is laid out relative to the access while
+			 * ovrd_reg->val is relative to the register start, so
+			 * the written bytes move by (address - faddress) - the
+			 * opposite direction of the read path.
+			 */
+			shift = address - faddress;
+			byte_mask = bit_mask & BYTE_SHIFT((u64)0xff, j);
+			reg_bits = BYTE_SHIFT(value, shift) & byte_mask;
+
+			if (attrib == OVRD_RW) {
+				ovrd_reg->val &= ~byte_mask;
+				ovrd_reg->val |= reg_bits;
+			} else if (attrib == OVRD_WC) {
+				ovrd_reg->val &= ~reg_bits;
+			}
+		}
+	}
+
+	raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(write_ovrd_common);
diff --git a/drivers/misc/iohook/iohook.h b/drivers/misc/iohook/iohook.h
new file mode 100644
index 0000000..46c97be
--- /dev/null
+++ b/drivers/misc/iohook/iohook.h
@@ -0,0 +1,6 @@
+#ifndef _IOHOOK_H
+#define _IOHOOK_H
+
+/* raw_spinlock_t; keeps this header usable regardless of include order */
+#include <linux/spinlock_types.h>
+
+/* Defined in drivers/pci/access.c; taken around PCI config space accesses. */
+extern raw_spinlock_t pci_lock;
+
+#endif
diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 7f8b78c..c1e0a2b 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -15,6 +15,68 @@

DEFINE_RAW_SPINLOCK(pci_lock);

+#ifdef CONFIG_IO_HOOK
+#include <linux/reg_ovrd.h>
+#include <linux/jump_label.h>
+
+/*
+ * pci_bus_read_config_ovrd - look up a config space read in the Register
+ * Overrides.
+ * @value points to an object of @len bytes (1, 2 or 4).
+ *
+ * Returns the result of read_ovrd_common(): 0 when the read was satisfied
+ * from an override (and logged), non-zero when the caller must access the
+ * hardware.
+ */
+int pci_bus_read_config_ovrd(struct pci_bus *bus, unsigned int devfn,
+			     int pos, int len, void *value)
+{
+	u64 address;
+	u32 shown;
+	int ret;
+
+	address = PCI_ENCODE_ADDR(pci_domain_nr(bus), bus->number, devfn, pos);
+
+	ret = read_ovrd_common(OVRD_SPACE_PCICONF, address, len,
+			       value, (void *)bus);
+	if (ret)
+		return ret;
+
+	/* Only @len bytes of *value exist; never load more for the log. */
+	switch (len) {
+	case 1:
+		shown = *(u8 *)value;
+		break;
+	case 2:
+		shown = *(u16 *)value;
+		break;
+	default:
+		shown = *(u32 *)value;
+		break;
+	}
+
+	pr_info("read from %x:%x+%x-%x, ret=%x, val=0x%x\n",
+		bus->number, devfn, pos, len, ret, shown);
+	return ret;
+}
+
+/*
+ * pci_bus_write_config_ovrd - offer a config space write to the Register
+ * Overrides.
+ *
+ * Returns the result of write_ovrd_common(): 0 when the write was absorbed
+ * by an override (and logged), non-zero when the caller must access the
+ * hardware.
+ */
+int pci_bus_write_config_ovrd(struct pci_bus *bus, unsigned int devfn,
+			      int pos, int len, u32 value)
+{
+	u64 address;
+	/*
+	 * write_ovrd_common() loads its data argument as a u64, so hand it a
+	 * full-width copy rather than the address of the u32 parameter.
+	 */
+	u64 data = value;
+	int ret;
+
+	address = PCI_ENCODE_ADDR(pci_domain_nr(bus), bus->number, devfn, pos);
+	ret = write_ovrd_common(OVRD_SPACE_PCICONF, address, len,
+				&data, (void *)bus);
+	if (!ret)
+		pr_info("write to %x:%x+%x-%x, ret=0x%x, val=0x%x\n",
+			bus->number, devfn, pos, len, ret, value);
+	return ret;
+}
+
+
+/*
+ * Hooks placed at the top of the PCI config accessors: when the static key
+ * is on and the override helper returns 0, the enclosing accessor returns 0
+ * without touching the hardware. These stay plain brace blocks because one
+ * of the call sites invokes pci_write_ovrd() without a trailing semicolon.
+ */
+#define pci_read_ovrd(bus, devfn, pos, len, value) \
+{ \
+	if (static_key_false(&ovrdhw_enabled)) { \
+		if (!pci_bus_read_config_ovrd(bus, devfn, pos, \
+					      len, value)) \
+			return 0; \
+	} \
+}
+
+#define pci_write_ovrd(bus, devfn, pos, len, value) \
+{ \
+	if (static_key_false(&ovrdhw_enabled)) { \
+		if (!pci_bus_write_config_ovrd(bus, devfn, pos, \
+					       len, value)) \
+			return 0; \
+	} \
+}
+
+#else
+
+#define pci_read_ovrd(bus, devfn, pos, len, value)
+#define pci_write_ovrd(bus, devfn, pos, len, value)
+
+#endif /* CONFIG_IO_HOOK */
+
/*
* Wrappers for all PCI configuration access functions. They just check
* alignment, do locking and call the low-level functions pointed to
@@ -33,6 +95,7 @@ int pci_bus_read_config_##size \
unsigned long flags; \
u32 data = 0; \
if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \
+ pci_read_ovrd(bus, devfn, pos, len, value); \
raw_spin_lock_irqsave(&pci_lock, flags); \
res = bus->ops->read(bus, devfn, pos, len, &data); \
*value = (type)data; \
@@ -47,6 +110,7 @@ int pci_bus_write_config_##size \
int res; \
unsigned long flags; \
if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \
+ pci_write_ovrd(bus, devfn, pos, len, value) \
raw_spin_lock_irqsave(&pci_lock, flags); \
res = bus->ops->write(bus, devfn, pos, len, value); \
raw_spin_unlock_irqrestore(&pci_lock, flags); \
@@ -152,6 +216,7 @@ int pci_user_read_config_##size \
u32 data = -1; \
if (PCI_##size##_BAD) \
return -EINVAL; \
+ pci_read_ovrd(dev->bus, dev->devfn, pos, sizeof(type), (void *)val);\
raw_spin_lock_irq(&pci_lock); \
if (unlikely(dev->block_cfg_access)) \
pci_wait_cfg(dev); \
@@ -173,6 +238,7 @@ int pci_user_write_config_##size \
int ret = -EIO; \
if (PCI_##size##_BAD) \
return -EINVAL; \
+ pci_write_ovrd(dev->bus, dev->devfn, pos, sizeof(type), val);\
raw_spin_lock_irq(&pci_lock); \
if (unlikely(dev->block_cfg_access)) \
pci_wait_cfg(dev); \
diff --git a/include/linux/reg_ovrd.h b/include/linux/reg_ovrd.h
new file mode 100644
index 0000000..2707d6c
--- /dev/null
+++ b/include/linux/reg_ovrd.h
@@ -0,0 +1,54 @@
+#ifndef __REG_OVRD_H__
+#define __REG_OVRD_H__
+
+#include <linux/types.h>
+#include <linux/spinlock_types.h>
+
+/* Access behaviour of the overridden bits */
+enum ovrd_attrib {
+	OVRD_RW,	/* readwrite */
+	OVRD_RO,	/* readonly */
+	OVRD_RC,	/* read clear */
+	OVRD_WC,	/* write clear */
+	OVRD_TC,	/* trace only */
+};
+
+/*
+ * struct reg_ovrd - one Register Override
+ *
+ * node     - links the override into the list of its address space
+ * address  - starting phys address of the h/w register
+ * val      - length <= 8: (val & bit_mask) is the overridden value.
+ *            length > 8: a range of bytes is overridden with one readonly
+ *            value; attrib must be OVRD_RO and (val & 0xff) is that
+ *            contiguous readonly value.
+ * bit_mask - length <= 8: which bits are being overridden.
+ *            length > 8: unused.
+ * length   - number of bytes to be overridden
+ * attrib   - length <= 8: common attribute of the overridden bits matching
+ *            bit_mask. length > 8: must be OVRD_RO.
+ */
+struct reg_ovrd {
+	struct list_head node;
+	u64 address;
+	u64 val;
+	u64 bit_mask;
+	u32 length;
+	u8 attrib;
+};
+
+/* address space id */
+#define OVRD_SPACE_IO		0
+#define OVRD_SPACE_MEM		1
+#define OVRD_SPACE_PCICONF	2
+
+/*
+ * PCI config space addresses are packed into a u64:
+ * domain in bits 32 and up, bus from bit 20, devfn from bit 12, register
+ * offset in the low 12 bits.
+ */
+#define PCI_ENCODE_ADDR(domain, bus, devfn, pos) \
+	((((u64)(domain)) << 32) | ((bus) << 20) | ((devfn) << 12) | (pos))
+#define PCI_DECODE_POS(x)	((u16)((x) & ((1 << 12) - 1)))
+#define PCI_DECODE_DEVFN(x)	((u8)(((x) >> 12) & 0xff))
+#define PCI_DECODE_BUSN(x)	((u8)(((x) >> 20) & 0xff))
+#define PCI_DECODE_DOMAIN(x)	((u32)((x) >> 32))
+
+extern struct static_key ovrdhw_enabled;
+extern int read_ovrd_common(int spaceid, u64 address, int len, void *value,
+			    void *bus);
+extern int write_ovrd_common(int spaceid, u64 address, int len, void *data,
+			     void *bus);
+
+#endif /* __REG_OVRD_H__ */
--
1.7.5.4

2014-04-15 13:59:31

by Rui Wang

[permalink] [raw]
Subject: [PATCH v3 4/5] I/O Hook: Override MSRs while triggering MCEs

MSRs can be overridden by specifying a cpu id and an MSR number.

bash# echo "0:17a-8[4/4]ro" > msr

would override MSR 0x17a on cpu0. Use 'all' as the cpu id to override
the MSR on all CPUs. e.g. "all:17a-8[4/4]ro"

When specifying the interrupt to be triggered, a positive number
denotes an IRQ and a negative number denotes a vector, thus

bash# echo -18 > irq

specifies that a machine check exception (int 18) will be triggered.

Signed-off-by: Rui Wang <[email protected]>
---
Documentation/PCI/iohook.txt | 31 ++++++--
arch/x86/include/asm/msr.h | 100 ++++++++++++++++++++-
arch/x86/lib/msr-smp.c | 31 +++++++
drivers/misc/iohook/iohook.c | 195 +++++++++++++++++++++++++++++++++++++++---
include/linux/reg_ovrd.h | 1 +
5 files changed, 335 insertions(+), 23 deletions(-)

diff --git a/Documentation/PCI/iohook.txt b/Documentation/PCI/iohook.txt
index ae9b4ad..9b3f232 100644
--- a/Documentation/PCI/iohook.txt
+++ b/Documentation/PCI/iohook.txt
@@ -52,11 +52,12 @@ a directory subtree is created under /sys/kernel/debug/iohook

bash# cd /sys/kernel/debug/iohook
bash# ls
-io irq mem pciconf trigger
+io irq mem msr pciconf trigger

Each file is used to manage a type of resource.
'io' is used to add/show Register Overrides in IO port space.
'mem' is used to add/show Register Overrides in memory space.
+'msr' is used to add/show Register Overrides in MSR space.
'pciconf' is used to add/show Register Overrides in pci config space.
'irq' is used to set the desired IRQ to be triggered via IPI.
'trigger' is used to turn on/off the I/O Hook.
@@ -74,8 +75,15 @@ for a Register Override in PCI config space, it's specified as:

domain|bus:dev.func+offset-length[value/mask]attribute

+for a Register Override in MSR space, it's specified as:
+
+ cpuid:regnum-length[value/mask]attribute
+
where
address - the 64bit address of the h/w register to be overridden
+ cpuid - the cpu on which the MSR specified by regnum is to be
+ overridden. Use 'all' as the cpuid to specify all cpus.
+ regnum - the MSR to be overridden.

length - the number of bytes affected. Affected here means that
at least one bit in that byte is overridden. For 'length'
@@ -124,24 +132,35 @@ The syntax is "domain|bus:dev.func+offset-length[value/mask]attribute"
The first register overrides only bit0 and the second register overrides the
first 2 bytes (mask == 0xffff), with an initial value of 0x0500.

+As another example, MSRs can be overridden by specifying a cpu id and an MSR
+number:
+
+bash# echo "0:17a-8[4/4]ro" > msr
+The above specifies that bit2 of MSR 0x17a on cpu0 is overridden as 1 (bit set).
+
Register Overrides are disabled when added. They can be enabled by using the
'trigger' file. See below.

2.3 Add IRQ and enable the Register Overrides

-To specify an IRQ to be triggered via IPI, just echo the IRQ number in decimal
-to the 'irq' file. For example:
+To specify an interrupt to be triggered via IPI, just echo the IRQ or vector
+number in decimal to the 'irq' file. A positive number denotes an IRQ and a
+negative number denotes a vector. For example:

bash# cd /sys/kernel/debug/iohook
bash# echo 9 > irq
This specifies that IRQ9 be triggered after the Register Overrides are enabled.

-To enable the Register Overrides in the kernel:
+bash# echo -18 > irq
+specifies that a machine check exception (vector 18) will be triggered.
+
+Register Overrides are disabled when added. Use the 'trigger' file to enable
+them all:

bash# echo 1 > trigger

-This immediately enables all the Register Overrides and if an IRQ number was
-specified, generate the IPI.
+This immediately enables all the Register Overrides and if an IRQ (or vector)
+number was specified, generate the IPI.

To disable the Register Overrides in the kernel:

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index de36f22..cfbfc91 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -57,7 +57,69 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
#define EAX_EDX_RET(val, low, high) "=A" (val)
#endif

-static inline unsigned long long native_read_msr(unsigned int msr)
+#if !defined(NO_IO_HOOK) && defined(CONFIG_IO_HOOK)
+#ifndef CC_HAVE_ASM_GOTO
+#error "gcc 4.5 feature CC_HAVE_ASM_GOTO is required"
+#endif
+struct static_key;
+#include <asm/jump_label.h>
+#include <linux/reg_ovrd.h>
+
+#define msr_read_ovrd(addr) \
+{ \
+ u64 val;\
+ if (arch_static_branch(&ovrdhw_enabled) \
+ && !read_ovrd_common(OVRD_SPACE_MSR, (u64)addr, \
+ sizeof(u64), &val, NULL)) \
+ return val; \
+}
+
+#define msr_read_ovrd_safe(addr, err) \
+{ \
+ u64 val;\
+ if (arch_static_branch(&ovrdhw_enabled)) { \
+ struct msr_regs_info rv; \
+ /* Use rv.reg to tell from *_safe_regs() */ \
+ rv.regs = NULL; \
+ if (!read_ovrd_common(OVRD_SPACE_MSR, (u64)addr,\
+ sizeof(u64), &val, &rv)) { \
+ *err = rv.err; \
+ return val; \
+ } \
+ } \
+}
+#define msr_write_ovrd_safe(addr, low, high) \
+{ \
+ u64 val;\
+ if (arch_static_branch(&ovrdhw_enabled)) { \
+ struct msr_regs_info rv; \
+ /* Use rv.reg to tell from *_safe_regs() */ \
+ rv.regs = NULL; \
+ val = (low) | ((u64)(high) << 32); \
+ if (!write_ovrd_common(OVRD_SPACE_MSR, addr, \
+ sizeof(u64), &val, &rv)) \
+ return rv.err; \
+ } \
+}
+
+#define msr_write_ovrd(addr, low, high) \
+{ \
+ u64 val;\
+ if (arch_static_branch(&ovrdhw_enabled)) { \
+ val = (low) | ((u64)(high) << 32); \
+ if (!write_ovrd_common(OVRD_SPACE_MSR, addr, \
+ sizeof(u64), &val, NULL)) \
+ return; \
+ } \
+}
+#else /* CONFIG_IO_HOOK */
+#define msr_read_ovrd(addr)
+#define msr_read_ovrd_safe(addr, err)
+#define msr_write_ovrd(addr, low, high)
+#define msr_write_ovrd_safe(addr, low, high)
+#endif /* CONFIG_IO_HOOK */
+
+static inline unsigned long long native_do_read_msr(unsigned int msr)
{
DECLARE_ARGS(val, low, high);

@@ -65,7 +127,14 @@ static inline unsigned long long native_read_msr(unsigned int msr)
return EAX_EDX_VAL(val, low, high);
}

-static inline unsigned long long native_read_msr_safe(unsigned int msr,
+static inline unsigned long long native_read_msr(unsigned int msr)
+{
+
+ msr_read_ovrd(msr);
+ return native_do_read_msr(msr);
+}
+
+static inline unsigned long long native_do_read_msr_safe(unsigned int msr,
int *err)
{
DECLARE_ARGS(val, low, high);
@@ -81,14 +150,27 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
return EAX_EDX_VAL(val, low, high);
}

-static inline void native_write_msr(unsigned int msr,
+static inline unsigned long long native_read_msr_safe(unsigned int msr,
+ int *err)
+{
+ msr_read_ovrd_safe(msr, err);
+ return native_do_read_msr_safe(msr, err);
+}
+
+static inline void native_do_write_msr(unsigned int msr,
unsigned low, unsigned high)
{
asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
}

-/* Can be uninlined because referenced by paravirt */
-notrace static inline int native_write_msr_safe(unsigned int msr,
+static inline void native_write_msr(unsigned int msr,
+ unsigned low, unsigned high)
+{
+ msr_write_ovrd(msr, low, high);
+ native_do_write_msr(msr, low, high);
+}
+
+static inline int native_do_write_msr_safe(unsigned int msr,
unsigned low, unsigned high)
{
int err;
@@ -105,6 +187,14 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
return err;
}

+/*
+ * Can be uninlined because referenced by paravirt.
+ *
+ * Offers the write to the I/O Hook first; the hook macro returns from this
+ * function when an override absorbs it. Kept notrace, like the function it
+ * replaces.
+ */
+notrace static inline int native_write_msr_safe(unsigned int msr,
+					unsigned low, unsigned high)
+{
+	msr_write_ovrd_safe(msr, low, high);
+	return native_do_write_msr_safe(msr, low, high);
+}
+
extern unsigned long long native_read_tsc(void);

extern int rdmsr_safe_regs(u32 regs[8]);
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index 518532e..6a5f177 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -128,6 +128,33 @@ void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
}
EXPORT_SYMBOL(rdmsr_on_cpus);

+#if !defined(NO_IO_HOOK) && defined(CONFIG_IO_HOOK)
+
+#define rdmsr_safe_regs_ovrd(rv) \
+{ \
+ u64 val;\
+ if (arch_static_branch(&ovrdhw_enabled)) { \
+ if (!read_ovrd_common(OVRD_SPACE_MSR, 0, \
+ sizeof(u64), &val, rv)) \
+ return; \
+ } \
+}
+
+#define wrmsr_safe_regs_ovrd(rv) \
+{ \
+ u64 val;\
+ if (arch_static_branch(&ovrdhw_enabled)) { \
+ if (!write_ovrd_common(OVRD_SPACE_MSR, 0, \
+ sizeof(u64), &val, rv)) \
+ return; \
+ } \
+}
+
+#else
+#define rdmsr_safe_regs_ovrd(rv)
+#define wrmsr_safe_regs_ovrd(rv)
+#endif
+
/*
* wrmsr on a bunch of CPUs
*
@@ -229,6 +256,8 @@ static void __rdmsr_safe_regs_on_cpu(void *info)
{
struct msr_regs_info *rv = info;

+ rdmsr_safe_regs_ovrd(rv);
+
rv->err = rdmsr_safe_regs(rv->regs);
}

@@ -236,6 +265,8 @@ static void __wrmsr_safe_regs_on_cpu(void *info)
{
struct msr_regs_info *rv = info;

+ wrmsr_safe_regs_ovrd(rv);
+
rv->err = wrmsr_safe_regs(rv->regs);
}

diff --git a/drivers/misc/iohook/iohook.c b/drivers/misc/iohook/iohook.c
index 5584eb7..f81f553 100644
--- a/drivers/misc/iohook/iohook.c
+++ b/drivers/misc/iohook/iohook.c
@@ -22,6 +22,7 @@
#include <asm/hw_irq.h>
#include <linux/reg_ovrd.h>
#include <linux/pci.h>
+#include <linux/smp.h>
#include "iohook.h"

MODULE_LICENSE("GPL");
@@ -35,6 +36,7 @@ static DEFINE_RAW_SPINLOCK(engine_lock);
LIST_HEAD(ovrd_io_reg_map);
LIST_HEAD(ovrd_mem_reg_map);
LIST_HEAD(ovrd_pci_conf_reg_map);
+LIST_HEAD(ovrd_msr_map);

struct static_key ovrdhw_enabled = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL(ovrdhw_enabled);
@@ -52,8 +54,10 @@ struct reg_ovrd *iohook_query_ovrd(int spaceid, int idx)
reg_ovrd = &ovrd_mem_reg_map;
else if (spaceid == OVRD_SPACE_IO)
reg_ovrd = &ovrd_io_reg_map;
- else
+ else if (spaceid == OVRD_SPACE_PCICONF)
reg_ovrd = &ovrd_pci_conf_reg_map;
+ else
+ reg_ovrd = &ovrd_msr_map;

regentry = NULL;
raw_spin_lock_irqsave(&io_hook_lock, lock_flags);
@@ -80,8 +84,10 @@ void iohook_cleanup_ovrd(int spaceid)
ovrd_list = &ovrd_mem_reg_map;
else if (spaceid == OVRD_SPACE_IO)
ovrd_list = &ovrd_io_reg_map;
- else
+ else if (spaceid == OVRD_SPACE_PCICONF)
ovrd_list = &ovrd_pci_conf_reg_map;
+ else
+ ovrd_list = &ovrd_msr_map;

list_for_each_safe(tmp, next, ovrd_list) {
struct reg_ovrd *ovrdreg =
@@ -103,8 +109,10 @@ void iohook_add_ovrd(int spaceid, u64 address, u64 value, u64 mask,
reg_ovrd = &ovrd_mem_reg_map;
else if (spaceid == OVRD_SPACE_IO)
reg_ovrd = &ovrd_io_reg_map;
- else
+ else if (spaceid == OVRD_SPACE_PCICONF)
reg_ovrd = &ovrd_pci_conf_reg_map;
+ else
+ reg_ovrd = &ovrd_msr_map;

raw_spin_lock_irqsave(&io_hook_lock, lock_flags);
list_for_each_entry(ovrdreg, reg_ovrd, node) {
@@ -353,6 +361,20 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
devfn = PCI_DECODE_DEVFN(address);
pos = PCI_DECODE_POS(address);
ovrd_list = &ovrd_pci_conf_reg_map;
+ } else if (spaceid == OVRD_SPACE_MSR) {
+ unsigned int cpuid;
+
+ ovrd_list = &ovrd_msr_map;
+ cpuid = smp_processor_id();
+
+ /*
+ * MSRs are 64bit wide, so x8 to form a contiguous
+ * address space
+ */
+ address *= 8;
+
+ /* upper 32bits contain the cpu id */
+ address |= (u64)cpuid << 32;
} else {
return ret;
}
@@ -392,6 +414,29 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
res = pcib->ops->read(pcib, devfn, pos, len,
(u32 *)&data);
raw_spin_unlock_irqrestore(&pci_lock, flags);
+ } else if (spaceid == OVRD_SPACE_MSR) {
+ struct msr_regs_info *rv;
+ unsigned int msr;
+
+ rv = (struct msr_regs_info *)bus;
+ msr = address & 0xffffffff;
+ if (rv && rv->regs) { /* rdmsr_safe_regs() */
+ u32 low, high;
+
+ rv->err = rdmsr_safe_regs(rv->regs);
+ low = rv->regs[0]; /* eax */
+ high = rv->regs[2]; /* edx */
+ data = low | ((u64)high << 32);
+ res = rv->err;
+ } else if (rv) { /* rv->regs == NULL */
+ data = native_do_read_msr_safe(msr,
+ &rv->err);
+ res = rv->err;
+ } else { /* rv == NULL */
+ data = native_do_read_msr(msr);
+ res = 0;
+ }
+
} else
goto out;

@@ -486,6 +531,20 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
devfn = PCI_DECODE_DEVFN(address);
pos = PCI_DECODE_POS(address);
ovrd_list = &ovrd_pci_conf_reg_map;
+ } else if (spaceid == OVRD_SPACE_MSR) {
+ unsigned int cpuid;
+
+ cpuid = smp_processor_id();
+
+ /*
+ * MSRs are 64bit wide, so x8 to form a contiguous
+ * address space
+ */
+ address *= 8;
+
+ /* upper 32bits contain the cpu id */
+ address |= (u64)cpuid << 32;
+ ovrd_list = &ovrd_msr_map;
} else {
return ret;
}
@@ -525,6 +584,23 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
raw_spin_unlock_irqrestore(&pci_lock, flags);
raw_spin_lock_irqsave(&io_hook_lock,
lock_flags);
+ } else if (spaceid == OVRD_SPACE_MSR) {
+ struct msr_regs_info *rv;
+ unsigned int msr, low, high;
+
+ rv = (struct msr_regs_info *)bus;
+ msr = address & 0xffffffff;
+ low = value & 0xffffffff;
+ high = value >> 32;
+
+ if (rv && rv->regs) { /* wrmsr_safe_regs() */
+ rv->err = wrmsr_safe_regs(rv->regs);
+ } else if (rv) { /* rv->regs == NULL */
+ rv->err = native_do_write_msr_safe(msr,
+ low, high);
+ } else { /* rv == NULL */
+ native_do_write_msr(msr, low, high);
+ }
} else
break;
}
@@ -566,12 +642,22 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
}
EXPORT_SYMBOL(write_ovrd_common);

-static void trigger_irq_by_ipi(unsigned long irqnum)
+static void trigger_irq_by_ipi(long irqnum)
{
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;

+ if (irqnum < 0) { /* vector number */
+ pr_info("sending IPI to vector:%ld\n", -irqnum);
+ if (-irqnum == 18)
+ apic->send_IPI_all(-irqnum);
+ else
+ apic->send_IPI_self(-irqnum);
+ pr_info("Returned from sending IPI to vector:%ld\n", -irqnum);
+ return;
+ }
+
desc = irq_to_desc(irqnum);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
@@ -637,7 +723,11 @@ ssize_t hook_irq_read(struct file *file, char __user *ubuf, size_t cnt,
char buf[64]; /* big enough to hold a number */
int r;

- r = sprintf(buf, "%u\n", g_irq_num);
+ if (g_irq_num >= 0)
+ r = sprintf(buf, "irq:%u\n", g_irq_num);
+ else
+ r = sprintf(buf, "vector:%u\n", -g_irq_num);
+
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

@@ -645,7 +735,7 @@ static ssize_t hook_irq_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
char *irq = NULL;
- unsigned long irqnum, ret;
+ long irqnum, ret;

irq = kmalloc(count+1, GFP_ATOMIC);
if (!irq) {
@@ -655,13 +745,19 @@ static ssize_t hook_irq_write(struct file *file, const char __user *user_buf,
irq[count] = '\0';
if (copy_from_user(irq, user_buf, count))
goto end;
- ret = kstrtoul(irq, 10, &irqnum);
+
+ ret = kstrtol(irq, 10, &irqnum);
if (ret) {
- pr_err("bogus irq number? %s\n", irq);
+ pr_err("bogus irq/vector number? %s\n", irq);
goto end;
}
- if (irqnum > NR_IRQS) {
- pr_err("irq num too big: %lu\n", irqnum);
+
+ /*
+ * if irqnum < 0 it's a vector number
+ * if irqnum > 0 it's a IRQ number
+ */
+ if (irqnum <= -NR_VECTORS || irqnum >= NR_IRQS) {
+ pr_err("irq/vector num too big: %ld\n", irqnum);
goto end;
}

@@ -733,6 +829,13 @@ hook_seq_show(struct seq_file *s, void *it)
regmap->address, regmap->length, regmap->val,
regmap->bit_mask, hook_attrib(regmap->attrib));
break;
+ case OVRD_SPACE_MSR:
+ seq_printf(s,
+ "cpu%d msr: 0x%x length: 0x%x value: 0x%llx mask: 0x%llx attrib: %s\n",
+ (u32)(regmap->address >> 32), (u32)(regmap->address)/8,
+ regmap->length, regmap->val, regmap->bit_mask,
+ hook_attrib(regmap->attrib));
+ break;
case OVRD_SPACE_PCICONF:
seq_printf(s,
"pciconf: 0x%04x|%02x:%02x.%02x offset: 0x%x lenght: 0x%x value: 0x%llx mask: 0x%llx attrib: %s\n",
@@ -772,6 +875,19 @@ hook_io_open(struct inode *inode, struct file *file)
}

static int
+hook_msr_open(struct inode *inode, struct file *file)
+{
+ struct hook_iter *iter;
+
+ iter = __seq_open_private(file, &hook_seq_ops,
+ sizeof(struct hook_iter));
+ if (iter)
+ iter->spaceid = OVRD_SPACE_MSR;
+
+ return iter ? 0 : -ENOMEM;
+}
+
+static int
hook_mem_open(struct inode *inode, struct file *file)
{
struct hook_iter *iter;
@@ -817,12 +933,12 @@ hook_parse_entry(char *entry, int spaceid)
{
char *field;
u64 address, length, value, mask;
- unsigned long domain, bus, dev, func;
+ unsigned long domain, bus, dev, func, cpu;
int attrib, ret;
u16 cval;

if (spaceid != OVRD_SPACE_PCICONF)
- goto mem_io;
+ goto msr;

field = strsep(&entry, "|");
ret = kstrtoul(field, 16, &domain);
@@ -843,6 +959,20 @@ hook_parse_entry(char *entry, int spaceid)
ret = kstrtoul(field, 16, &func);
if (ret || !entry)
return -1;
+msr:
+ if (spaceid != OVRD_SPACE_MSR)
+ goto mem_io;
+ field = strsep(&entry, ":");
+ if (strcasecmp(field, "all") == 0) {
+ /* All cpus */
+ cpu = (unsigned long)-1;
+ ret = 0;
+ } else {
+ ret = kstrtoul(field, 16, &cpu);
+ }
+ pr_info("parse_hook_entry() cpu=0x%lx\n", cpu);
+ if (ret || !entry)
+ return -1;

mem_io:
field = strsep(&entry, "-");
@@ -850,6 +980,13 @@ mem_io:
if (ret || !entry)
return -1;

+ /*
+ * MSRs are 64bit wide, so x8 to form a contiguous
+ * address space
+ */
+ if (spaceid == OVRD_SPACE_MSR)
+ address *= 8;
+
pr_info("parse_hook_entry() address=0x%llx\n", address);

field = strsep(&entry, "[");
@@ -888,6 +1025,17 @@ mem_io:
if (spaceid == OVRD_SPACE_PCICONF) {
address = PCI_ENCODE_ADDR(domain, bus, PCI_DEVFN(dev, func),
address);
+ } else if (spaceid == OVRD_SPACE_MSR) {
+ if (cpu == (unsigned long)-1) { /* all cpus */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ address &= 0xffffffff;
+ address |= cpu << 32;
+ iohook_add_ovrd(spaceid, address, value, mask,
+ length, attrib);
+ }
+ return 0;
+ }
+ address |= cpu << 32;
}

iohook_add_ovrd(spaceid, address, value, mask, length, attrib);
@@ -945,6 +1093,14 @@ hook_io_write(struct file *file, const char __user *user_buf,
}

static ssize_t
+hook_msr_write(struct file *file, const char __user *user_buf,
+ size_t user_len, loff_t *offset)
+{
+
+ return hook_write(file, user_buf, user_len, offset, OVRD_SPACE_MSR);
+}
+
+static ssize_t
hook_mem_write(struct file *file, const char __user *user_buf,
size_t user_len, loff_t *offset)
{
@@ -968,6 +1124,14 @@ static const struct file_operations hook_io_fops = {
.write = hook_io_write,
};

+static const struct file_operations hook_msr_fops = {
+ .open = hook_msr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+ .write = hook_msr_write,
+};
+
static const struct file_operations hook_mem_fops = {
.open = hook_mem_open,
.read = seq_read,
@@ -1007,11 +1171,18 @@ static int __init iohook_init(void)
if (hook_irq_dentry == NULL)
return -ENODEV;

+#ifdef CONFIG_X86
hook_reg_dentry = debugfs_create_file("io", S_IWUSR,
root, NULL, &hook_io_fops);
if (hook_reg_dentry == NULL)
return -ENODEV;

+ hook_reg_dentry = debugfs_create_file("msr", S_IWUSR,
+ root, NULL, &hook_msr_fops);
+ if (hook_reg_dentry == NULL)
+ return -ENODEV;
+#endif
+
hook_reg_dentry = debugfs_create_file("mem", S_IWUSR,
root, NULL, &hook_mem_fops);
if (hook_reg_dentry == NULL)
diff --git a/include/linux/reg_ovrd.h b/include/linux/reg_ovrd.h
index 2707d6c..ce655fe 100644
--- a/include/linux/reg_ovrd.h
+++ b/include/linux/reg_ovrd.h
@@ -37,6 +37,7 @@ struct reg_ovrd {
#define OVRD_SPACE_IO 0
#define OVRD_SPACE_MEM 1
#define OVRD_SPACE_PCICONF 2
+#define OVRD_SPACE_MSR 3

#define PCI_ENCODE_ADDR(domain, bus, devfn, pos) \
(((u64)(domain))<<32|(bus)<<20|(devfn)<<12|(pos))
--
1.7.5.4

2014-04-15 13:59:38

by Rui Wang

[permalink] [raw]
Subject: [PATCH v3 5/5] IO Hook: Tracing hw access

A new attribute "tc" can be defined for any Register Override so that I/O Hook
can report each read from (or write to) the specified area in the mem, io, msr,
and pciconf spaces. A trace event named 'iohook_event' is defined to report the
accesses. For example, either
echo "0x383ffff00000-4000[0/0]tc" > /sys/kernel/debug/tracing/iohook/mem
or the boot parameters
trace_event=iohook_event iohook.mem="383ffff00000-4000[0/0]tc"
in grub can be used to trace any access to the physical address region
starting from 0x383ffff00000 with a length of 0x4000. A typical trace event
output looks like the following:

bash $ sudo cat /sys/kernel/debug/tracing/trace
...
swapper/0-1 [018] .... 11.594157: iohook_event: pci_read_config(0000:00:14.00, 0x6, 0x2) <== 0x290

swapper/0-1 [018] .... 11.599914: iohook_event: pci_read_config(0000:00:14.00, 0x34, 0x1) <== 0x70

modprobe-613 [013] d... 12.447408: iohook_event: pci_write_config(0000:00:14.00, 0x4, 0x2) ==> 0x6

modprobe-613 [013] .... 12.452565: iohook_event: pci_read_config(0000:00:14.00, 0xd, 0x1) <== 0x0

modprobe-613 [013] d... 12.458302: iohook_event: pci_write_config(0000:00:14.00, 0xd, 0x1) ==> 0x40

modprobe-613 [013] .... 12.477935: iohook_event: read(0x383ffff00000, 0x4) <== 0x1000080

modprobe-613 [013] .... 12.477936: iohook_event: read(0x383ffff00018, 0x4) <== 0x2000

modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02030, 0x4) ==> 0x37822000

modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02034, 0x4) ==> 0x0
...

Signed-off-by: Rui Wang <[email protected]>
---
Documentation/PCI/iohook.txt | 54 +++++++-
drivers/misc/iohook/iohook.c | 319 +++++++++++++++++++++++++++++++----------
include/trace/events/iohook.h | 58 ++++++++
3 files changed, 352 insertions(+), 79 deletions(-)
create mode 100644 include/trace/events/iohook.h

diff --git a/Documentation/PCI/iohook.txt b/Documentation/PCI/iohook.txt
index 9b3f232..8e0d2aa 100644
--- a/Documentation/PCI/iohook.txt
+++ b/Documentation/PCI/iohook.txt
@@ -104,7 +104,10 @@ where

attribute - used to specify the attribute of the overridden bits.
It can be ro, rw, wc, rc to mean read-only, read-write,
- write-clear, and read-clear respectively.
+ write-clear, and read-clear respectively. A special
+ attribute, tc, is defined to trace hw access through the
+ trace event named iohook_event. When tc is used as the
+ attribute, value/mask are ignored.

domain - pci domain number
bus - pci bus number
@@ -299,3 +302,52 @@ bash # echo 1 > trigger
dmesg shows:
pcieport 0000:00:05.0: AER: Corrected error received: id=0500

+3.4 Tracing hardware access by the OS
+
+I/O Hook can be used to trace access to hw registers by the OS. When compared
+to mmio trace, I/O Hook tracing has both benefits and shortcomings:
+ a) I/O Hook is SMP safe while mmio trace isn't.
+ b) mmio trace can trace arbitrary assembly instructions accessing
+ arbitrary addresses, while I/O Hook can only trace standard
+ functions like readl(), inb(), pci_read_config_byte(), etc.
+ c) I/O Hook can trace I/O ports and PCI config space, while mmio trace
+ cannot.
+
+A trace event named 'iohook_event' is defined to report the accesses. The
+special attribute tc is used to mark the registers to be traced.
+For example, to trace access to the physical address region starting from
+0x383ffff00000 with a length of 0x4000, use the following command sequence:
+
+bash # echo "0x383ffff00000-4000[0/0]tc" > /sys/kernel/debug/tracing/iohook/mem
+bash # echo 1 > /sys/kernel/debug/iohook/trigger
+
+To trace hw access during boot, the following boot parameters can be used in
+grub:
+ iohook.mem, iohook.io, iohook.pciconf, iohook.msr
+For example, passing
+ trace_event=iohook_event iohook.mem="383ffff00000-4000[0/0]tc"
+as boot parameters in grub traces the same physical address region
+starting from 0x383ffff00000. A typical trace event output looks like the
+following:
+
+bash $ sudo cat /sys/kernel/debug/tracing/trace
+...
+swapper/0-1 [018] .... 11.594157: iohook_event: pci_read_config(0000:00:14.00, 0x6, 0x2) <== 0x290
+
+swapper/0-1 [018] .... 11.599914: iohook_event: pci_read_config(0000:00:14.00, 0x34, 0x1) <== 0x70
+
+modprobe-613 [013] d... 12.447408: iohook_event: pci_write_config(0000:00:14.00, 0x4, 0x2) ==> 0x6
+
+modprobe-613 [013] .... 12.452565: iohook_event: pci_read_config(0000:00:14.00, 0xd, 0x1) <== 0x0
+
+modprobe-613 [013] d... 12.458302: iohook_event: pci_write_config(0000:00:14.00, 0xd, 0x1) ==> 0x40
+
+modprobe-613 [013] .... 12.477935: iohook_event: read(0x383ffff00000, 0x4) <== 0x1000080
+
+modprobe-613 [013] .... 12.477936: iohook_event: read(0x383ffff00018, 0x4) <== 0x2000
+
+modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02030, 0x4) ==> 0x37822000
+
+modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02034, 0x4) ==> 0x0
+...
+
diff --git a/drivers/misc/iohook/iohook.c b/drivers/misc/iohook/iohook.c
index f81f553..af8b2cf 100644
--- a/drivers/misc/iohook/iohook.c
+++ b/drivers/misc/iohook/iohook.c
@@ -25,6 +25,9 @@
#include <linux/smp.h>
#include "iohook.h"

+#define CREATE_TRACE_POINTS
+#include <trace/events/iohook.h>
+
MODULE_LICENSE("GPL");

static struct dentry *hook_irq_dentry;
@@ -42,6 +45,37 @@ struct static_key ovrdhw_enabled = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL(ovrdhw_enabled);

int g_ovrd_on;
+
+char *hook_func_names[] = {
+ "read",
+ "write",
+ "in",
+ "out",
+ "pci_read_config",
+ "pci_write_config",
+ "rdmsr",
+ "wrmsr"
+};
+
+const char *iohook_trace_parse_addr(struct trace_seq *p, int type, u64 address)
+{
+ const char *ret = p->buffer + p->len;
+
+ if (type == PCI_RD || type == PCI_WR)
+ trace_seq_printf(p, "%04x:%02x:%02x.%02x, 0x%x",
+ PCI_DECODE_DOMAIN(address),
+ PCI_DECODE_BUSN(address),
+ PCI_SLOT(PCI_DECODE_DEVFN(address)),
+ PCI_FUNC(PCI_DECODE_DEVFN(address)),
+ PCI_DECODE_POS(address));
+ else
+ trace_seq_printf(p, "0x%llx", address);
+ trace_seq_putc(p, 0);
+
+ return ret;
+
+}
+
/* query a Register Override given spaceid and index */
struct reg_ovrd *iohook_query_ovrd(int spaceid, int idx)
{
@@ -330,6 +364,85 @@ static u64 v2p(u64 vaddr)
return slow_virt_to_phys((void *)vaddr);
}

+static int pci_rd(struct pci_bus *pcib, u64 address, int len, void *data)
+{
+ unsigned int devfn = 0, pos = 0;
+ unsigned long flags;
+ int res;
+
+ devfn = PCI_DECODE_DEVFN(address);
+ pos = PCI_DECODE_POS(address);
+
+ raw_spin_lock_irqsave(&pci_lock, flags);
+ res = pcib->ops->read(pcib, devfn, pos, len,
+ (u32 *)data);
+ raw_spin_unlock_irqrestore(&pci_lock, flags);
+ return res;
+}
+
+static int pci_wr(struct pci_bus *pcib, u64 address, int len, u32 value)
+{
+ unsigned int devfn = 0, pos = 0;
+ unsigned long flags;
+ int res;
+
+ devfn = PCI_DECODE_DEVFN(address);
+ pos = PCI_DECODE_POS(address);
+
+ raw_spin_lock_irqsave(&pci_lock, flags);
+ res = pcib->ops->write(pcib, devfn, pos, len, value);
+ raw_spin_unlock_irqrestore(&pci_lock, flags);
+ return res;
+}
+
+static int msr_rd(void *bus, u64 address, u64 *data)
+{
+ struct msr_regs_info *rv;
+ unsigned int msr;
+ int res;
+
+ rv = (struct msr_regs_info *)bus;
+ msr = address & 0xffffffff;
+ if (rv && rv->regs) { /* rdmsr_safe_regs() */
+ u32 low, high;
+
+ rv->err = rdmsr_safe_regs(rv->regs);
+ low = rv->regs[0]; /* eax */
+ high = rv->regs[2]; /* edx */
+ *data = low | ((u64)high << 32);
+ res = rv->err;
+ } else if (rv) { /* rv->regs == NULL */
+ *data = native_do_read_msr_safe(msr,
+ &rv->err);
+ res = rv->err;
+ } else { /* rv == NULL */
+ *data = native_do_read_msr(msr);
+ res = 0;
+ }
+
+ return res;
+}
+
+static void msr_wr(void *bus, u64 address, u64 value)
+{
+ struct msr_regs_info *rv;
+ unsigned int msr, low, high;
+
+ rv = (struct msr_regs_info *)bus;
+ msr = address & 0xffffffff;
+ low = value & 0xffffffff;
+ high = value >> 32;
+
+ if (rv && rv->regs) { /* wrmsr_safe_regs() */
+ rv->err = wrmsr_safe_regs(rv->regs);
+ } else if (rv) { /* rv->regs == NULL */
+ rv->err = native_do_write_msr_safe(msr,
+ low, high);
+ } else { /* rv == NULL */
+ native_do_write_msr(msr, low, high);
+ }
+}
+
/* shift left if i>=0, otherwise shift right */
#define BYTE_SHIFT(value, i) \
((i) >= 0 ? (value) << (i)*8 : (value) >> (-i)*8)
@@ -338,11 +451,9 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
{
struct list_head *ovrd_list;
struct reg_ovrd *ovrd_reg;
- struct pci_bus *pcib;
- unsigned long lock_flags = 0, flags = 0;
+ unsigned long lock_flags = 0;
u64 faddress, vaddr = 0;
u64 data, bit_mask, attrib, val;
- unsigned int devfn = 0, pos = 0;
int i, flength, res, ret;

ret = -EINVAL;
@@ -358,8 +469,6 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
} else if (spaceid == OVRD_SPACE_IO) {
ovrd_list = &ovrd_io_reg_map;
} else if (spaceid == OVRD_SPACE_PCICONF) {
- devfn = PCI_DECODE_DEVFN(address);
- pos = PCI_DECODE_POS(address);
ovrd_list = &ovrd_pci_conf_reg_map;
} else if (spaceid == OVRD_SPACE_MSR) {
unsigned int cpuid;
@@ -398,6 +507,27 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
lock_flags);

/* at least one byte falls into the overridden range */
+
+ if (attrib == OVRD_TC) {
+ /* trace hw access */
+ if (spaceid == OVRD_SPACE_MEM) {
+ ret = mem_read(vaddr, len, &data);
+ trace_iohook_event(MM_RD, len, address, data);
+ } else if (spaceid == OVRD_SPACE_IO) {
+ ret = io_read(address, len, &data);
+ trace_iohook_event(IO_RD, len, address, data);
+ } else if (spaceid == OVRD_SPACE_PCICONF) {
+ ret = pci_rd(bus, address, len, &data);
+ trace_iohook_event(PCI_RD, len, address, data);
+ } else if (spaceid == OVRD_SPACE_MSR) {
+ ret = msr_rd(bus, address, &data);
+ trace_iohook_event(MSR_RD, len, address, data);
+ } else
+ trace_iohook_event(MSR_WR + 1, len, address,
+ data);
+ goto read_done;
+ }
+
data = 0;
ret = 0;
if (!(address >= faddress && address+len <= faddress+flength &&
@@ -409,33 +539,9 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
} else if (spaceid == OVRD_SPACE_IO) {
res = io_read(address, len, &data);
} else if (spaceid == OVRD_SPACE_PCICONF) {
- raw_spin_lock_irqsave(&pci_lock, flags);
- pcib = (struct pci_bus *)bus;
- res = pcib->ops->read(pcib, devfn, pos, len,
- (u32 *)&data);
- raw_spin_unlock_irqrestore(&pci_lock, flags);
+ res = pci_rd(bus, address, len, &data);
} else if (spaceid == OVRD_SPACE_MSR) {
- struct msr_regs_info *rv;
- unsigned int msr;
-
- rv = (struct msr_regs_info *)bus;
- msr = address & 0xffffffff;
- if (rv && rv->regs) { /* rdmsr_safe_regs() */
- u32 low, high;
-
- rv->err = rdmsr_safe_regs(rv->regs);
- low = rv->regs[0]; /* eax */
- high = rv->regs[2]; /* edx */
- data = low | ((u64)high << 32);
- res = rv->err;
- } else if (rv) { /* rv->regs == NULL */
- data = native_do_read_msr_safe(msr,
- &rv->err);
- res = rv->err;
- } else { /* rv == NULL */
- data = native_do_read_msr(msr);
- res = 0;
- }
+ res = msr_rd(bus, address, &data);

} else
goto out;
@@ -475,6 +581,7 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
}
}

+read_done:
switch (len) {
case 1:
*(u8 *)value = (u8)data;
@@ -490,11 +597,10 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
break;
default:
ret = -EINVAL;
- goto out;
+ break;
}

- raw_spin_lock_irqsave(&io_hook_lock,
- lock_flags);
+ goto out;
}

raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
@@ -507,8 +613,7 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
{
struct list_head *ovrd_list;
struct reg_ovrd *ovrd_reg;
- struct pci_bus *pcib;
- unsigned long lock_flags = 0, flags = 0;
+ unsigned long lock_flags = 0;
u64 faddress, vaddr = 0;
u64 bit_mask, val, attrib;
unsigned int devfn = 0, pos = 0;
@@ -567,42 +672,28 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)

ret = 0;

- if (!(address >= faddress && address+len <= faddress+flength &&
- bit_mask == (u64)((1<<flength*8) - 1))) {
- /* partially overridden. write to HW for real bits */
+ if (attrib == OVRD_TC) {
+ /* trace hw access */
if (spaceid == OVRD_SPACE_MEM) {
res = mem_write(vaddr, len, data);
+ trace_iohook_event(MM_WR, len, address, value);
} else if (spaceid == OVRD_SPACE_IO) {
res = io_write(address, len, data);
+ trace_iohook_event(IO_WR, len, address, value);
} else if (spaceid == OVRD_SPACE_PCICONF) {
raw_spin_unlock_irqrestore(&io_hook_lock,
lock_flags);
- raw_spin_lock_irqsave(&pci_lock, flags);
- pcib = (struct pci_bus *)bus;
- pcib->ops->write(pcib, devfn, pos, len,
- (u32)value);
- raw_spin_unlock_irqrestore(&pci_lock, flags);
+ pci_wr(bus, address, len, (u32)value);
raw_spin_lock_irqsave(&io_hook_lock,
lock_flags);
+ trace_iohook_event(PCI_WR, len, address, value);
} else if (spaceid == OVRD_SPACE_MSR) {
- struct msr_regs_info *rv;
- unsigned int msr, low, high;
-
- rv = (struct msr_regs_info *)bus;
- msr = address & 0xffffffff;
- low = value & 0xffffffff;
- high = value >> 32;
-
- if (rv && rv->regs) { /* wrmsr_safe_regs() */
- rv->err = wrmsr_safe_regs(rv->regs);
- } else if (rv) { /* rv->regs == NULL */
- rv->err = native_do_write_msr_safe(msr,
- low, high);
- } else { /* rv == NULL */
- native_do_write_msr(msr, low, high);
- }
+ msr_wr(bus, address, value);
+ trace_iohook_event(MSR_WR, len, address, value);
} else
- break;
+ trace_iohook_event(MSR_WR + 2, len, address,
+ value);
+ break;
}

for (i = 0; i < len; i++) {
@@ -634,10 +725,30 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
}
}

+ /* finished using ovrd_reg->, safe to unlock */
+ raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+
+ if (!(address >= faddress && address+len <= faddress+flength &&
+ bit_mask == (u64)((1<<flength*8) - 1))) {
+ /* partially overridden. write to HW for real bits */
+ if (spaceid == OVRD_SPACE_MEM)
+ res = mem_write(vaddr, len, data);
+ else if (spaceid == OVRD_SPACE_IO)
+ res = io_write(address, len, data);
+ else if (spaceid == OVRD_SPACE_PCICONF)
+ pci_wr(bus, address, len, (u32)value);
+ else if (spaceid == OVRD_SPACE_MSR)
+ msr_wr(bus, address, value);
+ else
+ break;
+ }
+
+ goto out;
}

raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);

+out:
return ret;
}
EXPORT_SYMBOL(write_ovrd_common);
@@ -808,6 +919,8 @@ char *hook_attrib(int attrib)
return "wc";
case OVRD_RC:
return "rc";
+ case OVRD_TC:
+ return "tc";
}

return "(null)";
@@ -1019,6 +1132,8 @@ mem_io:
attrib = OVRD_RC;
else if (cval == *(u16 *)"wc")
attrib = OVRD_WC;
+ else if (cval == *(u16 *)"tc")
+ attrib = OVRD_TC;
else
return -1;

@@ -1044,10 +1159,32 @@ mem_io:
}

static ssize_t
+hook_write_overrides(char *buf, size_t len, int spaceid)
+{
+ char *pstr, *entry;
+
+ /* first delete all overrides */
+ iohook_cleanup_ovrd(spaceid);
+
+ buf[len] = '\0';
+ pstr = buf;
+ while (pstr && *pstr) {
+ pstr = skip_spaces(pstr);
+ entry = strsep(&pstr, " \t\x0D\x0A");
+ pr_info("hook_write_overrides: %s\n", entry);
+ if (hook_parse_entry(entry, spaceid))
+ return -EINVAL;
+ }
+
+ return len;
+
+}
+
+static ssize_t
hook_write(struct file *file, const char __user *user_buf,
size_t user_len, loff_t *offset, int spaceid)
{
- char *buf, *pstr, *entry;
+ char *buf;
ssize_t rc;

if (*offset)
@@ -1062,22 +1199,7 @@ hook_write(struct file *file, const char __user *user_buf,
goto out_free;
}

- /* first delete all overridden registers */
- iohook_cleanup_ovrd(spaceid);
-
- buf[user_len] = '\0';
- pstr = buf;
- while (pstr && *pstr) {
- pstr = skip_spaces(pstr);
- entry = strsep(&pstr, " \t\x0D\x0A");
- pr_info("hook_write input: %s\n", entry);
- if (hook_parse_entry(entry, spaceid)) {
- rc = -EINVAL;
- goto out_free;
- }
- }
-
- rc = user_len;
+ rc = hook_write_overrides(buf, user_len, spaceid);

out_free:
vfree(buf);
@@ -1158,6 +1280,45 @@ static const struct file_operations hook_trigger_fops = {
.write = hook_trigger_write,
};

+/* These params define persistent overrides across reboot */
+static char *io = "", *mem = "", *pciconf = "", *msr = "";
+module_param(io, charp, 0);
+MODULE_PARM_DESC(io, "define overrides in io port space");
+module_param(mem, charp, 0);
+MODULE_PARM_DESC(mem, "define overrides in mem space");
+module_param(pciconf, charp, 0);
+MODULE_PARM_DESC(pciconf, "define overrides in pci config space");
+module_param(msr, charp, 0);
+MODULE_PARM_DESC(msr, "define overrides in MSR space");
+
+static void
+add_persistent_overrides(void)
+{
+ int persist = 0;
+
+ if (*io) {
+ hook_write_overrides(io, strlen(io), OVRD_SPACE_IO);
+ persist++;
+ }
+ if (*mem) {
+ hook_write_overrides(mem, strlen(mem), OVRD_SPACE_MEM);
+ persist++;
+ }
+ if (*pciconf) {
+ hook_write_overrides(pciconf, strlen(pciconf),
+ OVRD_SPACE_PCICONF);
+ persist++;
+ }
+ if (*msr) {
+ hook_write_overrides(msr, strlen(msr), OVRD_SPACE_MSR);
+ persist++;
+ }
+
+ if (persist)
+ iohook_start_ovrd();
+
+}
+
static int __init iohook_init(void)
{
struct dentry *root;
@@ -1198,6 +1359,8 @@ static int __init iohook_init(void)
if (hook_reg_dentry == NULL)
return -ENODEV;

+ add_persistent_overrides();
+
return 0;
}

diff --git a/include/trace/events/iohook.h b/include/trace/events/iohook.h
new file mode 100644
index 0000000..8655ba5
--- /dev/null
+++ b/include/trace/events/iohook.h
@@ -0,0 +1,58 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iohook
+
+#if !defined(_TRACE_IOHOOK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IOHOOK_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#define MM_RD 0
+#define MM_WR 1
+#define IO_RD 2
+#define IO_WR 3
+#define PCI_RD 4
+#define PCI_WR 5
+#define MSR_RD 6
+#define MSR_WR 7
+
+extern char *hook_func_names[];
+
+const char *iohook_trace_parse_addr(struct trace_seq*, int, u64);
+#define __parse_addr iohook_trace_parse_addr(p, __entry->type, \
+ __entry->address)
+
+/**
+ * @type: type of hw access (MM_RD, MM_WR, IO_RD, PCI_RD ...)
+ */
+TRACE_EVENT(iohook_event,
+ TP_PROTO(const int type,
+ const int len,
+ const u64 address,
+ const u64 value),
+
+ TP_ARGS(type, len, address, value),
+
+ TP_STRUCT__entry(
+ __field( int, type)
+ __field( int, len)
+ __field( u64, address)
+ __field( u64, value)
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->len = len;
+ __entry->address = address;
+ __entry->value = value;
+ ),
+
+ TP_printk("%s(%s, 0x%x) %s 0x%llx\n", __entry->type > MSR_WR ?
+ "unknown" : hook_func_names[__entry->type], __parse_addr,
+ __entry->len, __entry->type % 2 ? "==>" : "<==", /* event/odd */
+ __entry->value & ((1ull << (8 * __entry->len)) - 1))
+);
+#endif /* _TRACE_IOHOOK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--
1.7.5.4

2014-04-15 14:02:12

by Rui Wang

[permalink] [raw]
Subject: [PATCH v3 3/5] I/O Hook: sysfs interface to emulate h/w events

Add sysfs interface used to emulate h/w events. Here's how it works:

The sysfs interface can be used to add/delete Register Overrides with user-
defined values. The user can also specify which IRQ is to be triggered via
Inter-Processor Interrupt (IPI) while the h/w registers are being overridden.
When the irq handler is triggered by the IPI, it looks at the registers
specific to some h/w events. As long as the Register Overrides are set up
correctly, the irq handler will believe that the h/w is in a state
corresponding to a predefined interrupt, and thus process the event.

This can typically be used to generate ACPI events, PCI interrupts, PCIe
AER events, etc., and can thus be used to help test RAS features like
the hotplug of CPU/MEM/IOH on machines that are not capable of generating
the events.

See Documentation/PCI/iohook.txt for usage details.

Signed-off-by: Rui Wang <[email protected]>
---
Documentation/PCI/iohook.txt | 282 ++++++++++++++++++++++++
drivers/misc/iohook/iohook.c | 490 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 772 insertions(+), 0 deletions(-)
create mode 100644 Documentation/PCI/iohook.txt

diff --git a/Documentation/PCI/iohook.txt b/Documentation/PCI/iohook.txt
new file mode 100644
index 0000000..ae9b4ad
--- /dev/null
+++ b/Documentation/PCI/iohook.txt
@@ -0,0 +1,282 @@
+Emulating h/w events via iohook
+=================================
+
+1. Introduction
+2. How to use it
+3. Use cases
+
+1. Introduction
+---------------------
+I/O Hook is a mechanism to intercept i/o register access functions in the
+kernel. By overriding h/w register bits with user-defined bits in RAM called
+Register Overrides, it is possible to emulate h/w states without modifying
+the driver specific to that hardware.
+
+iohook is a driver that exports sysfs interface used to talk to the I/O Hook
+in the kernel in order to emulate h/w events. Here's how it works:
+
+The sysfs interface can be used to add/delete Register Overrides with user-
+defined values. The user can also specify which IRQ to be triggered via
+Inter-Processor Interrupt (IPI) while the h/w registers are being over-
+ridden. When the irq handler is triggered by the IPI it looks for the
+registers specific to some h/w events. As long as the Register Overrides are
+setup correctly, the irq handler will believe that the h/w is in a state
+corresponding to a predefined interrupt, thus process the event.
+
+A Register Override can be defined in whatever bit-width, identified by its
+address, bitmask, initial value and attributes like read-only, read-write,
+write-clear, etc., similar to how a hardware register behaves when accessed.
+
+A Register Override may not use every bit in a byte. Its bitmask identifies
+which bits are used (overridden). The unused bits are accessed on the h/w
+and combined with the overridden bits to form the final result. The reason
+to support this combination is that many h/w events are controlled by only
+a few bits. For example the ACPI GPEx_STS and GPEx_EN are encoded such that
+each bit represents a different General Purpose Event. The user is supposed
+to fully understand the side-effect, if any, of reading adjacent bits when
+he or she adds a Register Override not in the entirety of a byte, a word, a
+dword, or a qword.
+
+iohook can be typically used to generate ACPI events, PCI interrupts, PCIe
+AER injection etc., and can thus be used to help test RAS features like
+the hotplug of CPU/MEM/IOH on machines that are not capable of generating
+these events.
+
+2. How to use it
+-------------------
+
+2.1 Kernel compilation and sysfs interface
+
+First compile the kernel with CONFIG_IO_HOOK. After the new kernel is loaded
+a directory subtree is created under /sys/kernel/debug/iohook
+
+bash# cd /sys/kernel/debug/iohook
+bash# ls
+io irq mem pciconf trigger
+
+Each file is used to manage a type of resource.
+'io' is used to add/show Register Overrides in IO port space.
+'mem' is used to add/show Register Overrides in memory space.
+'pciconf' is used to add/show Register Overrides in pci config space.
+'irq' is used to set the desired IRQ to be triggered via IPI.
+'trigger' is used to turn on/off the I/O Hook.
+
+2.2 Add Register Overrides
+
+A Register Override can be specified on the command line with the following
+syntax (all numbers are in hex without space between each element)
+
+for a Register Override in IO port or memory space, it's specified as:
+
+ address-length[value/mask]attribute
+
+for a Register Override in PCI config space, it's specified as:
+
+ domain|bus:dev.func+offset-length[value/mask]attribute
+
+where
+ address - the 64bit address of the h/w register to be overridden
+
+ length - the number of bytes affected. Affected here means that
+ at least one bit in that byte is overridden. For 'length'
+ less than 8, the overridden bits are determined by the
+ corresponding bits set in 'mask'. Other bits are unaffected
+ and accessed on the h/w. For 'length' >= 8 then 'mask' is
+ ignored and the entire range of bytes are overridden to be
+ a single value specified by the first byte of 'value'. This
+ can be used, for example, to set the entire PCI Config
+ space of a device to 0xff.
+
+ value - the user-defined value to replace the content of the
+ corresponding h/w register. for 'length' < 8, only the bits
+ masked by 'mask' are used.
+
+ mask - is the bit-mask specifying the bits to be overridden when
+ 'length' < 8.
+
+ attribute - used to specify the attribute of the overridden bits.
+ It can be ro, rw, wc, rc to mean read-only, read-write,
+ write-clear, and read-clear respectively.
+
+ domain - pci domain number
+ bus - pci bus number
+ dev - pci device number
+ func - pci function number
+ offset - used to specify the offset of the affected bytes in the
+ PCI config space.
+
+Multiple registers can be specified on one line with each separated by at
+least one space. For example, to override two registers in IO space at port
+0x420 and port 0x428, with the former in write-clear mode and the latter in
+read-only mode:
+
+bash# cd /sys/kernel/debug/iohook
+bash# echo "420-1[04/04]wc 428-1[04/04]ro" > io
+The syntax is "address-length[value/mask]attribute".
+Since only one bit is overridden (mask is 0x04), only one byte is affected,
+so 'length' is 1.
+
+As another example, to add two Register Overrides in the PCI config space of
+device 00:05.0 at offsets 0x130 and 0x134 respectively:
+
+bash# echo "0000|00:05.0+130-1[01/01]wc 0000|00:05.0+134-2[0500/ffff]ro">pciconf
+The syntax is "domain|bus:dev.func+offset-length[value/mask]attribute"
+The first register overrides only bit0 and the second register overrides the
+first 2 bytes (mask == 0xffff), with an initial value of 0x0500.
+
+Register Overrides are disabled when added. They can be enabled by using the
+'trigger' file. See below.
+
+2.3 Add IRQ and enable the Register Overrides
+
+To specify an IRQ to be triggered via IPI, just echo the IRQ number in decimal
+to the 'irq' file. For example:
+
+bash# cd /sys/kernel/debug/iohook
+bash# echo 9 > irq
+This specifies that IRQ9 be triggered after the Register Overrides are enabled.
+
+To enable the Register Overrides in the kernel:
+
+bash# echo 1 > trigger
+
+This immediately enables all the Register Overrides and if an IRQ number was
+specified, generate the IPI.
+
+To disable the Register Overrides in the kernel:
+
+bash# echo 0 > trigger
+This immediately disables all Register Overrides. The kernel starts to see
+real h/w registers again. This does not delete the Register Overrides. They
+can be re-enabled again by echo 1 > trigger.
+
+3. Use cases
+-----------------
+
+3.1 Generate ACPI Events
+
+A typical use case is to generate ACPI events. Suppose we want to test IOH
+hotplug on a machine whose BIOS doesn't support it. We can override its DSDT
+and add a GPE to notify the OS of the hot-add/removal of the IOH device. We can
+then use iohook to trigger the imaginary GPE and the OS will have to process
+the hotplug event. (For detailed instructions on how to override DSDT see
+Documentation/acpi/initrd_table_override.txt.) The following is an example:
+
+We first extract the DSDT
+bash # cat /sys/firmware/acpi/tables/DSDT > DSDT
+bash # iasl -d DSDT
+Now we have disassembled the DSDT into DSDT.dsl. We vi DSDT.dsl and notice
+that there's an IOH device named \_SB.IOH1. Since GPEs are named _Lxx with xx
+being the GPE numbers, we notice that there's no _L02 in DSDT.dsl, so we can
+use this spare GPE to notify the OS of the hotplug of \_SB.IOH1. We add a new
+method in DSDT.dsl:
+
+ Method (_L02, 0, NotSerialized) // _Lxx: Level-Triggered GPE
+ {
+ Notify (\_SB.IOH1, 0x0) // 0x0: hot-add event
+ }
+
+ACPI uses a single interrupt (SCI) to dispatch all GPEs. The SCI IRQ number
+can be found from:
+
+bash # grep acpi /proc/interrupts | awk '{ print $1; }'
+9:
+
+which means SCI is IRQ9. What we need to do is to generate IRQ9 via IPI and
+cause the SCI interrupt handler to call _L02. How can we do that? Each _Lxx
+has a status bit in GPEx_STS to reflect if it is asserted and a controlling
+bit in GPEx_EN to reflect if it is enabled. The SCI interrupt handler reads
+GPEx_STS and GPEx_EN to decide whether to call a _Lxx. A _Lxx is called if
+it is both enabled and asserted. We can use Register Overrides to override
+the bits controlling _L02 in GPEx_STS and GPEx_EN so that the SCI handler
+will believe that _L02 is asserted and enabled, thus it will call the ACPI
+method that we added.
+
+_L02's controlling bits are in GPE0_STS and GPE0_EN, whose addresses can be
+found from FACP as follows.
+
+bash # cat /sys/firmware/acpi/tables/FACP > FACP
+bash # iasl -d FACP
+bash # grep GPE FACP.dsl
+[050h 080 4] GPE0 Block Address : 00000420
+[054h 084 4] GPE1 Block Address : 00000000
+[05Ch 092 1] GPE0 Block Length : 10
+
+So according to FACP, GPE0 block is at port 0x420; GPE0 block length is 0x10.
+GPE0_STS/GPE0_EN each occupies half the block length, with GPE0_STS at 0x420
+and GPE0_EN at 0x428. _L02 is controlled by bit2 of each of them. We need to
+override bit2 of both IO port 0x420 and IO port 0x428.
+
+Before adding the Register Override we need to replace the DSDT provided by
+BIOS with our modified DSDT.dsl, by following initrd_table_override.txt.
+Once the new DSDT is injected into initrd we reboot the system and add the
+Register Override as follows.
+
+bash # cd /sys/kernel/debug/iohook/
+bash # echo "420-1[4/4]wc 428-1[4/4]ro" > io
+bash # echo 9 > irq
+bash # echo 1 > trigger
+
+The last command immediately triggers the _L02 method that we provided and
+Linux sees a hot-add ACPI event for the IOH device.
+
+
+3.2 Generate PCIe Native Hotplug
+
+The pciehp driver allocates an irq to handle each native PCIe hotplug slot.
+The driver prints the status of each hotplug slot when debugging is enabled
+so we can see which irq it's using.
+
+bash # modprobe pciehp pciehp_debug=1
+
+dmesg shows:
+
+pciehp 0000:00:1c.0:pcie04: Hotplug Controller:
+pciehp 0000:00:1c.0:pcie04: Seg/Bus/Dev/Func/IRQ : 0000:00:1c.0 IRQ 70
+
+So we pick this hotplug slot at 00:1c.0 which uses IRQ70. Its PCIe Slot
+Status Register is at offset 0x5a of its pci config space. We can easily
+inject a Hot-Add (Presence Detect) event into the system by adding a
+Register Override to override the register at offset 0x5a in the PCI config
+space of 00:1c.0
+
+bash # cd /sys/kernel/debug/iohook/
+bash # echo "00|00:1c.0+5a-1[48/ff]wc" > pciconf
+bash # echo 70 > irq
+bash # echo 1 > trigger
+
+dmesg shows:
+
+pciehp 0000:00:1c.0:pcie04: Card present on Slot(1)
+pciehp 0000:00:1c.0:pcie04: Device 0000:0f:00.0 already exists at 0000:0f:00,
+cannot hot-add
+pciehp 0000:00:1c.0:pcie04: Cannot add device at 0000:0f:00
+
+We can also inject an Attention Button Pressed event:
+
+bash # echo "00|00:1c.0+5a-1[41/ff]wc" > pciconf
+bash # echo 70 > irq
+bash # echo 1 > trigger
+
+dmesg shows:
+
+pciehp 0000:00:1c.0:pcie04: Button pressed on Slot(1)
+pciehp 0000:00:1c.0:pcie04: PCI slot #1 - powering off due to button press
+
+3.3 PCIe AER error injection
+
+The aerdrv driver allocates a few irqs to handle AER. Each of them handles a
+PCIe root port device. In an example aerdrv uses irq66 to handle AER on root
+port 00:05.0. As seen in lspci, the AER capability is at offset 0x100 of its
+pci config space. So the Root Error Status Register is at offset 0x130, and
+the Error Source Identification Register is at offset 0x134. We can inject a
+Correctable Error with a source id to identify its child at 05:00.0.
+
+bash # cd /sys/kernel/debug/iohook/
+bash # echo "00|00:05.0+130-1[01/01]wc 00|00:05.0+134-2[0500/ffff]ro" > pciconf
+bash # echo 66 > irq
+bash # echo 1 > trigger
+
+dmesg shows:
+pcieport 0000:00:05.0: AER: Corrected error received: id=0500
+
diff --git a/drivers/misc/iohook/iohook.c b/drivers/misc/iohook/iohook.c
index d8c41d0..5584eb7 100644
--- a/drivers/misc/iohook/iohook.c
+++ b/drivers/misc/iohook/iohook.c
@@ -1,10 +1,34 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/of.h>
+#include <linux/seq_file.h>
+#include <linux/smp.h>
+#include <linux/ftrace.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/irq.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
#include <linux/reg_ovrd.h>
#include <linux/pci.h>
#include "iohook.h"

+MODULE_LICENSE("GPL");
+
+static struct dentry *hook_irq_dentry;
+static struct dentry *hook_reg_dentry;
+
static DEFINE_RAW_SPINLOCK(io_hook_lock);
static DEFINE_RAW_SPINLOCK(engine_lock);

@@ -541,3 +565,469 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
return ret;
}
EXPORT_SYMBOL(write_ovrd_common);
+
+/*
+ * Ask the irq chip behind @irqnum to re-raise the interrupt through its
+ * ->irq_retrigger() callback, so that the registered handler runs while the
+ * Register Overrides are in effect.
+ *
+ * Nothing is done (an error is logged) when @irqnum has no descriptor or the
+ * chip provides no retrigger callback.
+ */
+static void trigger_irq_by_ipi(unsigned long irqnum)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	struct irq_chip *chip;
+	unsigned long flags;
+
+	/* the number comes from user space and may not map to a descriptor */
+	desc = irq_to_desc(irqnum);
+	if (!desc) {
+		pr_err("no irq descriptor for irq %lu\n", irqnum);
+		return;
+	}
+
+	data = irq_desc_get_irq_data(desc);
+	chip = irq_data_get_irq_chip(data);
+
+	/*
+	 * desc->lock is also taken by the interrupt flow handlers, so it
+	 * must be acquired with local interrupts disabled to avoid a
+	 * self-deadlock if this irq fires on the current CPU.
+	 */
+	raw_spin_lock_irqsave(&desc->lock, flags);
+
+	if (chip && chip->irq_retrigger)
+		chip->irq_retrigger(data);
+	else
+		pr_err("platform doesn't support irq_retrigger?\n");
+
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+/*
+ * IRQ number to retrigger when the hook is switched on through the 'trigger'
+ * file; written through the 'irq' file. A value of 0 means no IRQ is
+ * triggered. Currently only one irq can be triggered.
+ */
+int g_irq_num;
+
+/*
+ * read() handler of the 'trigger' file: reports "1\n" while the hook is
+ * switched on and "0\n" otherwise.
+ */
+ssize_t hook_trigger_read(struct file *file, char __user *ubuf, size_t cnt,
+		loff_t *ppos)
+{
+	char text[64]; /* big enough to hold a number */
+	int len = sprintf(text, "%u\n", iohook_get_status() ? 1 : 0);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
+}
+
+/*
+ * write() handler of the 'trigger' file.
+ *
+ * Accepts a decimal number: 1 starts the Register Overrides and, if an IRQ
+ * number was configured (g_irq_num != 0), retriggers that IRQ; 0 stops the
+ * overrides.
+ *
+ * Returns @count on success, the kstrtoul_from_user() error if the input
+ * cannot be copied or parsed, or -EINVAL for any value other than 0 or 1.
+ */
+static ssize_t hook_trigger_write(struct file *file,
+		const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	unsigned long on;
+	int ret;
+
+	/* copies into a bounded stack buffer, so no allocation is needed */
+	ret = kstrtoul_from_user(user_buf, count, 10, &on);
+	if (ret)
+		return ret;
+
+	switch (on) {
+	case 1:
+		iohook_start_ovrd();
+		if (g_irq_num)
+			trigger_irq_by_ipi(g_irq_num);
+		break;
+	case 0:
+		iohook_stop_ovrd();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+/* read() handler of the 'irq' file: prints g_irq_num in decimal. */
+ssize_t hook_irq_read(struct file *file, char __user *ubuf, size_t cnt,
+		loff_t *ppos)
+{
+	char text[64]; /* big enough to hold a number */
+	int len = sprintf(text, "%u\n", g_irq_num);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
+}
+
+/*
+ * write() handler of the 'irq' file: stores the decimal IRQ number that the
+ * 'trigger' file will retrigger. Writing 0 disables IRQ triggering.
+ *
+ * Returns @count on success, the kstrtoul_from_user() error if the input
+ * cannot be copied or parsed, or -EINVAL if the number is not below NR_IRQS.
+ */
+static ssize_t hook_irq_write(struct file *file, const char __user *user_buf,
+		size_t count, loff_t *ppos)
+{
+	unsigned long irqnum;
+	int ret;
+
+	ret = kstrtoul_from_user(user_buf, count, 10, &irqnum);
+	if (ret) {
+		pr_err("bogus irq number?\n");
+		return ret;
+	}
+
+	/* valid irq numbers are 0 .. NR_IRQS - 1 */
+	if (irqnum >= NR_IRQS) {
+		pr_err("irq num too big: %lu\n", irqnum);
+		return -EINVAL;
+	}
+
+	g_irq_num = irqnum;
+	return count;
+}
+
+/*
+ * Per-open private data of the io/mem/pciconf seq_files; allocated by
+ * __seq_open_private() in the open handlers and read by the seq callbacks.
+ */
+struct hook_iter {
+ int spaceid; /* io/mem/pciconf */
+};
+
+/* seq_file ->stop(): nothing to release, start/next hold no resources. */
+static void
+hook_seq_stop(struct seq_file *s, void *it)
+{
+}
+
+/*
+ * seq_file ->next(): look up the Register Override at index *offset in the
+ * space selected at open time. When one exists, *offset is advanced by one
+ * and the entry is returned; otherwise NULL is returned and *offset is left
+ * untouched. @it (the previous entry) is not used.
+ */
+static void *
+hook_seq_next(struct seq_file *s, void *it, loff_t *offset)
+{
+	struct hook_iter *iter = s->private;
+	struct reg_ovrd *found;
+	int index = *offset;
+
+	found = iohook_query_ovrd(iter->spaceid, index);
+	if (!found)
+		return NULL;
+
+	++*offset;
+	return found;
+}
+
+/*
+ * seq_file ->start(): delegates to hook_seq_next(), so the entry at index
+ * *offset is returned and *offset is advanced past it when one exists.
+ */
+static void *
+hook_seq_start(struct seq_file *s, loff_t *offset)
+{
+ return hook_seq_next(s, NULL, offset);
+}
+
+/*
+ * Map an OVRD_* attribute to the two-letter name used in the debugfs files
+ * ("ro", "rw", "wc" or "rc"). Unknown attributes yield "(null)".
+ */
+char *hook_attrib(int attrib)
+{
+	if (attrib == OVRD_RO)
+		return "ro";
+	if (attrib == OVRD_RW)
+		return "rw";
+	if (attrib == OVRD_WC)
+		return "wc";
+	if (attrib == OVRD_RC)
+		return "rc";
+
+	return "(null)";
+}
+
+/*
+ * seq_file ->show(): print one Register Override (@it) on its own line.
+ *
+ * IO and memory entries print address, length, value, mask and attribute.
+ * PCI config entries additionally decode the encoded address into
+ * domain|bus:dev.func and the config-space offset. An unknown space id
+ * prints an error line. Always returns 0.
+ */
+static int
+hook_seq_show(struct seq_file *s, void *it)
+{
+	struct hook_iter *iter = s->private;
+	struct reg_ovrd *regmap = it;
+
+	switch (iter->spaceid) {
+	case OVRD_SPACE_IO:
+	case OVRD_SPACE_MEM:
+		seq_printf(s,
+			"addr: 0x%llx length: 0x%x value: 0x%llx mask: 0x%llx attrib: %s\n",
+			regmap->address, regmap->length, regmap->val,
+			regmap->bit_mask, hook_attrib(regmap->attrib));
+		break;
+	case OVRD_SPACE_PCICONF:
+		seq_printf(s,
+			"pciconf: 0x%04x|%02x:%02x.%02x offset: 0x%x length: 0x%x value: 0x%llx mask: 0x%llx attrib: %s\n",
+			PCI_DECODE_DOMAIN(regmap->address),
+			PCI_DECODE_BUSN(regmap->address),
+			PCI_SLOT(PCI_DECODE_DEVFN(regmap->address)),
+			PCI_FUNC(PCI_DECODE_DEVFN(regmap->address)),
+			PCI_DECODE_POS(regmap->address), regmap->length,
+			regmap->val,
+			regmap->bit_mask, hook_attrib(regmap->attrib));
+		break;
+	default:
+		seq_puts(s, "error: unknown spaceid\n");
+		break;
+	}
+	return 0;
+}
+
+/* seq_file iterator shared by the io, mem and pciconf debugfs files. */
+static const struct seq_operations hook_seq_ops = {
+ .start = hook_seq_start,
+ .stop = hook_seq_stop,
+ .next = hook_seq_next,
+ .show = hook_seq_show,
+};
+
+/*
+ * Common open path of the io/mem/pciconf files: set up the seq_file with a
+ * private struct hook_iter and record which address space (@spaceid) the
+ * file iterates over.
+ *
+ * Returns 0 on success or -ENOMEM if the seq_file could not be set up.
+ */
+static int hook_open_space(struct file *file, int spaceid)
+{
+	struct hook_iter *iter;
+
+	iter = __seq_open_private(file, &hook_seq_ops, sizeof(*iter));
+	if (!iter)
+		return -ENOMEM;
+
+	iter->spaceid = spaceid;
+	return 0;
+}
+
+/* open() handler of the 'io' file */
+static int
+hook_io_open(struct inode *inode, struct file *file)
+{
+	return hook_open_space(file, OVRD_SPACE_IO);
+}
+
+/* open() handler of the 'mem' file */
+static int
+hook_mem_open(struct inode *inode, struct file *file)
+{
+	return hook_open_space(file, OVRD_SPACE_MEM);
+}
+
+/* open() handler of the 'pciconf' file */
+static int
+hook_pciconf_open(struct inode *inode, struct file *file)
+{
+	return hook_open_space(file, OVRD_SPACE_PCICONF);
+}
+
+/*
+ * Split the next field, terminated by one of the characters in @delim, off
+ * *@entry and parse it as a hexadecimal number into *@out. *@entry is
+ * advanced past the delimiter.
+ *
+ * Returns 0 on success, -EINVAL if the field is not a valid hex number or
+ * the delimiter is missing (nothing follows the field).
+ */
+static int hook_next_hex(char **entry, const char *delim, u64 *out)
+{
+	char *field = strsep(entry, delim);
+
+	if (!field || kstrtoull(field, 16, out) || !*entry)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * IO & MEM entries are formatted as follows (all values are in hex):
+ *
+ * address-length[value/mask]attrib
+ *
+ * For example: 420-1[20/20]rw
+ * It means io port 0x420, 1 byte, value 0x20, mask 0x20, read/write
+ * PCI config entries are formatted as (all values are in hex):
+ *
+ * domain|bus:dev.func+offset-length[value/mask]attrib
+ *
+ * For example: 0000|00:1e.0+10-4[fe930000/ffffffff]rw
+ * It means pci domain 0, bus 0, dev 1e, func 0, offset 10, 4 bytes
+ * with a overridden value of 0xfe930000, mask 0xffffffff, read/write
+ *
+ * @entry is a single NUL-terminated, whitespace-free token and is modified
+ * in place. On success the parsed override is handed to iohook_add_ovrd()
+ * and 0 is returned; a malformed or out-of-range entry yields -EINVAL and
+ * nothing is added.
+ */
+static int
+hook_parse_entry(char *entry, int spaceid)
+{
+	u64 address, length, value, mask;
+	unsigned long domain = 0, bus = 0, dev = 0, func = 0;
+	int attrib;
+
+	if (spaceid == OVRD_SPACE_PCICONF) {
+		u64 dom, b, d, f;
+
+		if (hook_next_hex(&entry, "|", &dom) ||
+		    hook_next_hex(&entry, ":", &b) ||
+		    hook_next_hex(&entry, ".", &d) ||
+		    hook_next_hex(&entry, "+", &f))
+			return -EINVAL;
+
+		/* PCI: 8-bit bus, 5-bit device, 3-bit function number */
+		if (dom > ULONG_MAX || b > 0xff || d > 0x1f || f > 0x7)
+			return -EINVAL;
+
+		domain = dom;
+		bus = b;
+		dev = d;
+		func = f;
+	}
+
+	if (hook_next_hex(&entry, "-", &address))
+		return -EINVAL;
+	pr_info("parse_hook_entry() address=0x%llx\n", address);
+
+	if (hook_next_hex(&entry, "[", &length))
+		return -EINVAL;
+	pr_info("parse_hook_entry() length=0x%llx\n", length);
+
+	/* iohook_add_ovrd() takes a u32 length; don't truncate silently */
+	if (!length || (u32)length != length)
+		return -EINVAL;
+
+	if (hook_next_hex(&entry, "/", &value))
+		return -EINVAL;
+	pr_info("parse_hook_entry() value=0x%llx\n", value);
+
+	if (hook_next_hex(&entry, "]", &mask))
+		return -EINVAL;
+	pr_info("parse_hook_entry() mask=0x%llx\n", mask);
+
+	/*
+	 * What remains must be exactly one attribute name. A string compare
+	 * avoids reading two bytes through a u16 pointer, which could run
+	 * past the terminator and accepted trailing junk such as "rox".
+	 */
+	pr_info("parse_hook_entry() attrib:%s\n", entry);
+	if (!strcmp(entry, "ro"))
+		attrib = OVRD_RO;
+	else if (!strcmp(entry, "rw"))
+		attrib = OVRD_RW;
+	else if (!strcmp(entry, "rc"))
+		attrib = OVRD_RC;
+	else if (!strcmp(entry, "wc"))
+		attrib = OVRD_WC;
+	else
+		return -EINVAL;
+
+	if (spaceid == OVRD_SPACE_PCICONF) {
+		address = PCI_ENCODE_ADDR(domain, bus, PCI_DEVFN(dev, func),
+					  address);
+	}
+
+	iohook_add_ovrd(spaceid, address, value, mask, length, attrib);
+
+	return 0;
+}
+
+/*
+ * Common write path of the io/mem/pciconf files.
+ *
+ * Every write replaces the Register Overrides of @spaceid: the existing
+ * ones are deleted first, then each whitespace-separated entry in the user
+ * buffer is parsed and added. Leading and trailing whitespace (such as the
+ * newline appended by echo) is ignored, so an empty or all-blank write just
+ * clears the space.
+ *
+ * Returns @user_len on success, -EINVAL for a non-zero file offset or a
+ * malformed entry, -ENOMEM or -EFAULT if the input cannot be copied.
+ * Note that on a malformed entry the old overrides are already gone and
+ * the entries preceding the bad one have been added.
+ */
+static ssize_t
+hook_write(struct file *file, const char __user *user_buf,
+		size_t user_len, loff_t *offset, int spaceid)
+{
+	char *buf, *pstr, *entry;
+	ssize_t rc;
+
+	if (*offset)
+		return -EINVAL;
+
+	buf = vzalloc(user_len + 1);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	if (strncpy_from_user(buf, user_buf, user_len) < 0) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+	buf[user_len] = '\0';
+
+	/* first delete all overridden registers */
+	iohook_cleanup_ovrd(spaceid);
+
+	pstr = buf;
+	while (pstr) {
+		pstr = skip_spaces(pstr);
+		/* only whitespace was left: done, not an empty entry */
+		if (!*pstr)
+			break;
+
+		entry = strsep(&pstr, " \t\x0D\x0A");
+		pr_info("hook_write input: %s\n", entry);
+		if (hook_parse_entry(entry, spaceid)) {
+			rc = -EINVAL;
+			goto out_free;
+		}
+	}
+
+	rc = user_len;
+
+out_free:
+	vfree(buf);
+	return rc;
+}
+
+/* write() handler of the 'io' file: forwards to hook_write() for IO space. */
+static ssize_t
+hook_io_write(struct file *file, const char __user *user_buf,
+ size_t user_len, loff_t *offset)
+{
+
+ return hook_write(file, user_buf, user_len, offset, OVRD_SPACE_IO);
+}
+
+/* write() handler of the 'mem' file: forwards to hook_write() for memory space. */
+static ssize_t
+hook_mem_write(struct file *file, const char __user *user_buf,
+ size_t user_len, loff_t *offset)
+{
+
+ return hook_write(file, user_buf, user_len, offset, OVRD_SPACE_MEM);
+}
+
+/* write() handler of the 'pciconf' file: forwards to hook_write() for PCI config space. */
+static ssize_t
+hook_pciconf_write(struct file *file, const char __user *user_buf,
+ size_t user_len, loff_t *offset)
+{
+
+ return hook_write(file, user_buf, user_len, offset, OVRD_SPACE_PCICONF);
+}
+
+/* 'io' file: seq_file listing on read, hook_io_write() on write. */
+static const struct file_operations hook_io_fops = {
+ .open = hook_io_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+ .write = hook_io_write,
+};
+
+/* 'mem' file: seq_file listing on read, hook_mem_write() on write. */
+static const struct file_operations hook_mem_fops = {
+ .open = hook_mem_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+ .write = hook_mem_write,
+};
+
+/* 'pciconf' file: seq_file listing on read, hook_pciconf_write() on write. */
+static const struct file_operations hook_pciconf_fops = {
+ .open = hook_pciconf_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+ .write = hook_pciconf_write,
+};
+
+/* 'irq' file: plain read/write of the IRQ number, no seq_file involved. */
+static const struct file_operations hook_irq_fops = {
+ .read = hook_irq_read,
+ .write = hook_irq_write,
+};
+
+/* 'trigger' file: plain read/write of the hook on/off state. */
+static const struct file_operations hook_trigger_fops = {
+ .read = hook_trigger_read,
+ .write = hook_trigger_write,
+};
+
+/*
+ * Create the debugfs directory "iohook" with its control files: irq, io,
+ * mem, pciconf and trigger.
+ *
+ * All files have read handlers as well as write handlers, so they are
+ * created readable and writable by the owner. If any of them cannot be
+ * created, the partially built directory is removed again.
+ *
+ * Returns 0 on success, -ENODEV on any failure.
+ */
+static int __init iohook_init(void)
+{
+	const umode_t mode = S_IRUSR | S_IWUSR;
+	struct dentry *root;
+
+	root = debugfs_create_dir("iohook", NULL);
+	if (IS_ERR_OR_NULL(root))
+		return -ENODEV;
+
+	hook_irq_dentry = debugfs_create_file("irq", mode,
+					root, NULL, &hook_irq_fops);
+	if (IS_ERR_OR_NULL(hook_irq_dentry))
+		goto fail;
+
+	hook_reg_dentry = debugfs_create_file("io", mode,
+					root, NULL, &hook_io_fops);
+	if (IS_ERR_OR_NULL(hook_reg_dentry))
+		goto fail;
+
+	hook_reg_dentry = debugfs_create_file("mem", mode,
+					root, NULL, &hook_mem_fops);
+	if (IS_ERR_OR_NULL(hook_reg_dentry))
+		goto fail;
+
+	hook_reg_dentry = debugfs_create_file("pciconf", mode,
+					root, NULL, &hook_pciconf_fops);
+	if (IS_ERR_OR_NULL(hook_reg_dentry))
+		goto fail;
+
+	hook_reg_dentry = debugfs_create_file("trigger", mode,
+					root, NULL, &hook_trigger_fops);
+	if (IS_ERR_OR_NULL(hook_reg_dentry))
+		goto fail;
+
+	return 0;
+
+fail:
+	/* don't leave a half-populated directory behind */
+	debugfs_remove_recursive(root);
+	return -ENODEV;
+}
+
+device_initcall(iohook_init);
--
1.7.5.4

2014-04-15 13:59:22

by Rui Wang

[permalink] [raw]
Subject: [PATCH v3 2/5] I/O Hook: Help functions to manage the hook

Add the following kernel helper functions used to add/delete/query Register
Overrides, and to start/stop the I/O Hook:
iohook_add_ovrd
iohook_query_ovrd
iohook_cleanup_ovrd
iohook_start_ovrd
iohook_stop_ovrd
iohook_get_status

Signed-off-by: Rui Wang <[email protected]>
---
drivers/misc/iohook/iohook.c | 151 ++++++++++++++++++++++++++++++++++++++++++
1 files changed, 151 insertions(+), 0 deletions(-)

diff --git a/drivers/misc/iohook/iohook.c b/drivers/misc/iohook/iohook.c
index e6a626f..d8c41d0 100644
--- a/drivers/misc/iohook/iohook.c
+++ b/drivers/misc/iohook/iohook.c
@@ -6,6 +6,7 @@
#include "iohook.h"

static DEFINE_RAW_SPINLOCK(io_hook_lock);
+static DEFINE_RAW_SPINLOCK(engine_lock);

LIST_HEAD(ovrd_io_reg_map);
LIST_HEAD(ovrd_mem_reg_map);
@@ -14,6 +15,150 @@ LIST_HEAD(ovrd_pci_conf_reg_map);
struct static_key ovrdhw_enabled = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL(ovrdhw_enabled);

+/* set to 1 by iohook_start_ovrd() and back to 0 by iohook_stop_ovrd() */
+int g_ovrd_on;
+/*
+ * Look up a Register Override by position.
+ *
+ * @spaceid selects the list (memory, IO, anything else means PCI config)
+ * and @idx is the zero-based position in it; an @idx of zero or less
+ * selects the first entry. Returns the entry, or NULL if the list has no
+ * entry at that position. The list is walked under io_hook_lock, but the
+ * lock is released before the pointer is returned.
+ */
+struct reg_ovrd *iohook_query_ovrd(int spaceid, int idx)
+{
+	struct reg_ovrd *cur, *found = NULL;
+	struct list_head *head;
+	unsigned long flags = 0;
+	int pos = 0;
+
+	switch (spaceid) {
+	case OVRD_SPACE_MEM:
+		head = &ovrd_mem_reg_map;
+		break;
+	case OVRD_SPACE_IO:
+		head = &ovrd_io_reg_map;
+		break;
+	default:
+		head = &ovrd_pci_conf_reg_map;
+		break;
+	}
+
+	raw_spin_lock_irqsave(&io_hook_lock, flags);
+	list_for_each_entry(cur, head, node) {
+		if (pos >= idx) {
+			found = cur;
+			break;
+		}
+		pos++;
+	}
+	raw_spin_unlock_irqrestore(&io_hook_lock, flags);
+
+	pr_info("iohook_query_ovrd() returns %p, idx=%d\n",
+		found, idx);
+	return found;
+}
+
+/*
+ * delete all Register Overrides from one space (IO/mem/pciconf)
+ *
+ * The entries are detached from the live list under io_hook_lock, the same
+ * lock iohook_query_ovrd() and iohook_add_ovrd() take to walk it, and are
+ * freed after the lock has been dropped.
+ *
+ * NOTE(review): pointers previously handed out by iohook_query_ovrd() are
+ * not protected by this lock and become stale once this returns.
+ */
+void iohook_cleanup_ovrd(int spaceid)
+{
+	struct reg_ovrd *ovrdreg, *next;
+	struct list_head *ovrd_list;
+	unsigned long lock_flags;
+	LIST_HEAD(doomed);
+
+	if (spaceid == OVRD_SPACE_MEM)
+		ovrd_list = &ovrd_mem_reg_map;
+	else if (spaceid == OVRD_SPACE_IO)
+		ovrd_list = &ovrd_io_reg_map;
+	else
+		ovrd_list = &ovrd_pci_conf_reg_map;
+
+	/* unhook everything in one step so the lock hold time stays short */
+	raw_spin_lock_irqsave(&io_hook_lock, lock_flags);
+	list_splice_init(ovrd_list, &doomed);
+	raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+
+	list_for_each_entry_safe(ovrdreg, next, &doomed, node) {
+		list_del(&ovrdreg->node);
+		kfree(ovrdreg);
+	}
+}
+
+/*
+ * Add a Register Override to the list of @spaceid (memory, IO, anything
+ * else means PCI config).
+ *
+ * for pci config space, address is encoded per PCI_ENCODE_ADDR()
+ *
+ * If an override with the same address, length and attribute already
+ * exists, @mask and @value are OR-ed into it. If @address falls inside a
+ * different existing override the request is dropped with a log message.
+ * Otherwise a new entry is appended.
+ *
+ * The new entry is allocated before io_hook_lock is taken, so that the
+ * lookup and the insertion happen under a single hold of the lock and two
+ * concurrent callers cannot both insert the same register.
+ *
+ * NOTE(review): only the start address of the new override is tested
+ * against existing ranges; a new range that merely extends over an
+ * existing override is not detected as a conflict.
+ */
+void iohook_add_ovrd(int spaceid, u64 address, u64 value, u64 mask,
+			u32 length, u8 attrib)
+{
+	struct list_head *reg_ovrd;
+	struct reg_ovrd *ovrdreg, *newreg;
+	unsigned long lock_flags = 0;
+	bool alloc_failed = false;
+
+	if (spaceid == OVRD_SPACE_MEM)
+		reg_ovrd = &ovrd_mem_reg_map;
+	else if (spaceid == OVRD_SPACE_IO)
+		reg_ovrd = &ovrd_io_reg_map;
+	else
+		reg_ovrd = &ovrd_pci_conf_reg_map;
+
+	/* may stay unused (merge/conflict); a failure only matters on insert */
+	newreg = kmalloc(sizeof(*newreg), GFP_ATOMIC);
+
+	raw_spin_lock_irqsave(&io_hook_lock, lock_flags);
+	list_for_each_entry(ovrdreg, reg_ovrd, node) {
+		if (ovrdreg->address == address &&
+		    ovrdreg->attrib == attrib &&
+		    ovrdreg->length == length) {
+			/* if already added the address, just change the bits */
+			ovrdreg->bit_mask |= mask;
+			ovrdreg->val |= value;
+			pr_info("iohook_add_ovrd(): 0x%llx already added, changed to 0x%llx, mask:0x%llx, attrib:0x%x\n",
+				address, ovrdreg->val, ovrdreg->bit_mask,
+				attrib);
+			goto out;
+		} else if (address >= ovrdreg->address &&
+			   address < ovrdreg->address + ovrdreg->length) {
+			/* report the existing register that is in the way */
+			pr_info("iohook_add_ovrd(): conflicting reg at 0x%llx, length:0x%x, mask:0x%llx, attrib:0x%x\n",
+				ovrdreg->address, ovrdreg->length,
+				ovrdreg->bit_mask, attrib);
+			goto out;
+		}
+	}
+
+	if (newreg) {
+		newreg->address = address;
+		newreg->val = value;
+		newreg->length = length;
+		newreg->bit_mask = mask;
+		newreg->attrib = attrib;
+		list_add_tail(&newreg->node, reg_ovrd);
+		newreg = NULL;	/* now owned by the list */
+	} else {
+		alloc_failed = true;
+	}
+out:
+	raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+
+	kfree(newreg);	/* no-op unless the entry was not needed */
+	if (alloc_failed)
+		pr_info("failed to alloc Reg Override!\n");
+}
+
+/*
+ * Serializes iohook_start_ovrd() and iohook_stop_ovrd().
+ *
+ * static_key_slow_inc()/static_key_slow_dec() patch kernel text and may
+ * sleep, so they must not be called under a raw spinlock with interrupts
+ * disabled; a mutex is used here instead of engine_lock.
+ * NOTE(review): engine_lock is no longer taken by these two functions;
+ * drop its definition if nothing else uses it.
+ */
+static DEFINE_MUTEX(engine_mutex);
+
+/*
+ * to start the hook
+ *
+ * Enables the ovrdhw_enabled static key and sets g_ovrd_on. Does nothing
+ * if the hook is already on. Must be called from process context.
+ */
+void iohook_start_ovrd(void)
+{
+	mutex_lock(&engine_mutex);
+	if (!g_ovrd_on) {
+		static_key_slow_inc(&ovrdhw_enabled);
+		g_ovrd_on = 1;
+	}
+	mutex_unlock(&engine_mutex);
+}
+
+/*
+ * to stop the hook
+ *
+ * Clears g_ovrd_on and disables the ovrdhw_enabled static key. Does
+ * nothing if the hook is already off. Must be called from process context.
+ */
+void iohook_stop_ovrd(void)
+{
+	mutex_lock(&engine_mutex);
+	if (g_ovrd_on) {
+		g_ovrd_on = 0;
+		static_key_slow_dec(&ovrdhw_enabled);
+	}
+	mutex_unlock(&engine_mutex);
+}
+
+/*
+ * Report whether the hook is on: returns the current value of g_ovrd_on
+ * (1 after iohook_start_ovrd(), 0 after iohook_stop_ovrd()). The flag is
+ * read without taking any lock.
+ */
+int iohook_get_status(void)
+{
+ return g_ovrd_on;
+}
+
/* len should only be 1, 2, 4, 8 */
static int mem_read(u64 address, int len, void *data)
{
@@ -170,6 +315,9 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)

ret = -EINVAL;

+ if (!g_ovrd_on)
+ return ret;
+
if (spaceid == OVRD_SPACE_MEM) {
/* in the case of memory, 'address' is virtual */
vaddr = address;
@@ -300,6 +448,9 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)

ret = -EINVAL;

+ if (!g_ovrd_on)
+ return ret;
+
if (spaceid == OVRD_SPACE_MEM) {
/* in the case of memory, 'address' is virtual */
vaddr = address;
--
1.7.5.4