2005-03-13 09:56:06

by Roland McGrath

Subject: [PATCH] x86-64 kprobes: handle %RIP-relative addressing mode

The existing x86-64 kprobes implementation doesn't cope with the
%RIP-relative addressing mode. Kprobes work by single-stepping a copy of
an instruction overwritten by a breakpoint. When a probe is inserted on an
instruction that uses the %RIP-relative data addressing mode, the copy,
running at a different location, resolves a different effective address, so
the presence of that probe causes the probed code to read or write the wrong
memory location.
Without this problem fixed, it is woefully unsafe to use the current
kprobes code on x86-64 unless you are sure the instruction you instrument
is not one that accesses global data using the %RIP addressing mode.
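
To make the failure concrete, here is a minimal sketch (standalone C,
not part of the patch; all names are hypothetical) of how the CPU
resolves a %rip-relative operand, and why a byte-identical copy run at
a different address touches different memory:

	#include <stdint.h>

	/*
	 * A %rip-relative operand encodes a signed 32-bit displacement,
	 * which the CPU sign-extends and adds to the address of the
	 * *next* instruction.
	 */
	static uint64_t riprel_target(uint64_t next_insn_addr, int32_t disp32)
	{
		return next_insn_addr + (int64_t) disp32;
	}

	/*
	 * riprel_target(orig_next, d) != riprel_target(copy_next, d)
	 * whenever orig_next != copy_next, so the single-stepped copy
	 * reads or writes the wrong location unless d is adjusted.
	 */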

This patch fixes the problem by recognizing the %RIP-relative addressing
mode in an instruction when it's being copied to insert the kprobe, and
adjusting its displacement so that it finds the right data. Taking this
approach requires that the copied instruction's %RIP value be within 2GB of
the virtual address of the data, i.e. the text/data areas of the kernel
code and loaded modules. To satisfy this need, the patch also replaces the
use of vmalloc for getting instruction pages with lower-level calls that use
a different part of the address space: the region just above where modules
are loaded, at the very top of the address space. I left one page of red zone at
the top, and the 1MB-4KB thus available allows for at most 69632 kprobes.
(If we ever need to overcome that limit, we can change this to add a hook
into the arch/x86_64/kernel/module.c code and allocate pages inside the
module loading area instead.)
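
The fixup arithmetic is tiny; here is a hypothetical standalone version
of what arch_copy_kprobe() does in the patch below (names assumed, not
kernel API):

	#include <stdint.h>

	/*
	 * Compute the adjusted disp32 for the copied instruction.  The
	 * copy has the same length as the original, so the delta of the
	 * next-instruction addresses equals orig_insn - copy_insn.
	 */
	static int32_t adjust_disp32(uint64_t orig_insn, uint64_t copy_insn,
				     int32_t old_disp, int *fits)
	{
		int64_t disp = (int64_t) (orig_insn - copy_insn) + old_disp;

		/* The result must still fit in the signed 32-bit field;
		 * this is exactly the within-2GB requirement above. */
		*fits = ((int64_t) (int32_t) disp == disp);
		return (int32_t) disp;
	}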


Thanks,
Roland


Signed-off-by: Roland McGrath <[email protected]>

--- linux-2.6/arch/x86_64/kernel/kprobes.c
+++ linux-2.6/arch/x86_64/kernel/kprobes.c
@@ -25,6 +25,8 @@
* interface to access function arguments.
* 2004-Oct Jim Keniston <[email protected]> and Prasanna S Panchamukhi
* <[email protected]> adapted for x86_64
+ * 2005-Mar Roland McGrath <[email protected]>
+ * Fixed to handle %rip-relative addressing mode correctly.
*/

#include <linux/config.h>
@@ -86,9 +88,124 @@ int arch_prepare_kprobe(struct kprobe *p
return 0;
}

+/*
+ * Determine if the instruction uses the %rip-relative addressing mode.
+ * If it does, return the address of the 32-bit displacement word.
+ * If not, return null.
+ */
+static inline s32 *is_riprel(u8 *insn)
+{
+ static const unsigned char onebyte_has_modrm[256] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ /* 00 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 00 */
+ /* 10 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 10 */
+ /* 20 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 20 */
+ /* 30 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 30 */
+ /* 40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40 */
+ /* 50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 50 */
+ /* 60 */ 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0, /* 60 */
+ /* 70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 70 */
+ /* 80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 80 */
+ /* 90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 90 */
+ /* a0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* a0 */
+ /* b0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* b0 */
+ /* c0 */ 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0, /* c0 */
+ /* d0 */ 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1, /* d0 */
+ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* e0 */
+ /* f0 */ 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1 /* f0 */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+ static const unsigned char twobyte_has_modrm[256] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ /* 00 */ 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1, /* 0f */
+ /* 10 */ 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, /* 1f */
+ /* 20 */ 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1, /* 2f */
+ /* 30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 3f */
+ /* 40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 4f */
+ /* 50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 5f */
+ /* 60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 6f */
+ /* 70 */ 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1, /* 7f */
+ /* 80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 8f */
+ /* 90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 9f */
+ /* a0 */ 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1, /* af */
+ /* b0 */ 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1, /* bf */
+ /* c0 */ 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, /* cf */
+ /* d0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* df */
+ /* e0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ef */
+ /* f0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0 /* ff */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+ int need_modrm;
+
+ /* Skip legacy instruction prefixes. */
+ while (1) {
+ switch (*insn) {
+ case 0x66:
+ case 0x67:
+ case 0x2e:
+ case 0x3e:
+ case 0x26:
+ case 0x64:
+ case 0x65:
+ case 0x36:
+ case 0xf0:
+ case 0xf3:
+ case 0xf2:
+ ++insn;
+ continue;
+ }
+ break;
+ }
+
+ /* Skip REX instruction prefix. */
+ if ((*insn & 0xf0) == 0x40)
+ ++insn;
+
+ if (*insn == 0x0f) { /* Two-byte opcode. */
+ need_modrm = twobyte_has_modrm[*++insn];
+ } else { /* One-byte opcode. */
+ need_modrm = onebyte_has_modrm[*insn];
+ }
+
+ if (need_modrm) {
+ u8 modrm = *++insn;
+ if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
+ /* Displacement follows ModRM byte. */
+ return (s32 *) ++insn;
+ }
+ }
+
+ /* No %rip-relative addressing mode here. */
+ return NULL;
+}
+
void arch_copy_kprobe(struct kprobe *p)
{
+ s32 *ripdisp;
memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
+ ripdisp = is_riprel(p->ainsn.insn);
+ if (ripdisp) {
+ /*
+ * The copied instruction uses the %rip-relative
+ * addressing mode. Adjust the displacement for the
+ * difference between the original location of this
+ * instruction and the location of the copy that will
+ * actually be run. The tricky bit here is making sure
+ * that the sign extension happens correctly in this
+ * calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the
+ * %rip value and yield the same 64-bit result that the
+ * sign-extension of the original signed 32-bit
+ * displacement would have given.
+ */
+ s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
+ BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
+ *ripdisp = disp;
+ }
}

void arch_remove_kprobe(struct kprobe *p)
@@ -417,6 +534,8 @@ static kprobe_opcode_t *get_insn_slot(vo
{
struct kprobe_insn_page *kip;
struct hlist_node *pos;
+ struct vm_struct *area;
+ struct page **pages;

hlist_for_each(pos, &kprobe_insn_pages) {
kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
@@ -439,12 +558,52 @@ static kprobe_opcode_t *get_insn_slot(vo
if (!kip) {
return NULL;
}
- kip->insns = (kprobe_opcode_t*) __vmalloc(PAGE_SIZE,
- GFP_KERNEL|__GFP_HIGHMEM, __pgprot(__PAGE_KERNEL_EXEC));
- if (!kip->insns) {
+
+ /*
+ * For the %rip-relative displacement fixups to be doable, we
+ * need our instruction copy to be within +/- 2GB of any data
+ * it might access via %rip. That is, within 2GB of where the
+ * kernel image and loaded module images reside. From the base
+ * of kernel text (see vmlinux.lds.S) up through the top of the
+ * address space is less than 2GB total. There is a megabyte
+ * of space free from MODULES_END up to the top of the address
+ * space. We cap it one page short of that just to have some
+ * unmapped space at the very top for sanity's sake in case of
+ * *(NULL - constant) accesses in buggy kernel code.
+ *
+ * This basically replicates __vmalloc, except that it uses a
+ * range of addresses starting at MODULES_END. This also
+ * allocates a single page of address space with no following
+ * guard page (__get_vm_area always adds PAGE_SIZE to the size,
+ * so by passing zero we get the one page). We set up all the
+ * data structures here such that a normal vfree call tears
+ * them all down just right.
+ */
+ area = __get_vm_area(0, VM_ALLOC, MODULES_END, 0ULL - PAGE_SIZE);
+ if (!area)
+ goto fail_kip;
+ area->nr_pages = 1;
+ area->pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
+ if (!area->pages)
+ goto fail_area;
+ area->pages[0] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ if (!area->pages[0])
+ goto fail_pages;
+ pages = area->pages;
+ if (map_vm_area(area, PAGE_KERNEL_EXEC, &pages)) {
+ __free_page(area->pages[0]);
+ fail_pages:
+ kfree(area->pages);
+ fail_area:
+ remove_vm_area(area->addr);
+ kfree(area);
+ fail_kip:
kfree(kip);
return NULL;
}
+ BUG_ON(pages != area->pages + 1);
+ kip->insns = (kprobe_opcode_t *) area->addr;
+
INIT_HLIST_NODE(&kip->hlist);
hlist_add_head(&kip->hlist, &kprobe_insn_pages);
memset(kip->slot_used, 0, INSNS_PER_PAGE);


2005-03-13 16:44:32

by Andi Kleen

Subject: Re: [PATCH] x86-64 kprobes: handle %RIP-relative addressing mode

Roland McGrath <[email protected]> writes:

> The existing x86-64 kprobes implementation doesn't cope with the
> %RIP-relative addressing mode. Kprobes work by single-stepping a copy of

Thanks for fixing that long-standing bug.

> + static const unsigned char onebyte_has_modrm[256] = {

Can you turn these two arrays into a bitmap please?
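
(That is, pack the 256 flags into four 64-bit words and test by bit
index — roughly, with a hypothetical table name:

	need_modrm = (has_modrm[*insn >> 6] >> (*insn & 0x3f)) & 1;

which is what the revised patch below ends up doing via test_bit().)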

> + * This basically replicates __vmalloc, except that it uses a

This shouldn't be opencoded here. Instead make a utility function
like vmalloc_range() that takes a start and end address and
make the module allocation use it too.
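
For shape, such a helper might look like the following sketch — purely
illustrative, built from the same primitives the patch open-codes above;
the name and details are assumptions, not existing kernel API:

	/* Hypothetical vmalloc_range(): allocate and map executable
	 * pages somewhere in [start, end). */
	void *vmalloc_range(unsigned long size, unsigned long start,
			    unsigned long end)
	{
		struct vm_struct *area;
		struct page **pages;
		unsigned int i, npages;

		area = __get_vm_area(size, VM_ALLOC, start, end);
		if (!area)
			return NULL;
		/* area->size includes the trailing guard page. */
		npages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
		area->nr_pages = npages;
		area->pages = kmalloc(npages * sizeof(struct page *),
				      GFP_KERNEL);
		if (!area->pages)
			goto fail_area;
		for (i = 0; i < npages; i++) {
			area->pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
			if (!area->pages[i])
				goto fail_pages;
		}
		pages = area->pages;
		if (map_vm_area(area, PAGE_KERNEL_EXEC, &pages))
			goto fail_pages;
		return area->addr;

	fail_pages:
		while (i-- > 0)
			__free_page(area->pages[i]);
		kfree(area->pages);
	fail_area:
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}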

Also you should fix up asm-x86_64/page.h and Documentation/x86_64/mm.txt
with the new fixed allocation.

> + * range of addresses starting at MODULES_END. This also
> + * allocates a single page of address space with no following
> + * guard page (__get_vm_area always adds PAGE_SIZE to the size,
> + * so by passing zero we get the one page). We set up all the

I think Andrea has just changed that and the patch went into
mainline. Be careful with merging.

-Andi

2005-03-13 18:17:03

by Oleg Nesterov

Subject: Re: [PATCH] x86-64 kprobes: handle %RIP-relative addressing mode

Roland McGrath wrote:
>
> + * This basically replicates __vmalloc, except that it uses a
> + * range of addresses starting at MODULES_END

Could you look at these patches:

[PATCH 1/5] vmalloc: introduce __vmalloc_area() function
http://marc.theaimsgroup.com/?l=linux-kernel&m=111013183331326

[PATCH 5/5] vmalloc: use list of pages instead of array in vm_struct
http://marc.theaimsgroup.com/?l=linux-kernel&m=111013224029332

They are in mm3 now. Note that the second one will conflict with
your patch.

Is it possible to use __vmalloc_area()?

Oleg.

2005-03-13 21:48:34

by Andrew Morton

Subject: Re: [PATCH] x86-64 kprobes: handle %RIP-relative addressing mode

Roland McGrath <[email protected]> wrote:
>
> + area = __get_vm_area(0, VM_ALLOC, MODULES_END, 0ULL - PAGE_SIZE);

The longlong here seems wrong? If this is to mean "the top of the address
space minus a page" then unsigned long is the appropriate type.

2005-03-15 08:59:57

by Roland McGrath

Subject: Re: [PATCH] x86-64 kprobes: handle %RIP-relative addressing mode

> Can you turn these two arrays into a bitmap please?

Ok.

> This shouldn't be opencoded here. Instead make a utility function
> like vmalloc_range() that takes a start and end address and
> make the module allocation use it too.
>
> Also you should fix up asm-x86_64/page.h and Documentation/x86_64/mm.txt
> with the new fixed allocation.
[...]
> I think Andrea has just changed that and the patch went into
> mainline. Be careful with merging.

Since __get_vm_area has been changed to make it harder to avoid the guard
page, I decided just to punt and use module_alloc instead. This works
either with or without the -mm patches that clean it up to use __vmalloc_area.
There is enough address space in the module area that I'm not going to
worry about each page kprobes uses wasting a second page of address space.

Here is a new version of the patch that addresses your comments.


Thanks,
Roland


Signed-off-by: Roland McGrath <[email protected]>

--- linux-2.6/arch/x86_64/kernel/kprobes.c
+++ linux-2.6/arch/x86_64/kernel/kprobes.c
@@ -25,6 +25,8 @@
* interface to access function arguments.
* 2004-Oct Jim Keniston <[email protected]> and Prasanna S Panchamukhi
* <[email protected]> adapted for x86_64
+ * 2005-Mar Roland McGrath <[email protected]>
+ * Fixed to handle %rip-relative addressing mode correctly.
*/

#include <linux/config.h>
@@ -34,7 +36,7 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/preempt.h>
-#include <linux/vmalloc.h>
+#include <linux/moduleloader.h>

#include <asm/pgtable.h>
#include <asm/kdebug.h>
@@ -86,9 +88,132 @@ int arch_prepare_kprobe(struct kprobe *p
return 0;
}

+/*
+ * Determine if the instruction uses the %rip-relative addressing mode.
+ * If it does, return the address of the 32-bit displacement word.
+ * If not, return null.
+ */
+static inline s32 *is_riprel(u8 *insn)
+{
+#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 64))
+ static const u64 onebyte_has_modrm[256 / 64] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
+ W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
+ W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
+ W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
+ W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
+ W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
+ W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
+ W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
+ W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
+ W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
+ W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
+ W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
+ W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
+ W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
+ W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
+ W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+ static const u64 twobyte_has_modrm[256 / 64] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
+ W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
+ W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
+ W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
+ W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
+ W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
+ W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
+ W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
+ W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
+ W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
+ W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
+ W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
+ W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
+ W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
+ W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
+ W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+#undef W
+ int need_modrm;
+
+ /* Skip legacy instruction prefixes. */
+ while (1) {
+ switch (*insn) {
+ case 0x66:
+ case 0x67:
+ case 0x2e:
+ case 0x3e:
+ case 0x26:
+ case 0x64:
+ case 0x65:
+ case 0x36:
+ case 0xf0:
+ case 0xf3:
+ case 0xf2:
+ ++insn;
+ continue;
+ }
+ break;
+ }
+
+ /* Skip REX instruction prefix. */
+ if ((*insn & 0xf0) == 0x40)
+ ++insn;
+
+ if (*insn == 0x0f) { /* Two-byte opcode. */
+ ++insn;
+ need_modrm = test_bit(*insn, twobyte_has_modrm);
+ } else { /* One-byte opcode. */
+ need_modrm = test_bit(*insn, onebyte_has_modrm);
+ }
+
+ if (need_modrm) {
+ u8 modrm = *++insn;
+ if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
+ /* Displacement follows ModRM byte. */
+ return (s32 *) ++insn;
+ }
+ }
+
+ /* No %rip-relative addressing mode here. */
+ return NULL;
+}
+
void arch_copy_kprobe(struct kprobe *p)
{
+ s32 *ripdisp;
memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
+ ripdisp = is_riprel(p->ainsn.insn);
+ if (ripdisp) {
+ /*
+ * The copied instruction uses the %rip-relative
+ * addressing mode. Adjust the displacement for the
+ * difference between the original location of this
+ * instruction and the location of the copy that will
+ * actually be run. The tricky bit here is making sure
+ * that the sign extension happens correctly in this
+ * calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the
+ * %rip value and yield the same 64-bit result that the
+ * sign-extension of the original signed 32-bit
+ * displacement would have given.
+ */
+ s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
+ BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
+ *ripdisp = disp;
+ }
}

void arch_remove_kprobe(struct kprobe *p)
@@ -439,8 +564,15 @@ static kprobe_opcode_t *get_insn_slot(vo
if (!kip) {
return NULL;
}
- kip->insns = (kprobe_opcode_t*) __vmalloc(PAGE_SIZE,
- GFP_KERNEL|__GFP_HIGHMEM, __pgprot(__PAGE_KERNEL_EXEC));
+
+ /*
+ * For the %rip-relative displacement fixups to be doable, we
+ * need our instruction copy to be within +/- 2GB of any data it
+ * might access via %rip. That is, within 2GB of where the
+ * kernel image and loaded module images reside. So we allocate
+ * a page in the module loading area.
+ */
+ kip->insns = module_alloc(PAGE_SIZE);
if (!kip->insns) {
kfree(kip);
return NULL;
@@ -481,7 +614,7 @@ static void free_insn_slot(kprobe_opcode
hlist_add_head(&kip->hlist,
&kprobe_insn_pages);
} else {
- vfree(kip->insns);
+ module_free(NULL, kip->insns);
kfree(kip);
}
}