This tests that the vsyscall entries do what they're expected to do.
It also confirms that attempts to read the vsyscall page behave as
expected.
If changes are made to the vsyscall code or its memory map handling,
running this test in all three of vsyscall=none, vsyscall=emulate,
and vsyscall=native are helpful.
(Because it's easy, this also compares the vsyscall results to their
vDSO equivalents.)
Signed-off-by: Andy Lutomirski <[email protected]>
---
It's RFC because I want to re-read it myself first. It's also missing
a test that will reliably make sure that vsyscall=none prevents use of
vsyscalls.
Also, I want to add vsyscall=emulate_noread that makes the vsyscall
page be --x. And I want to add a per-process option to turn off
vsyscalls.
tools/testing/selftests/x86/Makefile | 2 +-
tools/testing/selftests/x86/test_vsyscall.c | 435 ++++++++++++++++++++++++++++
2 files changed, 436 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/x86/test_vsyscall.c
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 939a337128db..5d4f10ac2af2 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -7,7 +7,7 @@ include ../lib.mk
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \
- protection_keys test_vdso
+ protection_keys test_vdso test_vsyscall
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
new file mode 100644
index 000000000000..44d873d71b85
--- /dev/null
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -0,0 +1,435 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <dlfcn.h>
+#include <string.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <errno.h>
+#include <err.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <setjmp.h>
+
+#ifdef __x86_64__
+# define VSYS(x) (x)
+#else
+# define VSYS(x) 0
+#endif
+
+#ifndef SYS_getcpu
+# ifdef __x86_64__
+# define SYS_getcpu 309
+# else
+# define SYS_getcpu 318
+# endif
+#endif
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+/* vsyscalls and vDSO */
+bool should_read_vsyscall = false;
+
+typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
+gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
+gtod_t vdso_gtod;
+
+typedef int (*vgettime_t)(clockid_t, struct timespec *);
+vgettime_t vdso_gettime;
+
+typedef long (*time_func_t)(time_t *t);
+time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
+time_func_t vdso_time;
+
+typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
+getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
+getcpu_t vdso_getcpu;
+
+static void init_vdso(void)
+{
+ void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+ if (!vdso)
+ vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+ if (!vdso) {
+ printf("Warning: failed to find vDSO\n");
+ return;
+ }
+
+ vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday");
+ if (!vdso_gtod)
+ printf("Warning: failed to find gettimeofday in vDSO\n");
+
+ vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
+ if (!vdso_gettime)
+ printf("Warning: failed to find clock_gettime in vDSO\n");
+
+ vdso_time = (time_func_t)dlsym(vdso, "__vdso_time");
+ if (!vdso_time)
+ printf("Warning: failed to find time in vDSO\n");
+
+ vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
+ if (!vdso_getcpu)
+ printf("Warning: failed to find getcpu in vDSO\n");
+}
+
+static int init_vsys(void)
+{
+#ifdef __x86_64__
+ int nerrs = 0;
+ FILE *maps;
+ char line[128];
+ bool found = false;
+
+ maps = fopen("/proc/self/maps", "r");
+ if (!maps) {
+ printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
+ should_read_vsyscall = true;
+ return 0;
+ }
+
+ while (fgets(line, sizeof(line), maps)) {
+ char r, x;
+ void *start, *end;
+ char name[128];
+ if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+ &start, &end, &r, &x, name) != 5)
+ continue;
+
+ if (strcmp(name, "[vsyscall]"))
+ continue;
+
+ printf("\tvsyscall map: %s", line);
+
+ if (start != (void *)0xffffffffff600000 ||
+ end != (void *)0xffffffffff601000) {
+ printf("[FAIL]\taddress range is nonsense\n");
+ nerrs++;
+ }
+
+ printf("\tvsyscall permissions are %c-%c\n", r, x);
+ should_read_vsyscall = (r == 'r');
+ if (x != 'x') {
+ vgtod = NULL;
+ vtime = NULL;
+ vgetcpu = NULL;
+ }
+
+ found = true;
+ break;
+ }
+
+ fclose(maps);
+
+ if (!found) {
+ printf("\tno vsyscall map in /proc/self/maps\n");
+ should_read_vsyscall = false;
+ vgtod = NULL;
+ vtime = NULL;
+ vgetcpu = NULL;
+ }
+
+ return nerrs;
+#else
+ return 0;
+#endif
+}
+
+/* syscalls */
+static inline long sys_gtod(struct timeval *tv, struct timezone *tz)
+{
+ return syscall(SYS_gettimeofday, tv, tz);
+}
+
+static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ return syscall(SYS_clock_gettime, id, ts);
+}
+
+static inline long sys_time(time_t *t)
+{
+ return syscall(SYS_time, t);
+}
+
+static inline long sys_getcpu(unsigned * cpu, unsigned * node,
+ void* cache)
+{
+ return syscall(SYS_getcpu, cpu, node, cache);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+static double tv_diff(const struct timeval *a, const struct timeval *b)
+{
+ return (double)(a->tv_sec - b->tv_sec) +
+ (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6;
+}
+
+static int check_gtod(const struct timeval *tv_sys1,
+ const struct timeval *tv_sys2,
+ const struct timezone *tz_sys,
+ const char *which,
+ const struct timeval *tv_other,
+ const struct timezone *tz_other)
+{
+ int nerrs = 0;
+ double d1, d2;
+
+ if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) {
+ printf("[FAIL] %s tz mismatch\n", which);
+ nerrs++;
+ }
+
+ d1 = tv_diff(tv_other, tv_sys1);
+ d2 = tv_diff(tv_sys2, tv_other);
+ printf("\t%s time offsets: %lf %lf\n", which, d1, d2);
+
+ if (d1 < 0 || d2 < 0) {
+ printf("[FAIL]\t%s time was inconsistent with the syscall\n", which);
+ nerrs++;
+ } else {
+ printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which);
+ }
+
+ return nerrs;
+}
+
+static int test_gtod(void)
+{
+ struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys;
+ struct timezone tz_sys, tz_vdso, tz_vsys;
+ long ret_vdso = -1;
+ long ret_vsys = -1;
+ int nerrs = 0;
+
+ printf("[RUN]\ttest gettimeofday()\n");
+
+ if (sys_gtod(&tv_sys1, &tz_sys) != 0)
+ err(1, "syscall gettimeofday");
+ if (vdso_gtod)
+ ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
+ if (vgtod)
+ ret_vsys = vgtod(&tv_vsys, &tz_vsys);
+ if (sys_gtod(&tv_sys2, &tz_sys) != 0)
+ err(1, "syscall gettimeofday");
+
+ if (vdso_gtod) {
+ if (ret_vdso == 0) {
+ nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
+ } else {
+ printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso);
+ nerrs++;
+ }
+ }
+
+ if (vgtod) {
+ if (ret_vsys == 0) {
+ nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
+ } else {
+ printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys);
+ nerrs++;
+ }
+ }
+
+ return nerrs;
+}
+
+static int test_time(void) {
+ int nerrs = 0;
+
+ printf("[RUN]\ttest time()\n");
+ long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0;
+ long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1;
+ t_sys1 = sys_time(&t2_sys1);
+ if (vdso_time)
+ t_vdso = vdso_time(&t2_vdso);
+ if (vtime)
+ t_vsys = vtime(&t2_vsys);
+ t_sys2 = sys_time(&t2_sys2);
+ if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
+ printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2);
+ nerrs++;
+ return nerrs;
+ }
+
+ if (vdso_time) {
+ if (t_vdso < 0 || t_vdso != t2_vdso) {
+ printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso);
+ nerrs++;
+ } else if (t_vdso < t_sys1 || t_vdso > t_sys2) {
+ printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2);
+ nerrs++;
+ } else {
+ printf("[OK]\tvDSO time() is okay\n");
+ }
+ }
+
+ if (vtime) {
+ if (t_vsys < 0 || t_vsys != t2_vsys) {
+ printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
+ nerrs++;
+ } else if (t_vsys < t_sys1 || t_vsys > t_sys2) {
+ printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2);
+ nerrs++;
+ } else {
+ printf("[OK]\tvsyscall time() is okay\n");
+ }
+ }
+
+ return nerrs;
+}
+
+static int test_getcpu(int cpu)
+{
+ int nerrs = 0;
+ long ret_sys, ret_vdso = -1, ret_vsys = -1;
+
+ printf("[RUN]\tgetcpu() on CPU %d\n", cpu);
+
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+ printf("[SKIP]\tfailed to force CPU %d\n", cpu);
+ return nerrs;
+ }
+
+ unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
+ unsigned node = 0;
+ bool have_node = false;
+ ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
+ if (vdso_getcpu)
+ ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
+ if (vgetcpu)
+ ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
+
+ if (ret_sys == 0) {
+ if (cpu_sys != cpu) {
+ printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu);
+ nerrs++;
+ }
+
+ have_node = true;
+ node = node_sys;
+ }
+
+ if (vdso_getcpu) {
+ if (ret_vdso) {
+ printf("[FAIL]\tvDSO getcpu() failed\n");
+ nerrs++;
+ } else {
+ if (!have_node) {
+ have_node = true;
+ node = node_vdso;
+ }
+
+ if (cpu_vdso != cpu) {
+ printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu);
+ nerrs++;
+ } else {
+ printf("[OK]\tvDSO reported correct CPU\n");
+ }
+
+ if (node_vdso != node) {
+ printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node);
+ nerrs++;
+ } else {
+ printf("[OK]\tvDSO reported correct node\n");
+ }
+ }
+ }
+
+ if (vgetcpu) {
+ if (ret_vsys) {
+ printf("[FAIL]\tvsyscall getcpu() failed\n");
+ nerrs++;
+ } else {
+ if (!have_node) {
+ have_node = true;
+ node = node_vsys;
+ }
+
+ if (cpu_vsys != cpu) {
+ printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu);
+ nerrs++;
+ } else {
+ printf("[OK]\tvsyscall reported correct CPU\n");
+ }
+
+ if (node_vsys != node) {
+ printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node);
+ nerrs++;
+ } else {
+ printf("[OK]\tvsyscall reported correct node\n");
+ }
+ }
+ }
+
+ return nerrs;
+}
+
+static int test_vsys_r(void)
+{
+#ifdef __x86_64__
+ printf("[RUN]\tChecking read access to the vsyscall page\n");
+ bool can_read;
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ *(volatile int *)0xffffffffff600000;
+ can_read = true;
+ } else {
+ can_read = false;
+ }
+
+ if (can_read && !should_read_vsyscall) {
+ printf("[FAIL]\tWe have read access, but we shouldn't\n");
+ return 1;
+ } else if (!can_read && should_read_vsyscall) {
+ printf("[FAIL]\tWe don't have read access, but we should\n");
+ return 1;
+ } else {
+ printf("[OK]\tgot expected result\n");
+ }
+#endif
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int nerrs = 0;
+
+ init_vdso();
+ nerrs += init_vsys();
+
+ nerrs += test_gtod();
+ nerrs += test_time();
+ nerrs += test_getcpu(0);
+ nerrs += test_getcpu(1);
+
+ sethandler(SIGSEGV, sigsegv, 0);
+ nerrs += test_vsys_r();
+
+ return nerrs ? 1 : 0;
+}
--
2.13.6
On Thu, Jan 04, 2018 at 09:38:37PM -0800, Andy Lutomirski wrote:
> Also, I want to add vsyscall=emulate_noread that makes the vsyscall
> page be --x. And I want to add a per-process option to turn off
> vsyscalls.
What for?
It sounds like a bunch of work for something which is deprecated
anyway...
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Thu, Jan 04, 2018 at 09:38:37PM -0800, Andy Lutomirski wrote:
> It's RFC because I want to re-read it myself first. It's also missing
> a test that will reliably make sure that vsyscall=none prevents use of
> vsyscalls.
With my patch ontop of 4.4 from yesterday:
# ./test_vsyscall_32
Warning: failed to find getcpu in vDSO
[RUN] test gettimeofday()
vDSO time offsets: 0.000001 0.000001
[OK] vDSO gettimeofday()'s timeval was okay
[RUN] test time()
[OK] vDSO time() is okay
[RUN] getcpu() on CPU 0
[RUN] getcpu() on CPU 1
# ./test_vsyscall_64
vsyscall map: ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
vsyscall permissions are r-x
[RUN] test gettimeofday()
vDSO time offsets: 0.000000 0.000005
[OK] vDSO gettimeofday()'s timeval was okay
vsyscall time offsets: 0.000004 0.000001
[OK] vsyscall gettimeofday()'s timeval was okay
[RUN] test time()
[OK] vDSO time() is okay
[OK] vsyscall time() is okay
[RUN] getcpu() on CPU 0
[OK] vDSO reported correct CPU
[OK] vDSO reported correct node
[OK] vsyscall reported correct CPU
[OK] vsyscall reported correct node
[RUN] getcpu() on CPU 1
[OK] vDSO reported correct CPU
[OK] vDSO reported correct node
[OK] vsyscall reported correct CPU
[OK] vsyscall reported correct node
[RUN] Checking read access to the vsyscall page
[FAIL] We don't have read access, but we should
Do I really need the read access? :-)
I have CONFIG_LEGACY_VSYSCALL_EMULATE=y
Thx.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Fri, Jan 5, 2018 at 4:33 AM, Borislav Petkov <[email protected]> wrote:
> On Thu, Jan 04, 2018 at 09:38:37PM -0800, Andy Lutomirski wrote:
>> Also, I want to add vsyscall=emulate_noread that makes the vsyscall
>> page be --x. And I want to add a per-process option to turn off
>> vsyscalls.
>
> What for?
>
> It sounds like a bunch of work for something which is deprecated
> anyway...
>
emulate_noread would avoid one exploit technique that Kees saw
somewhere. And per-process disablement would let a system remain
compatible with old binaries without reducing security for newer
binaries.
On Fri, Jan 5, 2018 at 5:40 AM, Borislav Petkov <[email protected]> wrote:
> On Thu, Jan 04, 2018 at 09:38:37PM -0800, Andy Lutomirski wrote:
>> It's RFC because I want to re-read it myself first. It's also missing
>> a test that will reliably make sure that vsyscall=none prevents use of
>> vsyscalls.
>
> With my patch ontop of 4.4 from yesterday:
>
> # ./test_vsyscall_32
> Warning: failed to find getcpu in vDSO
I should suppress that warning on 32-bit. Apparently no one ever
wired up 32-bit getcpu.
> [RUN] Checking read access to the vsyscall page
> [FAIL] We don't have read access, but we should
>
> Do I really need the read access? :-)
Yes. There are very clever tools like 'pin' that instrument a binary
by decoding all the instructions it executes and generating an
instrumented copy. If that binary calls into the vDSO, the vDSO gets
decoded and instrumented (which works fine). If the binary calls into
the vsyscall page, it still needs to work. So the vsyscall page
contains machine code that actually works (even if it's NX) to support
these tools. The authors and users of the tools yelled loudly in an
earlier version of the vsyscall emulation code that didn't support
this use case.
The root cause here is that 4.4 is KAISER, not KPTI. The
kaiser_set_shadow_pgd() function is a steaming pile of shit, and this
is a known bug in it. I have zero desire to hack up some stupid
special case in there. For the modern KPTI kernels, I rewrote that
function entirely to be much simpler and much more correct.
It should be straightforward to kludge something up, though, but I'm
not volunteering.
On Fri, Jan 05, 2018 at 09:53:16AM -0800, Andy Lutomirski wrote:
> emulate_noread would avoid one exploit technique that Kees saw
> somewhere. And per-process disablement would let a system remain
> compatible with old binaries without reducing security for newer
> binaries.
Or we can simply say new binaries can switch to the vdso. Because this
way, vsyscall will never really be phased out - new shit will simply
keep using it.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Fri, Jan 05, 2018 at 10:01:23AM -0800, Andy Lutomirski wrote:
> Yes. There are very clever tools like 'pin' that instrument a binary
> by decoding all the instructions it executes and generating an
> instrumented copy. If that binary calls into the vDSO, the vDSO gets
> decoded and instrumented (which works fine). If the binary calls into
> the vsyscall page, it still needs to work. So the vsyscall page
> contains machine code that actually works (even if it's NX) to support
> these tools. The authors and users of the tools yelled loudly in an
> earlier version of the vsyscall emulation code that didn't support
> this use case.
It rings a bell...
> The root cause here is that 4.4 is KAISER, not KPTI. The
> kaiser_set_shadow_pgd() function is a steaming pile of shit, and this
> is a known bug in it.
Tell me about it.
We found out last night it breaks EFI too, see:
https://lkml.kernel.org/r/[email protected]
To put it mildly, this new PTI et al crap will bring us a lot of fun in
the coming year. I tell ya, a year from now we'll be dealing with the
fallout from this.
> I have zero desire to hack up some stupid special case in there. For
> the modern KPTI kernels, I rewrote that function entirely to be much
> simpler and much more correct.
>
> It should be straightforward to kludge something up, though, but I'm
> not volunteering.
Yeah, I think adding _PAGE_RW into the mix should fix it but I need to
give it a try first.
Thanks for the test!
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Fri, Jan 5, 2018 at 10:30 AM, Borislav Petkov <[email protected]> wrote:
> On Fri, Jan 05, 2018 at 10:01:23AM -0800, Andy Lutomirski wrote:
>> Yes. There are very clever tools like 'pin' that instrument a binary
>> by decoding all the instructions it executes and generating an
>> instrumented copy. If that binary calls into the vDSO, the vDSO gets
>> decoded and instrumented (which works fine). If the binary calls into
>> the vsyscall page, it still needs to work. So the vsyscall page
>> contains machine code that actually works (even if it's NX) to support
>> these tools. The authors and users of the tools yelled loudly in an
>> earlier version of the vsyscall emulation code that didn't support
>> this use case.
>
> It rings a bell...
>
>> The root cause here is that 4.4 is KAISER, not KPTI. The
>> kaiser_set_shadow_pgd() function is a steaming pile of shit, and this
>> is a known bug in it.
>
> Tell me about it.
>
> We found out last night it breaks EFI too, see:
>
> https://lkml.kernel.org/r/[email protected]
>
> To put it mildly, this new PTI et al crap will bring us a lot of fun in
> the coming year. I tell ya, a year from now we'll be dealing with the
> fallout from this.
>
>> I have zero desire to hack up some stupid special case in there. For
>> the modern KPTI kernels, I rewrote that function entirely to be much
>> simpler and much more correct.
>>
>> It should be straightforward to kludge something up, though, but I'm
>> not volunteering.
>
> Yeah, I think adding _PAGE_RW into the mix should fix it but I need to
> give it a try first.
>
Not _PAGE_RW. Probably _PAGE_USER somewhere in the hierarchy.
On Fri, Jan 5, 2018 at 10:23 AM, Borislav Petkov <[email protected]> wrote:
> On Fri, Jan 05, 2018 at 09:53:16AM -0800, Andy Lutomirski wrote:
>> emulate_noread would avoid one exploit technique that Kees saw
>> somewhere. And per-process disablement would let a system remain
>> compatible with old binaries without reducing security for newer
>> binaries.
>
> Or we can simply say new binaries can switch to the vdso. Because this
> way, vsyscall will never really be phased out - new shit will simply
> keep using it.
As far as I know, new binaries don't use vsycall. The holdouts were
musl and Go, and both are fixed.
The remaining problem is that, for certain classes of userspace bugs,
an attacker can take advantage of the vsyscall page's existence at a
fixed address to cause mischief. So opting out of having it be there
could be helpful to mitigate attacks.
On Fri, Jan 05, 2018 at 10:47:15AM -0800, Andy Lutomirski wrote:
> The remaining problem is that, for certain classes of userspace bugs,
> an attacker can take advantage of the vsyscall page's existence at a
> fixed address to cause mischief. So opting out of having it be there
> could be helpful to mitigate attacks.
I understand that but how do you shoo people off the vsyscall page? You
need to tell old binaries about the per-process disablement and new
binaries to move to vdso.
Hmmm, add big fat warnings to vsyscall_64.c?
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Fri, Jan 05, 2018 at 10:45:49AM -0800, Andy Lutomirski wrote:
> Not _PAGE_RW. Probably _PAGE_USER somewhere in the hierarchy.
Yeah, just realized that. But it must be somewhere in the PT hierarchy
because:
0xffffffffff600000-0xffffffffff601000 4K USR ro NX pte
So something up needs to take _PAGE_USER too.
But WTF does it say NX there for and still can execute the vsyscall
test? Oh boy, what a mess...
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
> On Jan 5, 2018, at 11:10 AM, Borislav Petkov <[email protected]> wrote:
>
>> On Fri, Jan 05, 2018 at 10:45:49AM -0800, Andy Lutomirski wrote:
>> Not _PAGE_RW. Probably _PAGE_USER somewhere in the hierarchy.
>
> Yeah, just realized that. But it must be somewhere in the PT hierarchy
> because:
>
> 0xffffffffff600000-0xffffffffff601000 4K USR ro NX pte
>
> So something up needs to take _PAGE_USER too.
>
> But WTF does it say NX there for and still can execute the vsyscall
> test? Oh boy, what a mess...
>
It's emulated! We catch the page fault and fake the whole thing :)
> --
> Regards/Gruss,
> Boris.
>
> Good mailing practices for 400: avoid top-posting and trim the reply.
On Fri, Jan 05, 2018 at 11:22:21AM -0800, Andy Lutomirski wrote:
> It's emulated! We catch the page fault and fake the whole thing :)
Then I'm really confused. It says "ro" above, which means _PAGE_RW is
not set so page is read-only.
I must be missing something...
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
> On Jan 5, 2018, at 11:28 AM, Borislav Petkov <[email protected]> wrote:
>
>> On Fri, Jan 05, 2018 at 11:22:21AM -0800, Andy Lutomirski wrote:
>> It's emulated! We catch the page fault and fake the whole thing :)
>
> Then I'm really confused. It says "ro" above, which means _PAGE_RW is
> not set so page is read-only.
>
> I must be missing something...
>
It's meant to be read-only, user-acccessible, NX as far as the CPU is concerned. When user code calls it, we get an instruction fetch fault, and the kernel fixes it up.
> --
> Regards/Gruss,
> Boris.
>
> Good mailing practices for 400: avoid top-posting and trim the reply.
On 5 January 2018 at 11:08, Andy Lutomirski <[email protected]> wrote:
> This tests that the vsyscall entries do what they're expected to do.
> It also confirms that attempts to read the vsyscall page behave as
> expected.
>
> If changes are made to the vsyscall code or its memory map handling,
> running this test in all three of vsyscall=none, vsyscall=emulate,
> and vsyscall=native are helpful.
We do not want to change the kernel command lines dynamically on
the test bench. It will be hard to test this case in automated setup.
Because with a given setup we run all selftests in a single go.
- Naresh
>
> (Because it's easy, this also compares the vsyscall results to their
> vDSO equivalents.)
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> It's RFC because I want to re-read it myself first. It's also missing
> a test that will reliably make sure that vsyscall=none prevents use of
> vsyscalls.
>
> Also, I want to add vsyscall=emulate_noread that makes the vsyscall
> page be --x. And I want to add a per-process option to turn off
> vsyscalls.
>
> tools/testing/selftests/x86/Makefile | 2 +-
> tools/testing/selftests/x86/test_vsyscall.c | 435 ++++++++++++++++++++++++++++
> 2 files changed, 436 insertions(+), 1 deletion(-)
> create mode 100644 tools/testing/selftests/x86/test_vsyscall.c
>
> diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
> index 939a337128db..5d4f10ac2af2 100644
> --- a/tools/testing/selftests/x86/Makefile
> +++ b/tools/testing/selftests/x86/Makefile
> @@ -7,7 +7,7 @@ include ../lib.mk
>
> TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
> check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \
> - protection_keys test_vdso
> + protection_keys test_vdso test_vsyscall
> TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
> test_FCMOV test_FCOMI test_FISTTP \
> vdso_restorer
> diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
> new file mode 100644
> index 000000000000..44d873d71b85
> --- /dev/null
> +++ b/tools/testing/selftests/x86/test_vsyscall.c
> @@ -0,0 +1,435 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#define _GNU_SOURCE
> +
> +#include <stdio.h>
> +#include <sys/time.h>
> +#include <time.h>
> +#include <stdlib.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +#include <dlfcn.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <signal.h>
> +#include <sys/ucontext.h>
> +#include <errno.h>
> +#include <err.h>
> +#include <sched.h>
> +#include <stdbool.h>
> +#include <setjmp.h>
> +
> +#ifdef __x86_64__
> +# define VSYS(x) (x)
> +#else
> +# define VSYS(x) 0
> +#endif
> +
> +#ifndef SYS_getcpu
> +# ifdef __x86_64__
> +# define SYS_getcpu 309
> +# else
> +# define SYS_getcpu 318
> +# endif
> +#endif
> +
> +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
> + int flags)
> +{
> + struct sigaction sa;
> + memset(&sa, 0, sizeof(sa));
> + sa.sa_sigaction = handler;
> + sa.sa_flags = SA_SIGINFO | flags;
> + sigemptyset(&sa.sa_mask);
> + if (sigaction(sig, &sa, 0))
> + err(1, "sigaction");
> +}
> +
> +/* vsyscalls and vDSO */
> +bool should_read_vsyscall = false;
> +
> +typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
> +gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
> +gtod_t vdso_gtod;
> +
> +typedef int (*vgettime_t)(clockid_t, struct timespec *);
> +vgettime_t vdso_gettime;
> +
> +typedef long (*time_func_t)(time_t *t);
> +time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
> +time_func_t vdso_time;
> +
> +typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
> +getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
> +getcpu_t vdso_getcpu;
> +
> +static void init_vdso(void)
> +{
> + void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
> + if (!vdso)
> + vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
> + if (!vdso) {
> + printf("Warning: failed to find vDSO\n");
> + return;
> + }
> +
> + vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday");
> + if (!vdso_gtod)
> + printf("Warning: failed to find gettimeofday in vDSO\n");
> +
> + vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
> + if (!vdso_gettime)
> + printf("Warning: failed to find clock_gettime in vDSO\n");
> +
> + vdso_time = (time_func_t)dlsym(vdso, "__vdso_time");
> + if (!vdso_time)
> + printf("Warning: failed to find time in vDSO\n");
> +
> + vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
> + if (!vdso_getcpu)
> + printf("Warning: failed to find getcpu in vDSO\n");
> +}
> +
> +static int init_vsys(void)
> +{
> +#ifdef __x86_64__
> + int nerrs = 0;
> + FILE *maps;
> + char line[128];
> + bool found = false;
> +
> + maps = fopen("/proc/self/maps", "r");
> + if (!maps) {
> + printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
> + should_read_vsyscall = true;
> + return 0;
> + }
> +
> + while (fgets(line, sizeof(line), maps)) {
> + char r, x;
> + void *start, *end;
> + char name[128];
> + if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
> + &start, &end, &r, &x, name) != 5)
> + continue;
> +
> + if (strcmp(name, "[vsyscall]"))
> + continue;
> +
> + printf("\tvsyscall map: %s", line);
> +
> + if (start != (void *)0xffffffffff600000 ||
> + end != (void *)0xffffffffff601000) {
> + printf("[FAIL]\taddress range is nonsense\n");
> + nerrs++;
> + }
> +
> + printf("\tvsyscall permissions are %c-%c\n", r, x);
> + should_read_vsyscall = (r == 'r');
> + if (x != 'x') {
> + vgtod = NULL;
> + vtime = NULL;
> + vgetcpu = NULL;
> + }
> +
> + found = true;
> + break;
> + }
> +
> + fclose(maps);
> +
> + if (!found) {
> + printf("\tno vsyscall map in /proc/self/maps\n");
> + should_read_vsyscall = false;
> + vgtod = NULL;
> + vtime = NULL;
> + vgetcpu = NULL;
> + }
> +
> + return nerrs;
> +#else
> + return 0;
> +#endif
> +}
> +
> +/* syscalls */
> +static inline long sys_gtod(struct timeval *tv, struct timezone *tz)
> +{
> + return syscall(SYS_gettimeofday, tv, tz);
> +}
> +
> +static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
> +{
> + return syscall(SYS_clock_gettime, id, ts);
> +}
> +
> +static inline long sys_time(time_t *t)
> +{
> + return syscall(SYS_time, t);
> +}
> +
> +static inline long sys_getcpu(unsigned * cpu, unsigned * node,
> + void* cache)
> +{
> + return syscall(SYS_getcpu, cpu, node, cache);
> +}
> +
> +static jmp_buf jmpbuf;
> +
> +static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
> +{
> + siglongjmp(jmpbuf, 1);
> +}
> +
> +static double tv_diff(const struct timeval *a, const struct timeval *b)
> +{
> + return (double)(a->tv_sec - b->tv_sec) +
> + (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6;
> +}
> +
> +static int check_gtod(const struct timeval *tv_sys1,
> + const struct timeval *tv_sys2,
> + const struct timezone *tz_sys,
> + const char *which,
> + const struct timeval *tv_other,
> + const struct timezone *tz_other)
> +{
> + int nerrs = 0;
> + double d1, d2;
> +
> + if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) {
> + printf("[FAIL] %s tz mismatch\n", which);
> + nerrs++;
> + }
> +
> + d1 = tv_diff(tv_other, tv_sys1);
> + d2 = tv_diff(tv_sys2, tv_other);
> + printf("\t%s time offsets: %lf %lf\n", which, d1, d2);
> +
> + if (d1 < 0 || d2 < 0) {
> + printf("[FAIL]\t%s time was inconsistent with the syscall\n", which);
> + nerrs++;
> + } else {
> + printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which);
> + }
> +
> + return nerrs;
> +}
> +
> +static int test_gtod(void)
> +{
> + struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys;
> + struct timezone tz_sys, tz_vdso, tz_vsys;
> + long ret_vdso = -1;
> + long ret_vsys = -1;
> + int nerrs = 0;
> +
> + printf("[RUN]\ttest gettimeofday()\n");
> +
> + if (sys_gtod(&tv_sys1, &tz_sys) != 0)
> + err(1, "syscall gettimeofday");
> + if (vdso_gtod)
> + ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
> + if (vgtod)
> + ret_vsys = vgtod(&tv_vsys, &tz_vsys);
> + if (sys_gtod(&tv_sys2, &tz_sys) != 0)
> + err(1, "syscall gettimeofday");
> +
> + if (vdso_gtod) {
> + if (ret_vdso == 0) {
> + nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
> + } else {
> + printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso);
> + nerrs++;
> + }
> + }
> +
> + if (vgtod) {
> + if (ret_vsys == 0) {
> + nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
> + } else {
> + printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys);
> + nerrs++;
> + }
> + }
> +
> + return nerrs;
> +}
> +
> +static int test_time(void) {
> + int nerrs = 0;
> +
> + printf("[RUN]\ttest time()\n");
> + long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0;
> + long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1;
> + t_sys1 = sys_time(&t2_sys1);
> + if (vdso_time)
> + t_vdso = vdso_time(&t2_vdso);
> + if (vtime)
> + t_vsys = vtime(&t2_vsys);
> + t_sys2 = sys_time(&t2_sys2);
> + if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
> + printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2);
> + nerrs++;
> + return nerrs;
> + }
> +
> + if (vdso_time) {
> + if (t_vdso < 0 || t_vdso != t2_vdso) {
> + printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso);
> + nerrs++;
> + } else if (t_vdso < t_sys1 || t_vdso > t_sys2) {
> + printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2);
> + nerrs++;
> + } else {
> + printf("[OK]\tvDSO time() is okay\n");
> + }
> + }
> +
> + if (vtime) {
> + if (t_vsys < 0 || t_vsys != t2_vsys) {
> + printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
> + nerrs++;
> + } else if (t_vsys < t_sys1 || t_vsys > t_sys2) {
> + printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2);
> + nerrs++;
> + } else {
> + printf("[OK]\tvsyscall time() is okay\n");
> + }
> + }
> +
> + return nerrs;
> +}
> +
> +static int test_getcpu(int cpu)
> +{
> + int nerrs = 0;
> + long ret_sys, ret_vdso = -1, ret_vsys = -1;
> +
> + printf("[RUN]\tgetcpu() on CPU %d\n", cpu);
> +
> + cpu_set_t cpuset;
> + CPU_ZERO(&cpuset);
> + CPU_SET(cpu, &cpuset);
> + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
> + printf("[SKIP]\tfailed to force CPU %d\n", cpu);
> + return nerrs;
> + }
> +
> + unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
> + unsigned node = 0;
> + bool have_node = false;
> + ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
> + if (vdso_getcpu)
> + ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
> + if (vgetcpu)
> + ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
> +
> + if (ret_sys == 0) {
> + if (cpu_sys != cpu) {
> + printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu);
> + nerrs++;
> + }
> +
> + have_node = true;
> + node = node_sys;
> + }
> +
> + if (vdso_getcpu) {
> + if (ret_vdso) {
> + printf("[FAIL]\tvDSO getcpu() failed\n");
> + nerrs++;
> + } else {
> + if (!have_node) {
> + have_node = true;
> + node = node_vdso;
> + }
> +
> + if (cpu_vdso != cpu) {
> + printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu);
> + nerrs++;
> + } else {
> + printf("[OK]\tvDSO reported correct CPU\n");
> + }
> +
> + if (node_vdso != node) {
> + printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node);
> + nerrs++;
> + } else {
> + printf("[OK]\tvDSO reported correct node\n");
> + }
> + }
> + }
> +
> + if (vgetcpu) {
> + if (ret_vsys) {
> + printf("[FAIL]\tvsyscall getcpu() failed\n");
> + nerrs++;
> + } else {
> + if (!have_node) {
> + have_node = true;
> + node = node_vsys;
> + }
> +
> + if (cpu_vsys != cpu) {
> + printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu);
> + nerrs++;
> + } else {
> + printf("[OK]\tvsyscall reported correct CPU\n");
> + }
> +
> + if (node_vsys != node) {
> + printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node);
> + nerrs++;
> + } else {
> + printf("[OK]\tvsyscall reported correct node\n");
> + }
> + }
> + }
> +
> + return nerrs;
> +}
> +
> +static int test_vsys_r(void)
> +{
> +#ifdef __x86_64__
> + printf("[RUN]\tChecking read access to the vsyscall page\n");
> + bool can_read;
> + if (sigsetjmp(jmpbuf, 1) == 0) {
> + *(volatile int *)0xffffffffff600000;
> + can_read = true;
> + } else {
> + can_read = false;
> + }
> +
> + if (can_read && !should_read_vsyscall) {
> + printf("[FAIL]\tWe have read access, but we shouldn't\n");
> + return 1;
> + } else if (!can_read && should_read_vsyscall) {
> + printf("[FAIL]\tWe don't have read access, but we should\n");
> + return 1;
> + } else {
> + printf("[OK]\tgot expected result\n");
> + }
> +#endif
> +
> + return 0;
> +}
> +
> +int main(int argc, char **argv)
> +{
> + int nerrs = 0;
> +
> + init_vdso();
> + nerrs += init_vsys();
> +
> + nerrs += test_gtod();
> + nerrs += test_time();
> + nerrs += test_getcpu(0);
> + nerrs += test_getcpu(1);
> +
> + sethandler(SIGSEGV, sigsegv, 0);
> + nerrs += test_vsys_r();
> +
> + return nerrs ? 1 : 0;
> +}
> --
> 2.13.6
>
On Tue, Jan 09, 2018 at 05:15:15PM +0530, Naresh Kamboju wrote:
> We do not want to change the kernel command lines dynamically on
> the test bench. It will be hard to test this case in automated setup.
> Because with a given setup we run all selftests in a single go.
Those cmdline parameters are already upstream, without that test patch.
If you wanna test all, you'd have to change the cmdline or rebuilt the
kernel three times with each of those set, alternatingly:
CONFIG_LEGACY_VSYSCALL_NATIVE
CONFIG_LEGACY_VSYSCALL_EMULATE
CONFIG_LEGACY_VSYSCALL_NONE
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Thu, Jan 4, 2018 at 10:38 PM, Andy Lutomirski <[email protected]> wrote:
> This tests that the vsyscall entries do what they're expected to do.
> It also confirms that attempts to read the vsyscall page behave as
> expected.
>
> If changes are made to the vsyscall code or its memory map handling,
> running this test in all three of vsyscall=none, vsyscall=emulate,
> and vsyscall=native are helpful.
>
> (Because it's easy, this also compares the vsyscall results to their
> vDSO equivalents.)
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
Hi Andy,
I am not cc'ed on this patch. Doesn't look like the patch is sent to everybody
get_maintainers lists for the set of files that are changed in this patch.
Please make sure to include everybody on your future kselftest patches.
thanks,
-- Shuah
On Tue, Jan 9, 2018 at 1:25 PM, Shuah Khan <[email protected]> wrote:
> On Thu, Jan 4, 2018 at 10:38 PM, Andy Lutomirski <[email protected]> wrote:
>> This tests that the vsyscall entries do what they're expected to do.
>> It also confirms that attempts to read the vsyscall page behave as
>> expected.
>>
>> If changes are made to the vsyscall code or its memory map handling,
>> running this test in all three of vsyscall=none, vsyscall=emulate,
>> and vsyscall=native are helpful.
>>
>> (Because it's easy, this also compares the vsyscall results to their
>> vDSO equivalents.)
>>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>>
>
> Hi Andy,
>
> I am not cc'ed on this patch. Doesn't look like the patch is sent to everybody
> get_maintainers lists for the set of files that are changed in this patch.
>
> Please make sure to include everybody on your future kselftest patches.
>
Ok.