2024-03-21 07:52:53

by lee bruce

[permalink] [raw]
Subject: BUG: unable to handle kernel paging request in fuse_copy_do

Hello I found a bug titled "BUG: unable to handle kernel paging
request in fuse_copy_do” with modified syzkaller, and maybe it is
related to fs/fuse.
I also confirmed in the latest upstream.

If you fix this issue, please add the following tag to the commit:
Reported-by: xingwei lee <[email protected]>
Reported-by: yue sun <[email protected]>

kernel: upstream 23956900041d968f9ad0f30db6dede4daccd7aa9
kernel config: https://syzkaller.appspot.com/text?tag=KernelConfig&x=9f47e8dfa53b0b11
with KASAN enabled
compiler: gcc (Debian 12.2.0-14) 12.2.0

BUG: unable to handle kernel paging request in fuse_copy_do
UDPLite: UDP-Lite is deprecated and scheduled to be removed in 2025,
please contact the netdev mailing list
BUG: unable to handle page fault for address: ffff88802c29c000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 13001067 P4D 13001067 PUD 13002067 PMD 24c8d063 PTE 800fffffd3d63060
Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
CPU: 1 PID: 8221 Comm: 1e9 Not tainted 6.8.0-05202-g9187210eee7d-dirty #21
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.16.2-1.fc38 04/01/2014
RIP: 0010:memcpy+0xc/0x20 arch/x86/lib/memcpy_64.S:38
Code: 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90 90
90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 48 89 f80
RSP: 0018:ffffc9001065f9c8 EFLAGS: 00010246
RAX: ffffc9001065fb10 RBX: ffffc9001065fc78 RCX: 0000000000000010
RDX: 0000000000000010 RSI: ffff88802c29c000 RDI: ffffc9001065fb10
RBP: 0000000000000010 R08: ffff88802c29c000 R09: 0000000000000001
R10: ffffffff8ea82ed7 R11: ffffc9001065fd98 R12: ffffc9001065fac0
R13: 0000000000000010 R14: ffffc9001065faf0 R15: ffffc9001065fcbc
FS: 000000000f82d480(0000) GS:ffff88823bc00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffff88802c29c000 CR3: 000000002dd7c000 CR4: 0000000000750ef0
PKRU: 55555554
Call Trace:
<TASK>
fuse_copy_do+0x152/0x340 fs/fuse/dev.c:758
fuse_copy_one fs/fuse/dev.c:1007 [inline]
fuse_dev_do_write+0x1df/0x26a0 fs/fuse/dev.c:1863
fuse_dev_write+0x129/0x1b0 fs/fuse/dev.c:1960
call_write_iter include/linux/fs.h:2108 [inline]
new_sync_write fs/read_write.c:497 [inline]
vfs_write+0x62e/0x10a0 fs/read_write.c:590
ksys_write+0xf6/0x1d0 fs/read_write.c:643
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x7c/0x1d0 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x6c/0x74

=* repro.c =*
#define _GNU_SOURCE

#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_memfd_secret
#define __NR_memfd_secret 447
#endif

static __thread int clone_ongoing;
static __thread int skip_segv;
static __thread jmp_buf segv_env;

static void segv_handler(int sig, siginfo_t* info, void* ctx) {
if (__atomic_load_n(&clone_ongoing, __ATOMIC_RELAXED) != 0) {
exit(sig);
}
uintptr_t addr = (uintptr_t)info->si_addr;
const uintptr_t prog_start = 1 << 20;
const uintptr_t prog_end = 100 << 20;
int skip = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED) != 0;
int valid = addr < prog_start || addr > prog_end;
if (skip && valid) {
_longjmp(segv_env, 1);
}
exit(sig);
}

static void install_segv_handler(void) {
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_IGN;
syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = segv_handler;
sa.sa_flags = SA_NODEFER | SA_SIGINFO;
sigaction(SIGSEGV, &sa, NULL);
sigaction(SIGBUS, &sa, NULL);
}

#define NONFAILING(...) \
({ \
int ok = 1; \
__atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST); \
if (_setjmp(segv_env) == 0) { \
__VA_ARGS__; \
} else \
ok = 0; \
__atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST); \
ok; \
})

static void sleep_ms(uint64_t ms) {
usleep(ms * 1000);
}

static uint64_t current_time_ms(void) {
struct timespec ts;
if (clock_gettime(CLOCK_MONOTONIC, &ts))
exit(1);
return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static bool write_file(const char* file, const char* what, ...) {
char buf[1024];
va_list args;
va_start(args, what);
vsnprintf(buf, sizeof(buf), what, args);
va_end(args);
buf[sizeof(buf) - 1] = 0;
int len = strlen(buf);
int fd = open(file, O_WRONLY | O_CLOEXEC);
if (fd == -1)
return false;
if (write(fd, buf, len) != len) {
int err = errno;
close(fd);
errno = err;
return false;
}
close(fd);
return true;
}

static void kill_and_wait(int pid, int* status) {
kill(-pid, SIGKILL);
kill(pid, SIGKILL);
for (int i = 0; i < 100; i++) {
if (waitpid(-1, status, WNOHANG | __WALL) == pid)
return;
usleep(1000);
}
DIR* dir = opendir("/sys/fs/fuse/connections");
if (dir) {
for (;;) {
struct dirent* ent = readdir(dir);
if (!ent)
break;
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
continue;
char abort[300];
snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort",
ent->d_name);
int fd = open(abort, O_WRONLY);
if (fd == -1) {
continue;
}
if (write(fd, abort, 1) < 0) {
}
close(fd);
}
closedir(dir);
} else {
}
while (waitpid(-1, status, __WALL) != pid) {
}
}

static void setup_test() {
prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
setpgrp();
write_file("/proc/self/oom_score_adj", "1000");
}

static void execute_one(void);

#define WAIT_FLAGS __WALL

static void loop(void) {
int iter = 0;
for (;; iter++) {
int pid = fork();
if (pid < 0)
exit(1);
if (pid == 0) {
setup_test();
execute_one();
exit(0);
}
int status = 0;
uint64_t start = current_time_ms();
for (;;) {
if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
break;
sleep_ms(1);
if (current_time_ms() - start < 5000)
continue;
kill_and_wait(pid, &status);
break;
}
}
}

uint64_t r[3] = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff};

void execute_one(void) {
intptr_t res = 0;
NONFAILING(memcpy((void*)0x20002040, "./file0\000", 8));
syscall(__NR_mkdirat, /*fd=*/0xffffff9c, /*path=*/0x20002040ul, /*mode=*/0ul);
NONFAILING(memcpy((void*)0x20002080, "/dev/fuse\000", 10));
res = syscall(__NR_openat, /*fd=*/0xffffffffffffff9cul, /*file=*/0x20002080ul,
/*flags=*/2ul, /*mode=*/0ul);
if (res != -1)
r[0] = res;
NONFAILING(memcpy((void*)0x200020c0, "./file0\000", 8));
NONFAILING(memcpy((void*)0x20002100, "fuse\000", 5));
NONFAILING(memcpy((void*)0x20002140, "fd", 2));
NONFAILING(*(uint8_t*)0x20002142 = 0x3d);
NONFAILING(sprintf((char*)0x20002143, "0x%016llx", (long long)r[0]));
NONFAILING(*(uint8_t*)0x20002155 = 0x2c);
NONFAILING(memcpy((void*)0x20002156, "rootmode", 8));
NONFAILING(*(uint8_t*)0x2000215e = 0x3d);
NONFAILING(sprintf((char*)0x2000215f, "%023llo", (long long)0x4000));
NONFAILING(*(uint8_t*)0x20002176 = 0x2c);
NONFAILING(memcpy((void*)0x20002177, "user_id", 7));
NONFAILING(*(uint8_t*)0x2000217e = 0x3d);
NONFAILING(sprintf((char*)0x2000217f, "%020llu", (long long)0));
NONFAILING(*(uint8_t*)0x20002193 = 0x2c);
NONFAILING(memcpy((void*)0x20002194, "group_id", 8));
NONFAILING(*(uint8_t*)0x2000219c = 0x3d);
NONFAILING(sprintf((char*)0x2000219d, "%020llu", (long long)0));
NONFAILING(*(uint8_t*)0x200021b1 = 0x2c);
NONFAILING(*(uint8_t*)0x200021b2 = 0);
syscall(__NR_mount, /*src=*/0ul, /*dst=*/0x200020c0ul, /*type=*/0x20002100ul,
/*flags=*/0ul, /*opts=*/0x20002140ul);
res = syscall(__NR_memfd_secret, /*flags=*/0ul);
if (res != -1)
r[1] = res;
syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0xb36000ul,
/*prot=PROT_GROWSUP|PROT_READ*/ 0x2000001ul,
/*flags=MAP_STACK|MAP_POPULATE|MAP_FIXED|MAP_SHARED*/ 0x28011ul,
/*fd=*/r[1], /*offset=*/0ul);
syscall(__NR_ftruncate, /*fd=*/r[1], /*len=*/7ul);
res = syscall(__NR_socket, /*domain=*/2ul, /*type=*/2ul, /*proto=*/0x88);
if (res != -1)
r[2] = res;
NONFAILING(*(uint32_t*)0x20000280 = 0);
syscall(__NR_getsockopt, /*fd=*/r[2], /*level=*/1, /*optname=*/0x11,
/*optval=*/0ul, /*optlen=*/0x20000280ul);
NONFAILING(*(uint32_t*)0x20000000 = 0x50);
NONFAILING(*(uint32_t*)0x20000004 = 0);
NONFAILING(*(uint64_t*)0x20000008 = 0);
NONFAILING(*(uint32_t*)0x20000010 = 7);
NONFAILING(*(uint32_t*)0x20000014 = 0x27);
NONFAILING(*(uint32_t*)0x20000018 = 0);
NONFAILING(*(uint32_t*)0x2000001c = 0);
NONFAILING(*(uint16_t*)0x20000020 = 0);
NONFAILING(*(uint16_t*)0x20000022 = 0);
NONFAILING(*(uint32_t*)0x20000024 = 0);
NONFAILING(*(uint32_t*)0x20000028 = 0);
NONFAILING(*(uint16_t*)0x2000002c = 0);
NONFAILING(*(uint16_t*)0x2000002e = 0);
NONFAILING(memset((void*)0x20000030, 0, 32));
syscall(__NR_write, /*fd=*/r[0], /*arg=*/0x20000000ul, /*len=*/0x50ul);
}
int main(void) {
syscall(__NR_mmap, /*addr=*/0x1ffff000ul, /*len=*/0x1000ul, /*prot=*/0ul,
/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
/*offset=*/0ul);
syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x1000000ul,
/*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul,
/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
/*offset=*/0ul);
syscall(__NR_mmap, /*addr=*/0x21000000ul, /*len=*/0x1000ul, /*prot=*/0ul,
/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
/*offset=*/0ul);
install_segv_handler();
loop();
return 0;
}

=* repro.txt =*
mkdirat(0xffffffffffffff9c, &(0x7f0000002040)='./file0\x00', 0x0)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x2, 0x0)
mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100),
0x0, &(0x7f0000002140)={{'fd', 0x3d, r0}, 0x2c, {'rootmode', 0x3d,
0x4000}})
r1 = memfd_secret(0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x2000001, 0x28011, r1, 0x0)
ftruncate(r1, 0x7)
r2 = socket$inet_udplite(0x2, 0x2, 0x88)
getsockopt$sock_cred(r2, 0x1, 0x11, 0x0, &(0x7f0000000280))
write$FUSE_INIT(r0, &(0x7f0000000000)={0x50}, 0x50)


see aslo https://gist.github.com/xrivendell7/961be96ae091c9671bb56efea902cec4.

I hope it helps.
best regards.
xingwei Lee


2024-03-22 13:50:29

by Miklos Szeredi

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

[MM list + secretmem author CC-d]

On Thu, 21 Mar 2024 at 08:52, xingwei lee <[email protected]> wrote:
>
> Hello I found a bug titled "BUG: unable to handle kernel paging
> request in fuse_copy_do” with modified syzkaller, and maybe it is
> related to fs/fuse.
> I also confirmed in the latest upstream.
>
> If you fix this issue, please add the following tag to the commit:
> Reported-by: xingwei lee <[email protected]>
> Reported-by: yue sun <[email protected]>

Thanks for the report. This looks like a secretmem vs get_user_pages issue.

I reduced the syz reproducer to a minimal one that isn't dependent on fuse:

=== repro.c ===
#define _GNU_SOURCE

#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/socket.h>

int main(void)
{
int fd1, fd2, fd3;
int pip[2];
struct iovec iov;
void *addr;

fd1 = syscall(__NR_memfd_secret, 0);
addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd1, 0);
ftruncate(fd1, 7);
fd2 = socket(AF_INET, SOCK_DGRAM, 0);
getsockopt(fd2, 0, 0, NULL, addr);

pipe(pip);
iov.iov_base = addr;
iov.iov_len = 0x50;
vmsplice(pip[1], &iov, 1, 0);

fd3 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
splice(pip[0], NULL, fd3, NULL, 0x50, 0);

return 0;
}
=======

Thanks,
Miklos

>
> kernel: upstream 23956900041d968f9ad0f30db6dede4daccd7aa9
> kernel config: https://syzkaller.appspot.com/text?tag=KernelConfig&x=9f47e8dfa53b0b11
> with KASAN enabled
> compiler: gcc (Debian 12.2.0-14) 12.2.0
>
> BUG: unable to handle kernel paging request in fuse_copy_do
> UDPLite: UDP-Lite is deprecated and scheduled to be removed in 2025,
> please contact the netdev mailing list
> BUG: unable to handle page fault for address: ffff88802c29c000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 13001067 P4D 13001067 PUD 13002067 PMD 24c8d063 PTE 800fffffd3d63060
> Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
> CPU: 1 PID: 8221 Comm: 1e9 Not tainted 6.8.0-05202-g9187210eee7d-dirty #21
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> 1.16.2-1.fc38 04/01/2014
> RIP: 0010:memcpy+0xc/0x20 arch/x86/lib/memcpy_64.S:38
> Code: 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90 90
> 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 48 89 f80
> RSP: 0018:ffffc9001065f9c8 EFLAGS: 00010246
> RAX: ffffc9001065fb10 RBX: ffffc9001065fc78 RCX: 0000000000000010
> RDX: 0000000000000010 RSI: ffff88802c29c000 RDI: ffffc9001065fb10
> RBP: 0000000000000010 R08: ffff88802c29c000 R09: 0000000000000001
> R10: ffffffff8ea82ed7 R11: ffffc9001065fd98 R12: ffffc9001065fac0
> R13: 0000000000000010 R14: ffffc9001065faf0 R15: ffffc9001065fcbc
> FS: 000000000f82d480(0000) GS:ffff88823bc00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: ffff88802c29c000 CR3: 000000002dd7c000 CR4: 0000000000750ef0
> PKRU: 55555554
> Call Trace:
> <TASK>
> fuse_copy_do+0x152/0x340 fs/fuse/dev.c:758
> fuse_copy_one fs/fuse/dev.c:1007 [inline]
> fuse_dev_do_write+0x1df/0x26a0 fs/fuse/dev.c:1863
> fuse_dev_write+0x129/0x1b0 fs/fuse/dev.c:1960
> call_write_iter include/linux/fs.h:2108 [inline]
> new_sync_write fs/read_write.c:497 [inline]
> vfs_write+0x62e/0x10a0 fs/read_write.c:590
> ksys_write+0xf6/0x1d0 fs/read_write.c:643
> do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> do_syscall_64+0x7c/0x1d0 arch/x86/entry/common.c:83
> entry_SYSCALL_64_after_hwframe+0x6c/0x74
>
> =* repro.c =*
> #define _GNU_SOURCE
>
> #include <dirent.h>
> #include <endian.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <setjmp.h>
> #include <signal.h>
> #include <stdarg.h>
> #include <stdbool.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/prctl.h>
> #include <sys/stat.h>
> #include <sys/syscall.h>
> #include <sys/types.h>
> #include <sys/wait.h>
> #include <time.h>
> #include <unistd.h>
>
> #ifndef __NR_memfd_secret
> #define __NR_memfd_secret 447
> #endif
>
> static __thread int clone_ongoing;
> static __thread int skip_segv;
> static __thread jmp_buf segv_env;
>
> static void segv_handler(int sig, siginfo_t* info, void* ctx) {
> if (__atomic_load_n(&clone_ongoing, __ATOMIC_RELAXED) != 0) {
> exit(sig);
> }
> uintptr_t addr = (uintptr_t)info->si_addr;
> const uintptr_t prog_start = 1 << 20;
> const uintptr_t prog_end = 100 << 20;
> int skip = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED) != 0;
> int valid = addr < prog_start || addr > prog_end;
> if (skip && valid) {
> _longjmp(segv_env, 1);
> }
> exit(sig);
> }
>
> static void install_segv_handler(void) {
> struct sigaction sa;
> memset(&sa, 0, sizeof(sa));
> sa.sa_handler = SIG_IGN;
> syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
> syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
> memset(&sa, 0, sizeof(sa));
> sa.sa_sigaction = segv_handler;
> sa.sa_flags = SA_NODEFER | SA_SIGINFO;
> sigaction(SIGSEGV, &sa, NULL);
> sigaction(SIGBUS, &sa, NULL);
> }
>
> #define NONFAILING(...) \
> ({ \
> int ok = 1; \
> __atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST); \
> if (_setjmp(segv_env) == 0) { \
> __VA_ARGS__; \
> } else \
> ok = 0; \
> __atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST); \
> ok; \
> })
>
> static void sleep_ms(uint64_t ms) {
> usleep(ms * 1000);
> }
>
> static uint64_t current_time_ms(void) {
> struct timespec ts;
> if (clock_gettime(CLOCK_MONOTONIC, &ts))
> exit(1);
> return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
> }
>
> static bool write_file(const char* file, const char* what, ...) {
> char buf[1024];
> va_list args;
> va_start(args, what);
> vsnprintf(buf, sizeof(buf), what, args);
> va_end(args);
> buf[sizeof(buf) - 1] = 0;
> int len = strlen(buf);
> int fd = open(file, O_WRONLY | O_CLOEXEC);
> if (fd == -1)
> return false;
> if (write(fd, buf, len) != len) {
> int err = errno;
> close(fd);
> errno = err;
> return false;
> }
> close(fd);
> return true;
> }
>
> static void kill_and_wait(int pid, int* status) {
> kill(-pid, SIGKILL);
> kill(pid, SIGKILL);
> for (int i = 0; i < 100; i++) {
> if (waitpid(-1, status, WNOHANG | __WALL) == pid)
> return;
> usleep(1000);
> }
> DIR* dir = opendir("/sys/fs/fuse/connections");
> if (dir) {
> for (;;) {
> struct dirent* ent = readdir(dir);
> if (!ent)
> break;
> if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
> continue;
> char abort[300];
> snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort",
> ent->d_name);
> int fd = open(abort, O_WRONLY);
> if (fd == -1) {
> continue;
> }
> if (write(fd, abort, 1) < 0) {
> }
> close(fd);
> }
> closedir(dir);
> } else {
> }
> while (waitpid(-1, status, __WALL) != pid) {
> }
> }
>
> static void setup_test() {
> prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
> setpgrp();
> write_file("/proc/self/oom_score_adj", "1000");
> }
>
> static void execute_one(void);
>
> #define WAIT_FLAGS __WALL
>
> static void loop(void) {
> int iter = 0;
> for (;; iter++) {
> int pid = fork();
> if (pid < 0)
> exit(1);
> if (pid == 0) {
> setup_test();
> execute_one();
> exit(0);
> }
> int status = 0;
> uint64_t start = current_time_ms();
> for (;;) {
> if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
> break;
> sleep_ms(1);
> if (current_time_ms() - start < 5000)
> continue;
> kill_and_wait(pid, &status);
> break;
> }
> }
> }
>
> uint64_t r[3] = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff};
>
> void execute_one(void) {
> intptr_t res = 0;
> NONFAILING(memcpy((void*)0x20002040, "./file0\000", 8));
> syscall(__NR_mkdirat, /*fd=*/0xffffff9c, /*path=*/0x20002040ul, /*mode=*/0ul);
> NONFAILING(memcpy((void*)0x20002080, "/dev/fuse\000", 10));
> res = syscall(__NR_openat, /*fd=*/0xffffffffffffff9cul, /*file=*/0x20002080ul,
> /*flags=*/2ul, /*mode=*/0ul);
> if (res != -1)
> r[0] = res;
> NONFAILING(memcpy((void*)0x200020c0, "./file0\000", 8));
> NONFAILING(memcpy((void*)0x20002100, "fuse\000", 5));
> NONFAILING(memcpy((void*)0x20002140, "fd", 2));
> NONFAILING(*(uint8_t*)0x20002142 = 0x3d);
> NONFAILING(sprintf((char*)0x20002143, "0x%016llx", (long long)r[0]));
> NONFAILING(*(uint8_t*)0x20002155 = 0x2c);
> NONFAILING(memcpy((void*)0x20002156, "rootmode", 8));
> NONFAILING(*(uint8_t*)0x2000215e = 0x3d);
> NONFAILING(sprintf((char*)0x2000215f, "%023llo", (long long)0x4000));
> NONFAILING(*(uint8_t*)0x20002176 = 0x2c);
> NONFAILING(memcpy((void*)0x20002177, "user_id", 7));
> NONFAILING(*(uint8_t*)0x2000217e = 0x3d);
> NONFAILING(sprintf((char*)0x2000217f, "%020llu", (long long)0));
> NONFAILING(*(uint8_t*)0x20002193 = 0x2c);
> NONFAILING(memcpy((void*)0x20002194, "group_id", 8));
> NONFAILING(*(uint8_t*)0x2000219c = 0x3d);
> NONFAILING(sprintf((char*)0x2000219d, "%020llu", (long long)0));
> NONFAILING(*(uint8_t*)0x200021b1 = 0x2c);
> NONFAILING(*(uint8_t*)0x200021b2 = 0);
> syscall(__NR_mount, /*src=*/0ul, /*dst=*/0x200020c0ul, /*type=*/0x20002100ul,
> /*flags=*/0ul, /*opts=*/0x20002140ul);
> res = syscall(__NR_memfd_secret, /*flags=*/0ul);
> if (res != -1)
> r[1] = res;
> syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0xb36000ul,
> /*prot=PROT_GROWSUP|PROT_READ*/ 0x2000001ul,
> /*flags=MAP_STACK|MAP_POPULATE|MAP_FIXED|MAP_SHARED*/ 0x28011ul,
> /*fd=*/r[1], /*offset=*/0ul);
> syscall(__NR_ftruncate, /*fd=*/r[1], /*len=*/7ul);
> res = syscall(__NR_socket, /*domain=*/2ul, /*type=*/2ul, /*proto=*/0x88);
> if (res != -1)
> r[2] = res;
> NONFAILING(*(uint32_t*)0x20000280 = 0);
> syscall(__NR_getsockopt, /*fd=*/r[2], /*level=*/1, /*optname=*/0x11,
> /*optval=*/0ul, /*optlen=*/0x20000280ul);
> NONFAILING(*(uint32_t*)0x20000000 = 0x50);
> NONFAILING(*(uint32_t*)0x20000004 = 0);
> NONFAILING(*(uint64_t*)0x20000008 = 0);
> NONFAILING(*(uint32_t*)0x20000010 = 7);
> NONFAILING(*(uint32_t*)0x20000014 = 0x27);
> NONFAILING(*(uint32_t*)0x20000018 = 0);
> NONFAILING(*(uint32_t*)0x2000001c = 0);
> NONFAILING(*(uint16_t*)0x20000020 = 0);
> NONFAILING(*(uint16_t*)0x20000022 = 0);
> NONFAILING(*(uint32_t*)0x20000024 = 0);
> NONFAILING(*(uint32_t*)0x20000028 = 0);
> NONFAILING(*(uint16_t*)0x2000002c = 0);
> NONFAILING(*(uint16_t*)0x2000002e = 0);
> NONFAILING(memset((void*)0x20000030, 0, 32));
> syscall(__NR_write, /*fd=*/r[0], /*arg=*/0x20000000ul, /*len=*/0x50ul);
> }
> int main(void) {
> syscall(__NR_mmap, /*addr=*/0x1ffff000ul, /*len=*/0x1000ul, /*prot=*/0ul,
> /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
> /*offset=*/0ul);
> syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x1000000ul,
> /*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul,
> /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
> /*offset=*/0ul);
> syscall(__NR_mmap, /*addr=*/0x21000000ul, /*len=*/0x1000ul, /*prot=*/0ul,
> /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
> /*offset=*/0ul);
> install_segv_handler();
> loop();
> return 0;
> }
>
> =* repro.txt =*
> mkdirat(0xffffffffffffff9c, &(0x7f0000002040)='./file0\x00', 0x0)
> r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x2, 0x0)
> mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100),
> 0x0, &(0x7f0000002140)={{'fd', 0x3d, r0}, 0x2c, {'rootmode', 0x3d,
> 0x4000}})
> r1 = memfd_secret(0x0)
> mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x2000001, 0x28011, r1, 0x0)
> ftruncate(r1, 0x7)
> r2 = socket$inet_udplite(0x2, 0x2, 0x88)
> getsockopt$sock_cred(r2, 0x1, 0x11, 0x0, &(0x7f0000000280))
> write$FUSE_INIT(r0, &(0x7f0000000000)={0x50}, 0x50)
>
>
> see aslo https://gist.github.com/xrivendell7/961be96ae091c9671bb56efea902cec4.
>
> I hope it helps.
> best regards.
> xingwei Lee

2024-03-22 15:41:46

by David Hildenbrand

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On 22.03.24 14:50, Miklos Szeredi wrote:
> [MM list + secretmem author CC-d]
>
> On Thu, 21 Mar 2024 at 08:52, xingwei lee <[email protected]> wrote:
>>
>> Hello I found a bug titled "BUG: unable to handle kernel paging
>> request in fuse_copy_do” with modified syzkaller, and maybe it is
>> related to fs/fuse.
>> I also confirmed in the latest upstream.
>>
>> If you fix this issue, please add the following tag to the commit:
>> Reported-by: xingwei lee <[email protected]>
>> Reported-by: yue sun <[email protected]>
>
> Thanks for the report. This looks like a secretmem vs get_user_pages issue.
>
> I reduced the syz reproducer to a minimal one that isn't dependent on fuse:
>
> === repro.c ===
> #define _GNU_SOURCE
>
> #include <fcntl.h>
> #include <unistd.h>
> #include <sys/mman.h>
> #include <sys/syscall.h>
> #include <sys/socket.h>
>
> int main(void)
> {
> int fd1, fd2, fd3;
> int pip[2];
> struct iovec iov;
> void *addr;
>
> fd1 = syscall(__NR_memfd_secret, 0);
> addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd1, 0);
> ftruncate(fd1, 7);
> fd2 = socket(AF_INET, SOCK_DGRAM, 0);
> getsockopt(fd2, 0, 0, NULL, addr);
>
> pipe(pip);
> iov.iov_base = addr;
> iov.iov_len = 0x50;
> vmsplice(pip[1], &iov, 1, 0);

pip[1] should be the write end. So it will be used as the source.

I assume we go the ITER_SOURCE path in vmsplice, and call
vmsplice_to_pipe(). Then we call iter_to_pipe().

I would expect iov_iter_get_pages2() -> get_user_pages_fast() to fail on
secretmem pages?

But at least the vmsplice() just seems to work. Which is weird, because
GUP-fast should not apply (page not faulted in?) and check_vma_flags()
bails out early on vma_is_secretmem(vma).

So something is not quite right.

>
> fd3 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
> splice(pip[0], NULL, fd3, NULL, 0x50, 0);
>
> return 0;
> }




--
Cheers,

David / dhildenb


2024-03-22 19:47:05

by Miklos Szeredi

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:

> But at least the vmsplice() just seems to work. Which is weird, because
> GUP-fast should not apply (page not faulted in?)

But it is faulted in, and that indeed seems to be the root cause.
Improved repro:

#define _GNU_SOURCE

#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/syscall.h>

int main(void)
{
int fd1, fd2;
int pip[2];
struct iovec iov;
char *addr;
int ret;

fd1 = syscall(__NR_memfd_secret, 0);
addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
ftruncate(fd1, 7);
addr[0] = 1; /* fault in page */
pipe(pip);
iov.iov_base = addr;
iov.iov_len = 0x50;
ret = vmsplice(pip[1], &iov, 1, 0);
if (ret == -1 && errno == EFAULT) {
printf("Success\n");
return 0;
}

fd2 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
splice(pip[0], NULL, fd2, NULL, 0x50, 0);

return 0;
}

2024-03-22 21:08:31

by David Hildenbrand

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On 22.03.24 20:46, Miklos Szeredi wrote:
> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:
>
>> But at least the vmsplice() just seems to work. Which is weird, because
>> GUP-fast should not apply (page not faulted in?)
>
> But it is faulted in, and that indeed seems to be the root cause.

secretmem mmap() won't populate the page tables. So it's not faulted in yet.

When we GUP via vmsplice, GUP-fast should not find it in the page tables
and fallback to slow GUP.

There, we seem to pass check_vma_flags(), trigger faultin_page() to
fault it in, and then find it via follow_page_mask().

.. and I wonder how we manage to skip check_vma_flags(), or otherwise
managed to GUP it.

vmsplice() should, in theory, never succeed here.

Weird :/

> Improved repro:
>
> #define _GNU_SOURCE
>
> #include <fcntl.h>
> #include <unistd.h>
> #include <stdio.h>
> #include <errno.h>
> #include <sys/mman.h>
> #include <sys/syscall.h>
>
> int main(void)
> {
> int fd1, fd2;
> int pip[2];
> struct iovec iov;
> char *addr;
> int ret;
>
> fd1 = syscall(__NR_memfd_secret, 0);
> addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
> ftruncate(fd1, 7);
> addr[0] = 1; /* fault in page */
> pipe(pip);
> iov.iov_base = addr;
> iov.iov_len = 0x50;
> ret = vmsplice(pip[1], &iov, 1, 0);
> if (ret == -1 && errno == EFAULT) {
> printf("Success\n");
> return 0;
> }
>
> fd2 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
> splice(pip[0], NULL, fd2, NULL, 0x50, 0);
>
> return 0;
> }
>

--
Cheers,

David / dhildenb


2024-03-22 21:14:27

by Miklos Szeredi

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <[email protected]> wrote:
>
> On 22.03.24 20:46, Miklos Szeredi wrote:
> > On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:
> >
> >> But at least the vmsplice() just seems to work. Which is weird, because
> >> GUP-fast should not apply (page not faulted in?)
> >
> > But it is faulted in, and that indeed seems to be the root cause.
>
> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>
> When we GUP via vmsplice, GUP-fast should not find it in the page tables
> and fallback to slow GUP.
>
> There, we seem to pass check_vma_flags(), trigger faultin_page() to
> fault it in, and then find it via follow_page_mask().
>
> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
> managed to GUP it.
>
> vmsplice() should, in theory, never succeed here.
>
> Weird :/
>
> > Improved repro:
> >
> > #define _GNU_SOURCE
> >
> > #include <fcntl.h>
> > #include <unistd.h>
> > #include <stdio.h>
> > #include <errno.h>
> > #include <sys/mman.h>
> > #include <sys/syscall.h>
> >
> > int main(void)
> > {
> > int fd1, fd2;
> > int pip[2];
> > struct iovec iov;
> > char *addr;
> > int ret;
> >
> > fd1 = syscall(__NR_memfd_secret, 0);
> > addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
> > ftruncate(fd1, 7);
> > addr[0] = 1; /* fault in page */

Here the page is faulted in and GUP-fast will find it. It's not in
the kernel page table, but it is in the user page table, which is what
matter for GUP.

Thanks,
Miklos

2024-03-22 21:19:03

by David Hildenbrand

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On 22.03.24 22:13, Miklos Szeredi wrote:
> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <[email protected]> wrote:
>>
>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:
>>>
>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>> GUP-fast should not apply (page not faulted in?)
>>>
>>> But it is faulted in, and that indeed seems to be the root cause.
>>
>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>
>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>> and fallback to slow GUP.
>>
>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>> fault it in, and then find it via follow_page_mask().
>>
>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>> managed to GUP it.
>>
>> vmsplice() should, in theory, never succeed here.
>>
>> Weird :/
>>
>>> Improved repro:
>>>
>>> #define _GNU_SOURCE
>>>
>>> #include <fcntl.h>
>>> #include <unistd.h>
>>> #include <stdio.h>
>>> #include <errno.h>
>>> #include <sys/mman.h>
>>> #include <sys/syscall.h>
>>>
>>> int main(void)
>>> {
>>> int fd1, fd2;
>>> int pip[2];
>>> struct iovec iov;
>>> char *addr;
>>> int ret;
>>>
>>> fd1 = syscall(__NR_memfd_secret, 0);
>>> addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>> ftruncate(fd1, 7);
>>> addr[0] = 1; /* fault in page */
>
> Here the page is faulted in and GUP-fast will find it. It's not in
> the kernel page table, but it is in the user page table, which is what
> matter for GUP.

Trust me, I know the GUP code very well :P

gup_pte_range -- GUP fast -- contains:

if (unlikely(folio_is_secretmem(folio))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}

So we "should" be rejecting any secretmem folios and fallback to GUP slow.


.. we don't check the same in gup_huge_pmd(), but we shouldn't ever see
THP in secretmem code.

--
Cheers,

David / dhildenb


2024-03-22 21:34:19

by David Hildenbrand

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On 22.03.24 22:18, David Hildenbrand wrote:
> On 22.03.24 22:13, Miklos Szeredi wrote:
>> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <[email protected]> wrote:
>>>
>>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:
>>>>
>>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>>> GUP-fast should not apply (page not faulted in?)
>>>>
>>>> But it is faulted in, and that indeed seems to be the root cause.
>>>
>>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>>
>>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>>> and fallback to slow GUP.
>>>
>>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>>> fault it in, and then find it via follow_page_mask().
>>>
>>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>>> managed to GUP it.
>>>
>>> vmsplice() should, in theory, never succeed here.
>>>
>>> Weird :/
>>>
>>>> Improved repro:
>>>>
>>>> #define _GNU_SOURCE
>>>>
>>>> #include <fcntl.h>
>>>> #include <unistd.h>
>>>> #include <stdio.h>
>>>> #include <errno.h>
>>>> #include <sys/mman.h>
>>>> #include <sys/syscall.h>
>>>>
>>>> int main(void)
>>>> {
>>>> int fd1, fd2;
>>>> int pip[2];
>>>> struct iovec iov;
>>>> char *addr;
>>>> int ret;
>>>>
>>>> fd1 = syscall(__NR_memfd_secret, 0);
>>>> addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>>> ftruncate(fd1, 7);
>>>> addr[0] = 1; /* fault in page */
>>
>> Here the page is faulted in and GUP-fast will find it. It's not in
>> the kernel page table, but it is in the user page table, which is what
>> matter for GUP.
>
> Trust me, I know the GUP code very well :P
>
> gup_pte_range -- GUP fast -- contains:
>
> if (unlikely(folio_is_secretmem(folio))) {
> gup_put_folio(folio, 1, flags);
> goto pte_unmap;
> }
>
> So we "should" be rejecting any secretmem folios and fallback to GUP slow.
>
>
> ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
> THP in secretmem code.
>

Ehm:

[ 29.441405] Secretmem fault: PFN: 1096177
[ 29.442092] GUP-fast: PFN: 1096177


.. is folio_is_secretmem() broken?

.. is it something "obvious" like:

diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index 35f3a4a8ceb1e..6996f1f53f147 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
* We know that secretmem pages are not compound and LRU so we can
* save a couple of cycles here.
*/
- if (folio_test_large(folio) || !folio_test_lru(folio))
+ if (folio_test_large(folio) || folio_test_lru(folio))
return false;

mapping = (struct address_space *)


--
Cheers,

David / dhildenb


2024-03-22 21:38:05

by David Hildenbrand

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On 22.03.24 22:33, David Hildenbrand wrote:
> On 22.03.24 22:18, David Hildenbrand wrote:
>> On 22.03.24 22:13, Miklos Szeredi wrote:
>>> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <[email protected]> wrote:
>>>>
>>>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:
>>>>>
>>>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>>>> GUP-fast should not apply (page not faulted in?)
>>>>>
>>>>> But it is faulted in, and that indeed seems to be the root cause.
>>>>
>>>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>>>
>>>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>>>> and fallback to slow GUP.
>>>>
>>>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>>>> fault it in, and then find it via follow_page_mask().
>>>>
>>>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>>>> managed to GUP it.
>>>>
>>>> vmsplice() should, in theory, never succeed here.
>>>>
>>>> Weird :/
>>>>
>>>>> Improved repro:
>>>>>
>>>>> #define _GNU_SOURCE
>>>>>
>>>>> #include <fcntl.h>
>>>>> #include <unistd.h>
>>>>> #include <stdio.h>
>>>>> #include <errno.h>
>>>>> #include <sys/mman.h>
>>>>> #include <sys/syscall.h>
>>>>>
>>>>> int main(void)
>>>>> {
>>>>> int fd1, fd2;
>>>>> int pip[2];
>>>>> struct iovec iov;
>>>>> char *addr;
>>>>> int ret;
>>>>>
>>>>> fd1 = syscall(__NR_memfd_secret, 0);
>>>>> addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>>>> ftruncate(fd1, 7);
>>>>> addr[0] = 1; /* fault in page */
>>>
>>> Here the page is faulted in and GUP-fast will find it. It's not in
>>> the kernel page table, but it is in the user page table, which is what
>>> matter for GUP.
>>
>> Trust me, I know the GUP code very well :P
>>
>> gup_pte_range -- GUP fast -- contains:
>>
>> if (unlikely(folio_is_secretmem(folio))) {
>> gup_put_folio(folio, 1, flags);
>> goto pte_unmap;
>> }
>>
>> So we "should" be rejecting any secretmem folios and fallback to GUP slow.
>>
>>
>> ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
>> THP in secretmem code.
>>
>
> Ehm:
>
> [ 29.441405] Secretmem fault: PFN: 1096177
> [ 29.442092] GUP-fast: PFN: 1096177
>
>
> ... is folio_is_secretmem() broken?
>
> ... is it something "obvious" like:
>
> diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
> index 35f3a4a8ceb1e..6996f1f53f147 100644
> --- a/include/linux/secretmem.h
> +++ b/include/linux/secretmem.h
> @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
> * We know that secretmem pages are not compound and LRU so we can
> * save a couple of cycles here.
> */
> - if (folio_test_large(folio) || !folio_test_lru(folio))
> + if (folio_test_large(folio) || folio_test_lru(folio))
> return false;
>
> mapping = (struct address_space *)

.. yes, that does the trick!

--
Cheers,

David / dhildenb


2024-03-22 21:56:28

by David Hildenbrand

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On 22.03.24 22:37, David Hildenbrand wrote:
> On 22.03.24 22:33, David Hildenbrand wrote:
>> On 22.03.24 22:18, David Hildenbrand wrote:
>>> On 22.03.24 22:13, Miklos Szeredi wrote:
>>>> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <[email protected]> wrote:
>>>>>
>>>>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>>>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:
>>>>>>
>>>>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>>>>> GUP-fast should not apply (page not faulted in?)
>>>>>>
>>>>>> But it is faulted in, and that indeed seems to be the root cause.
>>>>>
>>>>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>>>>
>>>>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>>>>> and fallback to slow GUP.
>>>>>
>>>>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>>>>> fault it in, and then find it via follow_page_mask().
>>>>>
>>>>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>>>>> managed to GUP it.
>>>>>
>>>>> vmsplice() should, in theory, never succeed here.
>>>>>
>>>>> Weird :/
>>>>>
>>>>>> Improved repro:
>>>>>>
>>>>>> #define _GNU_SOURCE
>>>>>>
>>>>>> #include <fcntl.h>
>>>>>> #include <unistd.h>
>>>>>> #include <stdio.h>
>>>>>> #include <errno.h>
>>>>>> #include <sys/mman.h>
>>>>>> #include <sys/syscall.h>
>>>>>>
>>>>>> int main(void)
>>>>>> {
>>>>>> int fd1, fd2;
>>>>>> int pip[2];
>>>>>> struct iovec iov;
>>>>>> char *addr;
>>>>>> int ret;
>>>>>>
>>>>>> fd1 = syscall(__NR_memfd_secret, 0);
>>>>>> addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>>>>> ftruncate(fd1, 7);
>>>>>> addr[0] = 1; /* fault in page */
>>>>
>>>> Here the page is faulted in and GUP-fast will find it. It's not in
>>>> the kernel page table, but it is in the user page table, which is what
>>>> matter for GUP.
>>>
>>> Trust me, I know the GUP code very well :P
>>>
>>> gup_pte_range -- GUP fast -- contains:
>>>
>>> if (unlikely(folio_is_secretmem(folio))) {
>>> gup_put_folio(folio, 1, flags);
>>> goto pte_unmap;
>>> }
>>>
>>> So we "should" be rejecting any secretmem folios and fallback to GUP slow.
>>>
>>>
>>> ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
>>> THP in secretmem code.
>>>
>>
>> Ehm:
>>
>> [ 29.441405] Secretmem fault: PFN: 1096177
>> [ 29.442092] GUP-fast: PFN: 1096177
>>
>>
>> ... is folio_is_secretmem() broken?
>>
>> ... is it something "obvious" like:
>>
>> diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
>> index 35f3a4a8ceb1e..6996f1f53f147 100644
>> --- a/include/linux/secretmem.h
>> +++ b/include/linux/secretmem.h
>> @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
>> * We know that secretmem pages are not compound and LRU so we can
>> * save a couple of cycles here.
>> */
>> - if (folio_test_large(folio) || !folio_test_lru(folio))
>> + if (folio_test_large(folio) || folio_test_lru(folio))
>> return false;
>>
>> mapping = (struct address_space *)
>
> ... yes, that does the trick!
>

Proper patch (I might send out again on Monday "officially"). There are
other improvements we want to do to folio_is_secretmem() in the light of
folio_fast_pin_allowed(), that I wanted to do a while ago. I might send
a patch for that as well now that I'm at it.


From 85558a46d9f249f26bd77dd3b18d14f248464845 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <[email protected]>
Date: Fri, 22 Mar 2024 22:45:36 +0100
Subject: [PATCH] mm/secretmem: fix GUP-fast succeeding on secretmem folios

folio_is_secretmem() states that secretmem folios cannot be LRU folios:
so we may only exit early if we find an LRU folio. Yet, we exit early if
we find a folio that is not a secretmem folio.

Consequently, folio_is_secretmem() fails to detect secretmem folios and,
therefore, we can succeed in grabbing a secretmem folio during GUP-fast,
crashing the kernel when we later try reading/writing to the folio, because
the folio has been unmapped from the directmap.

Reported-by: xingwei lee <[email protected]>
Reported-by: yue sun <[email protected]>
Debugged-by: Miklos Szeredi <[email protected]>
Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
Cc: <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]>
---
include/linux/secretmem.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index 35f3a4a8ceb1..6996f1f53f14 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
* We know that secretmem pages are not compound and LRU so we can
* save a couple of cycles here.
*/
- if (folio_test_large(folio) || !folio_test_lru(folio))
+ if (folio_test_large(folio) || folio_test_lru(folio))
return false;

mapping = (struct address_space *)
--
2.43.2


--
Cheers,

David / dhildenb


2024-03-24 10:31:09

by Mike Rapoport

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On Fri, Mar 22, 2024 at 10:56:08PM +0100, David Hildenbrand wrote:
> On 22.03.24 22:37, David Hildenbrand wrote:
> > On 22.03.24 22:33, David Hildenbrand wrote:
> > > On 22.03.24 22:18, David Hildenbrand wrote:
> > > > On 22.03.24 22:13, Miklos Szeredi wrote:
> > > > > On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <[email protected]> wrote:
> > > > > >
> > > > > > On 22.03.24 20:46, Miklos Szeredi wrote:
> > > > > > > On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <[email protected]> wrote:
> > > > > > >
> > > > > > > > But at least the vmsplice() just seems to work. Which is weird, because
> > > > > > > > GUP-fast should not apply (page not faulted in?)
> > > > > > >
> > > > > > > But it is faulted in, and that indeed seems to be the root cause.
> > > > > >
> > > > > > secretmem mmap() won't populate the page tables. So it's not faulted in yet.
> > > > > >
> > > > > > When we GUP via vmsplice, GUP-fast should not find it in the page tables
> > > > > > and fallback to slow GUP.
> > > > > >
> > > > > > There, we seem to pass check_vma_flags(), trigger faultin_page() to
> > > > > > fault it in, and then find it via follow_page_mask().
> > > > > >
> > > > > > ... and I wonder how we manage to skip check_vma_flags(), or otherwise
> > > > > > managed to GUP it.
> > > > > >
> > > > > > vmsplice() should, in theory, never succeed here.
> > > > > >
> > > > > > Weird :/
> > > > > >
> > > > > > > Improved repro:
> > > > > > >
> > > > > > > #define _GNU_SOURCE
> > > > > > >
> > > > > > > #include <fcntl.h>
> > > > > > > #include <unistd.h>
> > > > > > > #include <stdio.h>
> > > > > > > #include <errno.h>
> > > > > > > #include <sys/mman.h>
> > > > > > > #include <sys/syscall.h>
> > > > > > >
> > > > > > > int main(void)
> > > > > > > {
> > > > > > > int fd1, fd2;
> > > > > > > int pip[2];
> > > > > > > struct iovec iov;
> > > > > > > char *addr;
> > > > > > > int ret;
> > > > > > >
> > > > > > > fd1 = syscall(__NR_memfd_secret, 0);
> > > > > > > addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
> > > > > > > ftruncate(fd1, 7);
> > > > > > > addr[0] = 1; /* fault in page */
> > > > >
> > > > > Here the page is faulted in and GUP-fast will find it. It's not in
> > > > > the kernel page table, but it is in the user page table, which is what
> > > > > matter for GUP.
> > > >
> > > > Trust me, I know the GUP code very well :P
> > > >
> > > > gup_pte_range -- GUP fast -- contains:
> > > >
> > > > if (unlikely(folio_is_secretmem(folio))) {
> > > > gup_put_folio(folio, 1, flags);
> > > > goto pte_unmap;
> > > > }
> > > >
> > > > So we "should" be rejecting any secretmem folios and fallback to GUP slow.
> > > >
> > > >
> > > > ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
> > > > THP in secretmem code.
> > > >
> > >
> > > Ehm:
> > >
> > > [ 29.441405] Secretmem fault: PFN: 1096177
> > > [ 29.442092] GUP-fast: PFN: 1096177
> > >
> > >
> > > ... is folio_is_secretmem() broken?
> > >
> > > ... is it something "obvious" like:
> > >
> > > diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
> > > index 35f3a4a8ceb1e..6996f1f53f147 100644
> > > --- a/include/linux/secretmem.h
> > > +++ b/include/linux/secretmem.h
> > > @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
> > > * We know that secretmem pages are not compound and LRU so we can
> > > * save a couple of cycles here.
> > > */
> > > - if (folio_test_large(folio) || !folio_test_lru(folio))
> > > + if (folio_test_large(folio) || folio_test_lru(folio))
> > > return false;
> > > mapping = (struct address_space *)
> >
> > ... yes, that does the trick!
> >
>
> Proper patch (I might send out again on Monday "officially"). There are
> other improvements we want to do to folio_is_secretmem() in the light of
> folio_fast_pin_allowed(), that I wanted to do a while ago. I might send
> a patch for that as well now that I'm at it.

The most robust but a bit slower solution is to make folio_is_secretmem()
call folio_mapping() rather than open code the check.

What improvements did you have in mind?

> From 85558a46d9f249f26bd77dd3b18d14f248464845 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <[email protected]>
> Date: Fri, 22 Mar 2024 22:45:36 +0100
> Subject: [PATCH] mm/secretmem: fix GUP-fast succeeding on secretmem folios
>
> folio_is_secretmem() states that secretmem folios cannot be LRU folios:
> so we may only exit early if we find an LRU folio. Yet, we exit early if
> we find a folio that is not a secretmem folio.
>
> Consequently, folio_is_secretmem() fails to detect secretmem folios and,
> therefore, we can succeed in grabbing a secretmem folio during GUP-fast,
> crashing the kernel when we later try reading/writing to the folio, because
> the folio has been unmapped from the directmap.
>
> Reported-by: xingwei lee <[email protected]>
> Reported-by: yue sun <[email protected]>
> Debugged-by: Miklos Szeredi <[email protected]>
> Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
> Cc: <[email protected]>
> Signed-off-by: David Hildenbrand <[email protected]>

Reviewed-by: Mike Rapoport (IBM) <[email protected]>

> ---
> include/linux/secretmem.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
> index 35f3a4a8ceb1..6996f1f53f14 100644
> --- a/include/linux/secretmem.h
> +++ b/include/linux/secretmem.h
> @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
> * We know that secretmem pages are not compound and LRU so we can
> * save a couple of cycles here.
> */
> - if (folio_test_large(folio) || !folio_test_lru(folio))
> + if (folio_test_large(folio) || folio_test_lru(folio))
> return false;
> mapping = (struct address_space *)
> --
> 2.43.2
>
>
> --
> Cheers,
>
> David / dhildenb
>

--
Sincerely yours,
Mike.

2024-03-25 15:19:31

by Miklos Szeredi

[permalink] [raw]
Subject: Re: BUG: unable to handle kernel paging request in fuse_copy_do

On Fri, 22 Mar 2024 at 22:56, David Hildenbrand <[email protected]> wrote:

> From 85558a46d9f249f26bd77dd3b18d14f248464845 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <[email protected]>
> Date: Fri, 22 Mar 2024 22:45:36 +0100
> Subject: [PATCH] mm/secretmem: fix GUP-fast succeeding on secretmem folios
>
> folio_is_secretmem() states that secretmem folios cannot be LRU folios:
> so we may only exit early if we find an LRU folio. Yet, we exit early if
> we find a folio that is not a secretmem folio.
>
> Consequently, folio_is_secretmem() fails to detect secretmem folios and,
> therefore, we can succeed in grabbing a secretmem folio during GUP-fast,
> crashing the kernel when we later try reading/writing to the folio, because
> the folio has been unmapped from the directmap.
>
> Reported-by: xingwei lee <[email protected]>
> Reported-by: yue sun <[email protected]>
> Debugged-by: Miklos Szeredi <[email protected]>
> Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
> Cc: <[email protected]>
> Signed-off-by: David Hildenbrand <[email protected]>

Verified that it's no longer crashing with the reproducers.

Tested-by: Miklos Szeredi <[email protected]>