From: Tatsiana Brouka <[email protected]>
Implement system call for bulk retrieveing of pids in binary form.
Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
converting with atoi().
/proc may be not mounted especially in containers. Natural extension of
hidepid=2 efforts is to not mount /proc at all.
It could be used by programs like ps, top or CRIU. Speed increase will
become more drastic once combined with bulk retrieval of process statistics.
Sample program:
#include <stdio.h>
static inline long sys_pidmap(int *pid, unsigned int n, int start)
{
register long r10 asm ("r10") = 0;
long rv;
asm volatile (
"syscall"
: "=a" (rv)
: "0" (333), "D" (pid), "S" (n), "d" (start), "r" (r10)
: "rcx", "r11", "cc", "memory"
);
return rv;
}
int main(void)
{
int pid[5];
unsigned int start;
int n;
start = 0;
while ((n = sys_pidmap(pid, sizeof(pid)/sizeof(pid[0]), start)) > 0) {
int i;
for (i = 0; i < n; i++) {
printf(" %u", pid[i]);
}
printf("\n");
start = pid[n - 1] + 1;
}
return 0;
}
Signed-off-by: Tatsiana Brouka <[email protected]>
Signed-off-by: Alexey Dobriyan <[email protected]>
---
arch/x86/entry/syscalls/syscall_64.tbl | 1
include/linux/syscalls.h | 4
kernel/Makefile | 2
kernel/pidmap.c | 116 ++++++++++++++
tools/testing/selftests/Makefile | 1
tools/testing/selftests/pidmap/Makefile | 5
tools/testing/selftests/pidmap/pidmap.c | 263 ++++++++++++++++++++++++++++++++
7 files changed, 392 insertions(+)
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+333 common pidmap sys_pidmap
#
# x32-specific system call numbers start at 512 to avoid cache impact
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -923,4 +923,8 @@ asmlinkage long sys_pkey_free(int pkey);
asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_pidmap(int __user *pids,
+ unsigned int pids_count,
+ unsigned int start_pid,
+ int flags);
#endif
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,8 @@ obj-y = fork.o exec_domain.o panic.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o
+obj-y += pidmap.o
+
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
--- /dev/null
+++ b/kernel/pidmap.c
@@ -0,0 +1,116 @@
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/kernel.h>
+#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/rcupdate.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+/**
+ * pidmap - get allocated PIDs
+ * @pids: Destination buffer.
+ * @pids_count: number of elements in the buffer.
+ * @start_pid: PID to start from.
+ * @flags: flags, must be 0.
+ *
+ * Write allocated PIDs to a buffer starting from @start_pid (inclusive).
+ * PIDs are filled from pid namespace of the calling process POV:
+ * unshare(CLONE_NEWPID)+fork+pidmap in child will always return 1/1.
+ *
+ * pidmap(2) hides PIDs inaccessible at /proc mounted with "hide_pid" option.
+ *
+ * Note, pidmap(2) does not guarantee that any of returned PID exists
+ * by the time system call exits.
+ *
+ * Return: number of PIDs written to the buffer or error code otherwise.
+ */
+SYSCALL_DEFINE4(pidmap, int __user *, pids, unsigned int, pids_count,
+ unsigned int, start_pid, int, flags)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ unsigned int start_page, start_elem;
+ unsigned int last_pos = 0;
+ unsigned int last_set_pid = 0;
+ unsigned long mask;
+ bool has_perms = false;
+ unsigned int i;
+
+ if (flags)
+ return -EINVAL;
+
+ /*
+ * Pid 0 does not exist, however, corresponding bit is always set in
+ * ->pidmap[0].page, so we should skip it.
+ */
+ if (start_pid == 0)
+ start_pid = 1;
+
+ if (start_pid > ns->last_pid)
+ return 0;
+
+ if (ns->hide_pid < HIDEPID_INVISIBLE || in_group_p(ns->pid_gid))
+ has_perms = true;
+
+ start_page = start_pid / BITS_PER_PAGE;
+ start_elem = (start_pid % BITS_PER_PAGE) / BITS_PER_LONG;
+ mask = ~0UL << (start_pid % BITS_PER_LONG);
+
+ for (i = start_page; i < PIDMAP_ENTRIES; i++) {
+ unsigned int j;
+
+ /*
+ * ->pidmap[].page is set once to a valid pointer,
+ * therefore do not take any locks.
+ */
+ if (ns->pidmap[i].page == NULL)
+ continue;
+
+ for (j = start_elem; j < PAGE_SIZE/sizeof(unsigned long); j++) {
+ unsigned long val;
+
+ val = *((unsigned long *)ns->pidmap[i].page + j);
+ val &= mask;
+ mask = ~0UL;
+ while (val != 0) {
+ struct task_struct *task;
+
+ if (last_pos == pids_count)
+ return last_pos;
+
+ last_set_pid = i * BITS_PER_PAGE +
+ j * BITS_PER_LONG + __ffs(val);
+
+ if (has_perms)
+ goto write;
+
+ rcu_read_lock();
+ task = find_task_by_pid_ns(last_set_pid, ns);
+ if (!task) {
+ rcu_read_unlock();
+ goto next;
+ }
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+ rcu_read_unlock();
+ goto next;
+ }
+ rcu_read_unlock();
+write:
+ if (put_user(last_set_pid, pids + last_pos))
+ return -EFAULT;
+ last_pos++;
+ if (last_set_pid == ns->last_pid)
+ return last_pos;
+next:
+ val &= (val - 1);
+ }
+
+ }
+ start_elem = 0;
+ }
+ if (last_set_pid == 0)
+ return 0;
+ else
+ return last_pos;
+}
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -20,6 +20,7 @@ TARGETS += mount
TARGETS += mqueue
TARGETS += net
TARGETS += nsfs
+TARGETS += pidmap
TARGETS += powerpc
TARGETS += pstore
TARGETS += ptrace
--- /dev/null
+++ b/tools/testing/selftests/pidmap/Makefile
@@ -0,0 +1,5 @@
+CFLAGS = -Wall
+
+TEST_GEN_PROGS := pidmap
+
+include ../lib.mk
--- /dev/null
+++ b/tools/testing/selftests/pidmap/pidmap.c
@@ -0,0 +1,263 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <dirent.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <signal.h>
+#include "../kselftest_harness.h"
+
+#define SIZE 512
+
+static inline long pidmap(int *pid, unsigned int count, unsigned int start_pid)
+{
+ long ret;
+
+ register long r10 asm("r10") = 0;
+
+ asm volatile ("syscall" : "=a"(ret) :
+ "0"(333), "D"(pid), "S"(count), "d"(start_pid), "r"(r10) :
+ "rcx", "r11", "cc", "memory");
+ return ret;
+}
+
+static int compare(const void *a, const void *b)
+{
+ return *((int *)a) > *((int *)b);
+}
+
+int pidmap_full(int **pid, unsigned int *res_count)
+{
+ int n;
+ int start_pid = 1;
+ *pid = (int *)malloc(SIZE * sizeof(int));
+ *res_count = 0;
+
+ while ((n = pidmap(*pid + *res_count, SIZE, start_pid)) > 0) {
+ *res_count += n;
+ *pid = (int *)realloc(*pid, (*res_count + SIZE) * sizeof(int));
+ start_pid = (*pid)[*res_count - 1] + 1;
+ }
+ return n;
+}
+
+int pidmap_proc(int **pid, unsigned int *n)
+{
+ DIR *dir = opendir("/proc");
+ struct dirent *dirs;
+
+ *n = 0;
+ *pid = NULL;
+
+ while ((dirs = readdir(dir))) {
+ char dname[32] = "";
+ DIR *task_dir;
+
+ if (dirs->d_name[0] < '0' || dirs->d_name[0] > '9')
+ continue;
+
+ strcpy(dname, "/proc/");
+ strcat(dname, dirs->d_name);
+ strcat(dname, "/task");
+ task_dir = opendir(dname);
+
+ if (task_dir) {
+ struct dirent *task_dirs;
+
+ while ((task_dirs = readdir(task_dir))) {
+ if (task_dirs->d_name[0] < '0' ||
+ task_dirs->d_name[0] > '9')
+ continue;
+
+ *pid = (int *)realloc(*pid, (*n + 1) *
+ sizeof(int));
+ if (*pid == NULL)
+ return -1;
+ *(*pid + *n) = atoi(task_dirs->d_name);
+ *n += 1;
+ }
+ } else {
+ *pid = (int *)realloc(*pid, (*n + 1) * sizeof(int));
+ if (*pid == NULL)
+ return -1;
+ *(*pid + *n) = atoi(dirs->d_name);
+ *n += 1;
+ }
+ closedir(task_dir);
+ }
+ closedir(dir);
+ return 0;
+}
+
+TEST(bufsize)
+{
+ int pid[SIZE];
+
+ EXPECT_EQ(0, pidmap(pid, 0, 1));
+}
+
+TEST(get_pid)
+{
+ int pid;
+ int ret;
+
+ ret = pidmap(&pid, 1, getpid());
+ ASSERT_LE(0, ret);
+ EXPECT_EQ(getpid(), pid);
+}
+
+TEST(bad_start)
+{
+ int pid[SIZE];
+
+ ASSERT_LE(0, pidmap(pid, SIZE, -1));
+ ASSERT_LE(0, pidmap(pid, SIZE, ~0U));
+ ASSERT_LE(0, pidmap(pid, SIZE, 0));
+ EXPECT_EQ(1, pid[0]);
+}
+
+TEST(child_pid)
+{
+ pid_t pid = fork();
+
+ if (pid == 0)
+ pause();
+ else {
+ int ret;
+ int result = 0;
+
+ ret = pidmap(&result, 1, pid);
+ EXPECT_LE(0, ret);
+ EXPECT_EQ(pid, result);
+ kill(pid, SIGTERM);
+ }
+}
+
+int write_pidmax(int new_pidmax)
+{
+ char old_pidmax[32];
+ char new[32];
+ int fd = open("/proc/sys/kernel/pid_max", O_RDWR);
+
+ if (read(fd, old_pidmax, 32) <= 0)
+ printf("Read failed\n");
+ lseek(fd, 0, 0);
+ snprintf(new, sizeof(new), "%d", new_pidmax);
+ if (write(fd, new, strlen(new)) <= 0)
+ printf("Write failed\n");
+ close(fd);
+ return atoi(old_pidmax);
+}
+
+void do_forks(unsigned int n)
+{
+ while (n--) {
+ pid_t pid = fork();
+
+ if (pid == 0)
+ exit(0);
+ waitpid(pid, NULL, 0);
+ }
+}
+
+TEST(pid_max)
+{
+ int *pid;
+ unsigned int n;
+ int ret, p;
+ int old_pidmax;
+
+ old_pidmax = write_pidmax(50000);
+
+ do_forks(40000);
+
+ p = fork();
+
+ if (p == 0)
+ pause();
+
+ ret = pidmap_full(&pid, &n);
+
+ EXPECT_LE(0, ret);
+ EXPECT_EQ(p, pid[n - 1]);
+
+ kill(p, SIGKILL);
+ write_pidmax(old_pidmax);
+}
+
+TEST(compare_proc)
+{
+ pid_t pid;
+
+ if (unshare(CLONE_NEWNS | CLONE_NEWPID) == -1)
+ return;
+
+ pid = fork();
+
+ if (pid == 0) {
+ pid_t pid;
+ int i = 0;
+
+ mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
+ mount("none", "/proc", NULL, MS_REC | MS_PRIVATE, NULL);
+ mount("proc", "/proc", "proc",
+ MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
+
+ while (i < 150) {
+ i++;
+
+ pid = fork();
+
+ if (pid == -1) {
+ wait(NULL);
+ umount("/proc");
+ return;
+ }
+ if (pid == 0) {
+ pause();
+ return;
+ }
+ }
+
+ int *pids, *pids_proc;
+ unsigned int n = 0;
+ unsigned int n_proc = 0;
+ int ret, ret_proc;
+
+ ret = pidmap_full(&pids, &n);
+
+ ret_proc = pidmap_proc(&pids_proc, &n_proc);
+ qsort(pids_proc, n_proc, sizeof(int), compare);
+
+ EXPECT_LE(0, ret);
+ EXPECT_EQ(n_proc, n);
+
+ if (ret <= 0 || ret_proc <= 0 || n != n_proc) {
+ killpg(0, SIGTERM);
+ wait(NULL);
+ umount("/proc");
+ free(pids);
+ free(pids_proc);
+ return;
+ }
+
+ for (int i = 0; i < n; i++) {
+ EXPECT_EQ(pids_proc[i], pids[i]);
+ if (pids_proc[i] != pids[i])
+ break;
+ }
+ EXPECT_EQ(1, pids[0]);
+
+ free(pids_proc);
+ free(pids);
+ killpg(0, SIGTERM);
+ wait(NULL);
+ umount("/proc");
+ }
+}
+
+TEST_HARNESS_MAIN
From: Aliaksandr Patseyenak <[email protected]>
Implement system call for bulk retrieveing of opened descriptors
in binary form.
Some daemons could use it to reliably close file descriptors
before starting. Currently they close everything upto some number
which formally is not reliable. Other natural users are lsof(1) and CRIU
(although lsof does so much in /proc that the effect is thoroughly buried).
Once again, /proc, the only way to learn anything about file descriptors
may not be available.
Sample program:
#include <stdlib.h>
#include <stdio.h>
static inline long sys_fdmap(int pid, int *fd, unsigned int n, int start)
{
register long r10 asm ("r10") = start;
register long r8 asm ("r8") = 0;
long rv;
asm volatile (
"syscall"
: "=a" (rv)
: "0" (334), "D" (pid), "S" (fd), "d" (n), "r" (r10), "r" (r8)
: "rcx", "r11", "cc", "memory"
);
return rv;
}
int main(int argc, char *argv[])
{
int fd[3];
int pid;
unsigned int start;
int n;
pid = 0;
if (argc > 1)
pid = atoi(argv[1]);
start = 0;
while ((n = sys_fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start)) > 0) {
unsigned int i;
for (i = 0; i < n; i++) {
printf(" %u", fd[i]);
}
printf("\n");
start = fd[n - 1] + 1;
}
return 0;
}
Signed-off-by: Aliaksandr Patseyenak <[email protected]>
Signed-off-by: Alexey Dobriyan <[email protected]>
---
arch/x86/entry/syscalls/syscall_64.tbl | 1
fs/Makefile | 2
fs/fdmap.c | 105 +++++++++++++++++++
include/linux/syscalls.h | 2
tools/testing/selftests/fdmap/.gitignore | 1
tools/testing/selftests/fdmap/Makefile | 7 +
tools/testing/selftests/fdmap/fdmap.c | 112 +++++++++++++++++++++
tools/testing/selftests/fdmap/fdmap.h | 12 ++
tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
9 files changed, 394 insertions(+), 1 deletion(-)
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -340,6 +340,7 @@
331 common pkey_free sys_pkey_free
332 common statx sys_statx
333 common pidmap sys_pidmap
+334 common fdmap sys_fdmap
#
# x32-specific system call numbers start at 512 to avoid cache impact
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o fdmap.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
--- /dev/null
+++ b/fs/fdmap.c
@@ -0,0 +1,105 @@
+#include <linux/bitops.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+/**
+ * fdmap - get opened file descriptors of a process
+ * @pid: the pid of the target process
+ * @fds: allocated userspace buffer
+ * @count: buffer size (in descriptors)
+ * @start_fd: first descriptor to search from (inclusive)
+ * @flags: reserved for future functionality, must be zero
+ *
+ * If @pid is zero then it's current process.
+ * Return: number of descriptors written. An error code otherwise.
+ */
+SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
+ int, start_fd, int, flags)
+{
+ struct task_struct *task;
+ struct files_struct *files;
+ unsigned long search_mask;
+ unsigned int user_index, offset;
+ int masksize;
+
+ if (start_fd < 0 || flags != 0)
+ return -EINVAL;
+
+ if (!pid) {
+ files = get_files_struct(current);
+ } else {
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ return -EACCES;
+ }
+ files = get_files_struct(task);
+ rcu_read_unlock();
+ }
+ if (!files)
+ return 0;
+
+ offset = start_fd / BITS_PER_LONG;
+ search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
+ user_index = 0;
+#define FDS_BUF_SIZE (1024/sizeof(unsigned long))
+ masksize = FDS_BUF_SIZE;
+ while (user_index < count && masksize == FDS_BUF_SIZE) {
+ unsigned long open_fds[FDS_BUF_SIZE];
+ struct fdtable *fdt;
+ unsigned int i;
+
+ /*
+ * fdt->max_fds can grow, get it every time
+ * before copying part into internal buffer.
+ */
+ rcu_read_lock();
+ fdt = files_fdtable(files);
+ masksize = fdt->max_fds / 8 - offset * sizeof(long);
+ if (masksize < 0) {
+ rcu_read_unlock();
+ break;
+ }
+ masksize = min(masksize, (int)sizeof(open_fds));
+ memcpy(open_fds, fdt->open_fds + offset, masksize);
+ rcu_read_unlock();
+
+ open_fds[0] &= search_mask;
+ search_mask = ULONG_MAX;
+ masksize = (masksize + sizeof(long) - 1) / sizeof(long);
+ start_fd = offset * BITS_PER_LONG;
+ /*
+ * for_each_set_bit_from() can re-read first word
+ * multiple times which is not optimal.
+ */
+ for (i = 0; i < masksize; i++) {
+ unsigned long mask = open_fds[i];
+
+ while (mask) {
+ unsigned int real_fd = start_fd + __ffs(mask);
+
+ if (put_user(real_fd, fds + user_index)) {
+ put_files_struct(files);
+ return -EFAULT;
+ }
+ if (++user_index >= count)
+ goto out;
+ mask &= mask - 1;
+ }
+ start_fd += BITS_PER_LONG;
+ }
+ offset += FDS_BUF_SIZE;
+ }
+out:
+ put_files_struct(files);
+
+ return user_index;
+}
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -922,6 +922,8 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
asmlinkage long sys_pkey_free(int pkey);
asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
+ int start_fd, int flags);
asmlinkage long sys_pidmap(int __user *pids,
unsigned int pids_count,
--- /dev/null
+++ b/tools/testing/selftests/fdmap/.gitignore
@@ -0,0 +1 @@
+fdmap_test
--- /dev/null
+++ b/tools/testing/selftests/fdmap/Makefile
@@ -0,0 +1,7 @@
+TEST_GEN_PROGS := fdmap_test
+CFLAGS += -Wall
+
+include ../lib.mk
+
+$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
+ $(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include "fdmap.h"
+
+#define BUF_SIZE 1024
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
+{
+ register int64_t r10 asm("r10") = start_fd;
+ register int64_t r8 asm("r8") = flags;
+ long ret;
+
+ asm volatile (
+ "syscall"
+ : "=a"(ret)
+ : "0" (334),
+ "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
+ : "rcx", "r11", "cc", "memory"
+ );
+ return ret;
+}
+
+int fdmap_full(pid_t pid, int **fds, size_t *n)
+{
+ int buf[BUF_SIZE], start_fd = 0;
+ long ret;
+
+ *n = 0;
+ *fds = NULL;
+ for (;;) {
+ int *new_buff;
+
+ ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
+ if (ret < 0)
+ break;
+ if (!ret)
+ return 0;
+
+ new_buff = realloc(*fds, (*n + ret) * sizeof(int));
+ if (!new_buff) {
+ ret = -errno;
+ break;
+ }
+ *fds = new_buff;
+ memcpy(*fds + *n, buf, ret * sizeof(int));
+ *n += ret;
+ start_fd = (*fds)[*n - 1] + 1;
+ }
+ free(*fds);
+ *fds = NULL;
+ return -ret;
+}
+
+int fdmap_proc(pid_t pid, int **fds, size_t *n)
+{
+ char fds_path[20];
+ int dir_fd = 0;
+ struct dirent *fd_link;
+ DIR *fds_dir;
+
+ *fds = NULL;
+ *n = 0;
+ if (!pid)
+ strcpy(fds_path, "/proc/self/fd");
+ else
+ sprintf(fds_path, "/proc/%d/fd", pid);
+
+ fds_dir = opendir(fds_path);
+ if (!fds_dir)
+ return errno == ENOENT ? ESRCH : errno;
+ if (!pid)
+ dir_fd = dirfd(fds_dir);
+
+ while ((fd_link = readdir(fds_dir))) {
+ if (fd_link->d_name[0] < '0'
+ || fd_link->d_name[0] > '9')
+ continue;
+ if (*n % BUF_SIZE == 0) {
+ int *new_buff;
+
+ new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
+ if (!new_buff) {
+ int ret = errno;
+
+ free(*fds);
+ *fds = NULL;
+ return ret;
+ }
+ *fds = new_buff;
+ }
+ (*fds)[*n] = atoi(fd_link->d_name);
+ *n += 1;
+ }
+ closedir(fds_dir);
+
+ if (!pid) {
+ size_t i;
+
+ for (i = 0; i < *n; i++)
+ if ((*fds)[i] == dir_fd)
+ break;
+ i++;
+ memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
+ (*n)--;
+ }
+ return 0;
+}
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.h
@@ -0,0 +1,12 @@
+#ifndef FDMAP_H
+#define FDMAP_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
+int fdmap_full(pid_t pid, int **fds, size_t *n);
+int fdmap_proc(pid_t pid, int **fds, size_t *n);
+
+#endif
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap_test.c
@@ -0,0 +1,153 @@
+#include <errno.h>
+#include <syscall.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <limits.h>
+#include "../kselftest_harness.h"
+#include "fdmap.h"
+
+TEST(efault) {
+ int ret;
+
+ ret = syscall(334, 0, NULL, 20 * sizeof(int), 0, 0);
+ ASSERT_EQ(-1, ret);
+ ASSERT_EQ(EFAULT, errno);
+}
+
+TEST(big_start_fd) {
+ int fds[1];
+ int ret;
+
+ ret = syscall(334, 0, fds, sizeof(int), INT_MAX, 0);
+ ASSERT_EQ(0, ret);
+}
+
+TEST(einval) {
+ int ret;
+
+ ret = syscall(334, 0, NULL, 0, -1, 0);
+ ASSERT_EQ(-1, ret);
+ ASSERT_EQ(EINVAL, errno);
+
+ ret = syscall(334, 0, NULL, 0, 0, 1);
+ ASSERT_EQ(-1, ret);
+ ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(esrch) {
+ int fds[1], ret;
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_NE(-1, pid);
+ if (!pid)
+ exit(0);
+ waitpid(pid, NULL, 0);
+
+ ret = syscall(334, pid, fds, sizeof(int), 0, 0);
+ ASSERT_EQ(-1, ret);
+ ASSERT_EQ(ESRCH, errno);
+}
+
+TEST(simple) {
+ int *fds1, *fds2;
+ size_t size1, size2, i;
+ int ret1, ret2;
+
+ ret1 = fdmap_full(0, &fds1, &size1);
+ ret2 = fdmap_proc(0, &fds2, &size2);
+ ASSERT_EQ(ret2, ret1);
+ ASSERT_EQ(size2, size1);
+ for (i = 0; i < size1; i++)
+ ASSERT_EQ(fds2[i], fds1[i]);
+ free(fds1);
+ free(fds2);
+}
+
+TEST(init) {
+ int *fds1, *fds2;
+ size_t size1, size2, i;
+ int ret1, ret2;
+
+ ret1 = fdmap_full(1, &fds1, &size1);
+ ret2 = fdmap_proc(1, &fds2, &size2);
+ ASSERT_EQ(ret2, ret1);
+ ASSERT_EQ(size2, size1);
+ for (i = 0; i < size1; i++)
+ ASSERT_EQ(fds2[i], fds1[i]);
+ free(fds1);
+ free(fds2);
+}
+
+TEST(zero) {
+ int *fds, i;
+ size_t size;
+ int ret;
+
+ ret = fdmap_proc(0, &fds, &size);
+ ASSERT_EQ(0, ret);
+ for (i = 0; i < size; i++)
+ close(fds[i]);
+ free(fds);
+ fds = NULL;
+
+ ret = fdmap_full(0, &fds, &size);
+ ASSERT_EQ(0, ret);
+ ASSERT_EQ(0, size);
+}
+
+TEST(more_fds) {
+ int *fds1, *fds2, ret1, ret2;
+ size_t size1, size2, i;
+
+ struct rlimit rlim = {
+ .rlim_cur = 600000,
+ .rlim_max = 600000
+ };
+ ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
+ for (int i = 0; i < 500000; i++)
+ dup(0);
+
+ ret1 = fdmap_full(0, &fds1, &size1);
+ ret2 = fdmap_proc(0, &fds2, &size2);
+ ASSERT_EQ(ret2, ret1);
+ ASSERT_EQ(size2, size1);
+ for (i = 0; i < size1; i++)
+ ASSERT_EQ(fds2[i], fds1[i]);
+ free(fds1);
+ free(fds2);
+}
+
+TEST(child) {
+ int pipefd[2];
+ int *fds1, *fds2, ret1, ret2, i;
+ size_t size1, size2;
+ char byte = 0;
+ pid_t pid;
+
+ ASSERT_NE(-1, pipe(pipefd));
+ pid = fork();
+ ASSERT_NE(-1, pid);
+ if (!pid) {
+ read(pipefd[0], &byte, 1);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ ret1 = fdmap_full(0, &fds1, &size1);
+ ret2 = fdmap_proc(0, &fds2, &size2);
+ ASSERT_EQ(ret2, ret1);
+ ASSERT_EQ(size2, size1);
+ for (i = 0; i < size1; i++)
+ ASSERT_EQ(fds2[i], fds1[i]);
+ free(fds1);
+ free(fds2);
+
+ write(pipefd[1], &byte, 1);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ waitpid(pid, NULL, 0);
+}
+
+TEST_HARNESS_MAIN
On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <[email protected]> wrote:
> Implement system call for bulk retrieveing of pids in binary form.
>
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi().
>
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.
>
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.
The patches are performance optimizations, but their changelogs contain
no performance measurements!
Demonstration of some compelling real-world performance benefits would
help things along a lot.
On 09/05/17 15:53, Andrew Morton wrote:
> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <[email protected]> wrote:
>
>> Implement system call for bulk retrieveing of pids in binary form.
>>
>> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
>> converting with atoi().
>>
>> /proc may be not mounted especially in containers. Natural extension of
>> hidepid=2 efforts is to not mount /proc at all.
>>
>> It could be used by programs like ps, top or CRIU. Speed increase will
>> become more drastic once combined with bulk retrieval of process statistics.
>
> The patches are performance optimizations, but their changelogs contain
> no performance measurements!
>
> Demonstration of some compelling real-world performance benefits would
> help things along a lot.
>
also, I expect that the tiny kernel people will want kconfig options for
these syscalls.
--
~Randy
On Tue, 5 Sep 2017, Randy Dunlap wrote:
> On 09/05/17 15:53, Andrew Morton wrote:
> > On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <[email protected]> wrote:
> >
> >> Implement system call for bulk retrieveing of pids in binary form.
> >>
> >> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> >> converting with atoi().
> >>
> >> /proc may be not mounted especially in containers. Natural extension of
> >> hidepid=2 efforts is to not mount /proc at all.
> >>
> >> It could be used by programs like ps, top or CRIU. Speed increase will
> >> become more drastic once combined with bulk retrieval of process statistics.
> >
> > The patches are performance optimizations, but their changelogs contain
> > no performance measurements!
> >
> > Demonstration of some compelling real-world performance benefits would
> > help things along a lot.
> >
>
> also, I expect that the tiny kernel people will want kconfig options for
> these syscalls.
And of course that stuff wants the corresponding man pages written up.
Thanks,
tglx
On 9/6/17, Andrew Morton <[email protected]> wrote:
> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <[email protected]>
> wrote:
>
>> Implement system call for bulk retrieveing of pids in binary form.
>>
>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>> thread +
>> converting with atoi().
>>
>> /proc may be not mounted especially in containers. Natural extension of
>> hidepid=2 efforts is to not mount /proc at all.
>>
>> It could be used by programs like ps, top or CRIU. Speed increase will
>> become more drastic once combined with bulk retrieval of process
>> statistics.
>
> The patches are performance optimizations, but their changelogs contain
> no performance measurements!
>
> Demonstration of some compelling real-world performance benefits would
> help things along a lot.
I forgot the sheet with numbers at work. :^)
They're very embarrassing for /proc.
pidmap:
N=1<<16 times
~130 processes (~250 task_structs) on a regular desktop system
opendir + readdir + closedir /proc + the same for every /proc/$PID/task
(roughly what htop(1) does) vs pidmap
/proc 16.80+-0.73%
pidmap 0.06+-0.31%
fdmap:
N=1<<22 times
4 opened descriptors (0, 1, 2, 3)
opendir+readdir+closedir /proc/self/fd (lsof(1)) vs fdmap
/proc 8.31+-0.37%
fdmap 0.32+-0.72%
Currently performance improvements may not be huge or even visible.
That's because programs like ps/top/lsof _have_ to use /proc to retrieve
other information. If combined with bulk taskstats-ish retrieval interfaces
they should run around /proc.
On 9/6/17, Randy Dunlap <[email protected]> wrote:
> On 09/05/17 15:53, Andrew Morton wrote:
>> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <[email protected]>
>> wrote:
>>
>>> Implement system call for bulk retrieveing of pids in binary form.
>>>
>>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>>> thread +
>>> converting with atoi().
>>>
>>> /proc may be not mounted especially in containers. Natural extension of
>>> hidepid=2 efforts is to not mount /proc at all.
>>>
>>> It could be used by programs like ps, top or CRIU. Speed increase will
>>> become more drastic once combined with bulk retrieval of process
>>> statistics.
>>
>> The patches are performance optimizations, but their changelogs contain
>> no performance measurements!
>>
>> Demonstration of some compelling real-world performance benefits would
>> help things along a lot.
>>
>
> also, I expect that the tiny kernel people will want kconfig options for
> these syscalls.
We'll add it but the question if it is a good idea. Ideally these system calls
should be mandatory and /proc optional.
$ size kernel/pidmap.o fs/fdmap.o
text data bss dec hex filename
560 0 0 560 230 kernel/pidmap.o
617 0 0 617 269 fs/fdmap.o
On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <[email protected]> wrote:
> On 9/6/17, Randy Dunlap <[email protected]> wrote:
>> On 09/05/17 15:53, Andrew Morton wrote:
>>> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <[email protected]>
>>> wrote:
>>>
>>>> Implement system call for bulk retrieveing of pids in binary form.
>>>>
>>>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>>>> thread +
>>>> converting with atoi().
>>>>
>>>> /proc may be not mounted especially in containers. Natural extension of
>>>> hidepid=2 efforts is to not mount /proc at all.
>>>>
>>>> It could be used by programs like ps, top or CRIU. Speed increase will
>>>> become more drastic once combined with bulk retrieval of process
>>>> statistics.
>>>
>>> The patches are performance optimizations, but their changelogs contain
>>> no performance measurements!
>>>
>>> Demonstration of some compelling real-world performance benefits would
>>> help things along a lot.
>>>
>>
>> also, I expect that the tiny kernel people will want kconfig options for
>> these syscalls.
>
> We'll add it but the question if it is a good idea. Ideally these system calls
> should be mandatory and /proc optional.
>
> $ size kernel/pidmap.o fs/fdmap.o
> text data bss dec hex filename
> 560 0 0 560 230 kernel/pidmap.o
> 617 0 0 617 269 fs/fdmap.o
After much discussion at LPC/KS last year, I thought the idea was to
try to speed up /proc rather than replacing it outright. The two
specific ideas I recall were:
1. Add a syscall like readfileat() that you can use to, in a single
operation, open, read, and close a /proc file (or other file). This
should vastly reduce locking and RCU overhead.
2. Add a /proc file that has a nice binary format for task info. (nl_attr?)
I don't see why pidmap() deserves to be significantly faster than getdents().
Also, a pidmap() syscall like this inherently bypasses any security
restrictions implied by the way that /proc is mounted. It can respect
hidepid, but hidepid (as a per-namespace concept) is an enormous turd
that badly needs to be deprecated, and Djalal is working on exactly
that.
Hi Alexey,
On Thu, Sep 7, 2017 at 4:04 AM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <[email protected]> wrote:
>> On 9/6/17, Randy Dunlap <[email protected]> wrote:
>>> On 09/05/17 15:53, Andrew Morton wrote:
[...]
>>>
>>> also, I expect that the tiny kernel people will want kconfig options for
>>> these syscalls.
>>
>> We'll add it but the question if it is a good idea. Ideally these system calls
>> should be mandatory and /proc optional.
>>
>> $ size kernel/pidmap.o fs/fdmap.o
>> text data bss dec hex filename
>> 560 0 0 560 230 kernel/pidmap.o
>> 617 0 0 617 269 fs/fdmap.o
>
> After much discussion at LPC/KS last year, I thought the idea was to
> try to speed up /proc rather than replacing it outright. The two
> specific ideas I recall were:
>
> 1. Add a syscall like readfileat() that you can use to, in a single
> operation, open, read, and close a /proc file (or other file). This
> should vastly reduce locking and RCU overhead.
>
> 2. Add a /proc file that has a nice binary format for task info. (nl_attr?)
>
> I don't see why pidmap() deserves to be significantly faster than getdents().
>
> Also, a pidmap() syscall like this inherently bypasses any security
> restrictions implied by the way that /proc is mounted. It can respect
> hidepid, but hidepid (as a per-namespace concept) is an enormous turd
> that badly needs to be deprecated, and Djalal is working on exactly
> that.
Yes as noted by Andy, me and Alexey Gladkov are working on modernizing
procfs [1] and to reduce/remove ties within pid namespaces which has lot
of problems now.
We just picked the task again, and this was the result of discussion
with Andy some months ago, on how to improve hidepid, but also how to
improve procfs in general, so we can add other mechanisms to hide or return
NULL on other /proc/_file_not_needed_by_containers_ or
/proc/_specific_module_files_ everything that is not virtualized , or mount only
some specific view of the whole /proc API this will also be used by containers.
This also should make it hard for attackers since we are planning to have
a backward compatible options on how to better treat some of these files in
regard of some namespaces.
The syscall or readfileat() for one operation is a nice addition
definitively. But
in general it would be better to treat /proc as a filesystem and not add other
specific interfaces that may abstract it with pidns, as it is the situation now
which make it from userspace perspective: hard to use especially for security
context.
Alexey, could you please Cc'us on future, thank you very much!
[1] https://lkml.org/lkml/2017/4/25/282
--
tixxdz
On 9/7/17, Andy Lutomirski <[email protected]> wrote:
> On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <[email protected]>
> wrote:
>> On 9/6/17, Randy Dunlap <[email protected]> wrote:
>>> On 09/05/17 15:53, Andrew Morton wrote:
>>>> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <[email protected]>
>>>> wrote:
>>>>
>>>>> Implement system call for bulk retrieveing of pids in binary form.
>>>>>
>>>>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>>>>> thread +
>>>>> converting with atoi().
>>>>>
>>>>> /proc may be not mounted especially in containers. Natural extension
>>>>> of
>>>>> hidepid=2 efforts is to not mount /proc at all.
>>>>>
>>>>> It could be used by programs like ps, top or CRIU. Speed increase will
>>>>> become more drastic once combined with bulk retrieval of process
>>>>> statistics.
>>>>
>>>> The patches are performance optimizations, but their changelogs contain
>>>> no performance measurements!
>>>>
>>>> Demonstration of some compelling real-world performance benefits would
>>>> help things along a lot.
>>>>
>>>
>>> also, I expect that the tiny kernel people will want kconfig options for
>>> these syscalls.
>>
>> We'll add it but the question if it is a good idea. Ideally these system
>> calls
>> should be mandatory and /proc optional.
>>
>> $ size kernel/pidmap.o fs/fdmap.o
>> text data bss dec hex filename
>> 560 0 0 560 230 kernel/pidmap.o
>> 617 0 0 617 269 fs/fdmap.o
>
> After much discussion at LPC/KS last year, I thought the idea was to
> try to speed up /proc rather than replacing it outright. The two
> specific ideas I recall were:
>
> 1. Add a syscall like readfileat() that you can use to, in a single
> operation, open, read, and close a /proc file (or other file). This
> should vastly reduce locking and RCU overhead.
>
> 2. Add a /proc file that has a nice binary format for task info.
> (nl_attr?)
If you do binary data in /proc there is no need for /proc part.
System call can do everything /proc/$PID/bstat (or whatever the name)
does.
> I don't see why pidmap() deserves to be significantly faster than
> getdents().
Just look at profile. XXX is pure slowdown. _Some_ of it can be deleted
or sped up but not everything. All dcache stuff is unavoidable.
XXX 6.35% [k] number
OK* 5.21% [k] proc_readfd_common (* partially XXX)
OK 4.19% [k] __rcu_read_unlock
XXX 4.05% [.] __GI_____strtoll_l_internal
XXX 3.73% [k] dput
OK 3.64% [k] entry_SYSCALL_64_fastpath
XXX 3.23% [k] proc_fill_cache
XXX 3.10% [k] __d_lookup
XXX 3.09% [k] filldir
XXX 2.74% [k] format_decode
XXX 2.47% [k] link_path_walk
OK* 2.26% [k] _raw_spin_lock
OK 1.73% [k] get_files_struct
XXX 1.64% [k] __d_lookup_rcu
XXX 1.61% [k] do_sys_open
XXX 1.49% [k] pid_revalidate
OK 1.48% [k] __check_object_size
XXX 1.47% [k] do_filp_open
? 1.44% [.] __memmove_sse2
OK 1.40% [k] __rcu_read_lock
XXX 1.33% [.] __readdir64
XXX 1.32% [k] __follow_mount_rcu.isra.6
XXX 1.30% [k] set_root
XXX 1.27% [k] lookup_fast
XXX 1.23% [k] full_name_hash
OK? 1.17% [k] call_rcu
XXX 1.17% [k] sys_open
? 1.02% [k] lockref_put_or_lock
XXX 1.00% [k] pid_delete_dentry
XXX 0.99% [k] iterate_dir
XXX 0.95% [k] inode_permission
XXX 0.94% [k] __slab_alloc.isra.22.constprop.26
OK 0.93% [k] rcu_process_callbacks
XXX 0.93% [.] __getdents64
XXX 0.93% [k] vsnprintf
XXX 0.92% [k] sys_close
> Also, a pidmap() syscall like this inherently bypasses any security
> restrictions implied by the way that /proc is mounted. It can respect
> hidepid, but hidepid (as a per-namespace concept) is an enormous turd
> that badly needs to be deprecated, and Djalal is working on exactly
> that.
I agree pid_ns->hide_pid is silly idea. It should be a property of
an individual mount but as posted pidmap() respect it (at a cost of
some slowdown).
On 9/7/17, Djalal Harouni <[email protected]> wrote:
> Hi Alexey,
>
> On Thu, Sep 7, 2017 at 4:04 AM, Andy Lutomirski <[email protected]>
> wrote:
>> On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <[email protected]>
>> wrote:
>>> On 9/6/17, Randy Dunlap <[email protected]> wrote:
>>>> On 09/05/17 15:53, Andrew Morton wrote:
> [...]
>>>>
>>>> also, I expect that the tiny kernel people will want kconfig options
>>>> for
>>>> these syscalls.
>>>
>>> We'll add it but the question if it is a good idea. Ideally these system
>>> calls
>>> should be mandatory and /proc optional.
>>>
>>> $ size kernel/pidmap.o fs/fdmap.o
>>> text data bss dec hex filename
>>> 560 0 0 560 230 kernel/pidmap.o
>>> 617 0 0 617 269 fs/fdmap.o
>>
>> After much discussion at LPC/KS last year, I thought the idea was to
>> try to speed up /proc rather than replacing it outright. The two
>> specific ideas I recall were:
>>
>> 1. Add a syscall like readfileat() that you can use to, in a single
>> operation, open, read, and close a /proc file (or other file). This
>> should vastly reduce locking and RCU overhead.
>>
>> 2. Add a /proc file that has a nice binary format for task info.
>> (nl_attr?)
>>
>> I don't see why pidmap() deserves to be significantly faster than
>> getdents().
>>
>> Also, a pidmap() syscall like this inherently bypasses any security
>> restrictions implied by the way that /proc is mounted. It can respect
>> hidepid, but hidepid (as a per-namespace concept) is an enormous turd
>> that badly needs to be deprecated, and Djalal is working on exactly
>> that.
>
> Yes as noted by Andy, me and Alexey Gladkov are working on modernizing
> procfs [1] and to reduce/remove ties within pid namespaces which has lot
> of problems now.
> ...
Kudos for digging into this mess.
But the question will remain: how get pids of existing processes quickly.
On Tue, Sep 05, 2017 at 10:05:00PM +0300, Alexey Dobriyan wrote:
> From: Tatsiana Brouka <[email protected]>
>
> Implement system call for bulk retrieveing of pids in binary form.
>
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi().
>
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.
>
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.
What could give a noticeable performance gain in a less ridiculous way
is an interface like task_diag, see https://lkml.org/lkml/2016/4/11/924 .
--
ldv