2022-08-26 06:51:42

by Muhammad Usama Anjum

[permalink] [raw]
Subject: [PATCH v3 2/4] fs/proc/task_mmu: Implement IOCTL to get and clear soft dirty PTE bit

This ioctl can be used to watch the process's memory and perform
atomic operations which aren't possible through procfs. Three
operations have been implemented:

- PAGEMAP_SD_GET gets the soft dirty pages in a address range.
- PAGEMAP_SD_CLEAR clears the soft dirty bit from dirty pages in a
address range.
- PAGEMAP_SD_GET_AND_CLEAR gets and clears the soft dirty bit in a
address range.

struct pagemap_sd_args is used as the argument of the IOCTL. In this
struct:
- The range is specified through start and len.
- The output buffer and size is specified as vec and vec_len.
- The flags can be specified in the flags field. Currently only one
PAGEMAP_SD_NO_REUSED_REGIONS is supported which can be specified to
ignore the VMA dirty flags.

This is based on a patch from Gabriel Krisman Bertazi.

Signed-off-by: Muhammad Usama Anjum <[email protected]>
---
Changes in v3:
- Tighten the user-kernel interface by using explicit types and add more
error checking

Changes in v2:
- Convert the interface from syscall to ioctl
- Remove pidfd support as it doesn't make sense in ioctl
---
fs/proc/task_mmu.c | 260 ++++++++++++++++++++++++++++++++++
include/uapi/linux/fs.h | 23 +++
tools/include/uapi/linux/fs.h | 23 +++
3 files changed, 306 insertions(+)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f66674033207..33d3d5c2ab40 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,8 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <uapi/linux/fs.h>
+#include <linux/vmalloc.h>

#include <asm/elf.h>
#include <asm/tlb.h>
@@ -1775,11 +1777,269 @@ static int pagemap_release(struct inode *inode, struct file *file)
return 0;
}

+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define IS_CLEAR_SD_OP(op) (op == PAGEMAP_SD_CLEAR || op == PAGEMAP_SD_GET_AND_CLEAR)
+#define IS_GET_SD_OP(op) (op == PAGEMAP_SD_GET || op == PAGEMAP_SD_GET_AND_CLEAR)
+#define PAGEMAP_SD_FLAGS_MASK (PAGEMAP_SD_NO_REUSED_REGIONS)
+
+struct pagemap_sd_private {
+ unsigned long start;
+ __u64 *vec;
+ unsigned long vec_len;
+ unsigned long index;
+ unsigned int op;
+ unsigned int flags;
+};
+
+static int pagemap_sd_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long start = addr;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int dirty;
+ bool dirty_vma = (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS) ? 0 :
+ (vma->vm_flags & VM_SOFTDIRTY);
+
+ end = min(end, walk->vma->vm_end);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
+ /*
+ * Break huge page into small pages if operation needs to be performed is
+ * on a portion of the huge page or the return buffer cannot store complete
+ * data. Then process this PMD as having normal pages.
+ */
+ if ((IS_CLEAR_SD_OP(p->op) && (end - addr < HPAGE_SIZE)) ||
+ (IS_GET_SD_OP(p->op) && (p->index + HPAGE_SIZE/PAGE_SIZE > p->vec_len))) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, addr);
+ goto process_smaller_pages;
+ } else {
+ dirty = check_soft_dirty_pmd(vma, addr, pmd, IS_CLEAR_SD_OP(p->op));
+ if (IS_GET_SD_OP(p->op) && (dirty_vma || dirty)) {
+ for (; addr != end && p->index < p->vec_len;
+ addr += PAGE_SIZE)
+ p->vec[p->index++] = addr - p->start;
+ }
+ }
+ }
+ spin_unlock(ptl);
+ return 0;
+ }
+
+process_smaller_pages:
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ dirty = check_soft_dirty(vma, addr, pte, IS_CLEAR_SD_OP(p->op));
+
+ if (IS_GET_SD_OP(p->op) && (dirty_vma || dirty)) {
+ p->vec[p->index++] = addr - p->start;
+ WARN_ON(p->index > p->vec_len);
+ }
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ if (IS_CLEAR_SD_OP(p->op))
+ flush_tlb_mm_range(vma->vm_mm, start, end, PAGE_SHIFT, false);
+
+ return 0;
+}
+
+static int pagemap_sd_pte_hole(unsigned long addr, unsigned long end, int depth,
+ struct mm_walk *walk)
+{
+ struct pagemap_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS)
+ return 0;
+
+ if (vma && (vma->vm_flags & VM_SOFTDIRTY) && IS_GET_SD_OP(p->op)) {
+ for (; addr != end && p->index < p->vec_len; addr += PAGE_SIZE)
+ p->vec[p->index++] = addr - p->start;
+ }
+
+ return 0;
+}
+
+static int pagemap_sd_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret;
+ unsigned long end_cut = end;
+
+ if (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS)
+ return 0;
+
+ if (IS_CLEAR_SD_OP(p->op) && (vma->vm_flags & VM_SOFTDIRTY)) {
+ if (vma->vm_start < start) {
+ ret = split_vma(vma->vm_mm, vma, start, 1);
+ if (ret)
+ return ret;
+ }
+
+ if (IS_GET_SD_OP(p->op))
+ end_cut = min(start + p->vec_len * PAGE_SIZE, end);
+
+ if (vma->vm_end > end_cut) {
+ ret = split_vma(vma->vm_mm, vma, end_cut, 0);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void pagemap_sd_post_vma(struct mm_walk *walk)
+{
+ struct pagemap_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS)
+ return;
+
+ if (IS_CLEAR_SD_OP(p->op) && (vma->vm_flags & VM_SOFTDIRTY)) {
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+ }
+}
+
+static int pagemap_sd_pmd_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (IS_GET_SD_OP(p->op) && (p->index == p->vec_len))
+ return -1;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ return 0;
+}
+
+static const struct mm_walk_ops pagemap_sd_ops = {
+ .test_walk = pagemap_sd_pmd_test_walk,
+ .pre_vma = pagemap_sd_pre_vma,
+ .pmd_entry = pagemap_sd_pmd_entry,
+ .pte_hole = pagemap_sd_pte_hole,
+ .post_vma = pagemap_sd_post_vma,
+};
+
+static long do_pagemap_sd_cmd(struct mm_struct *mm, unsigned int cmd, struct pagemap_sd_args *arg)
+{
+ struct pagemap_sd_private sd_data;
+ struct mmu_notifier_range range;
+ unsigned long start, end;
+ int ret;
+
+ start = (unsigned long)untagged_addr(arg->start);
+ if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
+ return -EINVAL;
+
+ if (IS_GET_SD_OP(cmd) &&
+ ((arg->vec_len == 0) || (!arg->vec) || (!access_ok((loff_t *)arg->vec, arg->vec_len))))
+ return -EINVAL;
+
+ if ((arg->flags & ~PAGEMAP_SD_FLAGS_MASK) || (arg->__reserved))
+ return -EINVAL;
+
+ end = start + arg->len;
+ sd_data.start = start;
+ sd_data.op = cmd;
+ sd_data.flags = arg->flags;
+ sd_data.index = 0;
+ sd_data.vec_len = arg->vec_len;
+
+ if (IS_GET_SD_OP(cmd)) {
+ sd_data.vec = vzalloc(arg->vec_len * sizeof(loff_t));
+ if (!sd_data.vec)
+ return -ENOMEM;
+ }
+
+ if (IS_CLEAR_SD_OP(cmd)) {
+ mmap_write_lock(mm);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL,
+ mm, start, end);
+ mmu_notifier_invalidate_range_start(&range);
+ inc_tlb_flush_pending(mm);
+ } else {
+ mmap_read_lock(mm);
+ }
+
+ ret = walk_page_range(mm, start, end, &pagemap_sd_ops, &sd_data);
+
+ if (IS_CLEAR_SD_OP(cmd)) {
+ mmu_notifier_invalidate_range_end(&range);
+ dec_tlb_flush_pending(mm);
+
+ mmap_write_unlock(mm);
+ } else {
+ mmap_read_unlock(mm);
+ }
+
+ if (ret < 0)
+ goto free_sd_data;
+
+ if (IS_GET_SD_OP(cmd)) {
+ ret = copy_to_user((loff_t *)arg->vec, sd_data.vec, sd_data.index * sizeof(loff_t));
+ if (ret) {
+ ret = -EIO;
+ goto free_sd_data;
+ }
+ ret = sd_data.index;
+ } else {
+ ret = 0;
+ }
+
+free_sd_data:
+ if (IS_GET_SD_OP(cmd))
+ vfree(sd_data.vec);
+
+ return ret;
+}
+
+static long pagemap_sd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct pagemap_sd_args __user *uarg = (struct pagemap_sd_args __user *)arg;
+ struct mm_struct *mm = file->private_data;
+ struct pagemap_sd_args arguments;
+
+ switch (cmd) {
+ case PAGEMAP_SD_GET:
+ fallthrough;
+ case PAGEMAP_SD_CLEAR:
+ fallthrough;
+ case PAGEMAP_SD_GET_AND_CLEAR:
+ if (copy_from_user(&arguments, uarg, sizeof(struct pagemap_sd_args)))
+ return -EFAULT;
+ return do_pagemap_sd_cmd(mm, cmd, &arguments);
+ default:
+ return -EINVAL;
+ }
+}
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ .unlocked_ioctl = pagemap_sd_ioctl,
+ .compat_ioctl = pagemap_sd_ioctl,
+#endif /* CONFIG_MEM_SOFT_DIRTY */
};
#endif /* CONFIG_PROC_PAGE_MONITOR */

diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..4f6d1c0ae524 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,27 @@ typedef int __bitwise __kernel_rwf_t;
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND)

+/**
+ * struct pagemap_sd_args - Soft-dirty IOCTL argument
+ * @start: Starting address
+ * @len: Length of the region
+ * @vec: Output buffer address
+ * @vec_len: Length of the output buffer
+ * @flags: Special flags for the IOCTL
+ * @__reserved: Reserved member to preserve data alignment. Should be 0.
+ */
+struct pagemap_sd_args {
+ __u64 __user start;
+ __u64 len;
+ __u64 __user vec;
+ __u64 vec_len;
+ __u32 flags;
+ __u32 __reserved;
+};
+
+#define PAGEMAP_SD_GET _IOWR('f', 16, struct pagemap_sd_args)
+#define PAGEMAP_SD_CLEAR _IOWR('f', 17, struct pagemap_sd_args)
+#define PAGEMAP_SD_GET_AND_CLEAR _IOWR('f', 18, struct pagemap_sd_args)
+#define PAGEMAP_SD_NO_REUSED_REGIONS 0x1
+
#endif /* _UAPI_LINUX_FS_H */
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index b7b56871029c..4f6d1c0ae524 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -305,4 +305,27 @@ typedef int __bitwise __kernel_rwf_t;
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND)

+/**
+ * struct pagemap_sd_args - Soft-dirty IOCTL argument
+ * @start: Starting address
+ * @len: Length of the region
+ * @vec: Output buffer address
+ * @vec_len: Length of the output buffer
+ * @flags: Special flags for the IOCTL
+ * @__reserved: Reserved member to preserve data alignment. Should be 0.
+ */
+struct pagemap_sd_args {
+ __u64 __user start;
+ __u64 len;
+ __u64 __user vec;
+ __u64 vec_len;
+ __u32 flags;
+ __u32 __reserved;
+};
+
+#define PAGEMAP_SD_GET _IOWR('f', 16, struct pagemap_sd_args)
+#define PAGEMAP_SD_CLEAR _IOWR('f', 17, struct pagemap_sd_args)
+#define PAGEMAP_SD_GET_AND_CLEAR _IOWR('f', 18, struct pagemap_sd_args)
+#define PAGEMAP_SD_NO_REUSED_REGIONS 0x1
+
#endif /* _UAPI_LINUX_FS_H */
--
2.30.2