Add a new sysctl vm.cow_pte to set MMF_COW_PTE_READY flag for enabling
copy-on-write (COW) to the PTE page table during the next time of fork.
Since it has a time gap between using the sysctl to enable the COW PTE
and doing the fork, we use two states to determine the task that wants
to do COW PTE or already doing it.
Signed-off-by: Chih-En Lin <[email protected]>
---
include/linux/pgtable.h | 6 ++++++
kernel/fork.c | 5 +++++
kernel/sysctl.c | 8 ++++++++
mm/Makefile | 2 +-
mm/cow_pte.c | 39 +++++++++++++++++++++++++++++++++++++++
5 files changed, 59 insertions(+), 1 deletion(-)
create mode 100644 mm/cow_pte.c
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 014ee8f0fbaab..d03d01aefe989 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -937,6 +937,12 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
__ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
+
+int cow_pte_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+
+extern int sysctl_cow_pte_pid;
+
#endif /* CONFIG_MMU */
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a9e92068b150..6981944a7c6ec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2671,6 +2671,11 @@ pid_t kernel_clone(struct kernel_clone_args *args)
trace = 0;
}
+ if (current->mm && test_bit(MMF_COW_PTE_READY, ¤t->mm->flags)) {
+ clear_bit(MMF_COW_PTE_READY, ¤t->mm->flags);
+ set_bit(MMF_COW_PTE, ¤t->mm->flags);
+ }
+
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 205d605cacc5b..c4f54412ae3a9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2360,6 +2360,14 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = mmap_min_addr_handler,
},
+ {
+ .procname = "cow_pte",
+ .data = &sysctl_cow_pte_pid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = cow_pte_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
#endif
#ifdef CONFIG_NUMA
{
diff --git a/mm/Makefile b/mm/Makefile
index 9a564f8364035..7a568d5066ee6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -40,7 +40,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o
+ pgtable-generic.o rmap.o vmalloc.o cow_pte.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/cow_pte.c b/mm/cow_pte.c
new file mode 100644
index 0000000000000..4e50aa4294ce7
--- /dev/null
+++ b/mm/cow_pte.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sysctl.h>
+#include <linux/pgtable.h>
+#include <linux/sched.h>
+#include <linux/sched/coredump.h>
+#include <linux/pid.h>
+
+/* sysctl will write to this variable */
+int sysctl_cow_pte_pid = -1;
+
+static void set_cow_pte_task(void)
+{
+ struct pid *pid;
+ struct task_struct *task;
+
+ pid = find_get_pid(sysctl_cow_pte_pid);
+ if (!pid) {
+ pr_info("pid %d does not exist\n", sysctl_cow_pte_pid);
+ sysctl_cow_pte_pid = -1;
+ return;
+ }
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!test_bit(MMF_COW_PTE, &task->mm->flags))
+ set_bit(MMF_COW_PTE_READY, &task->mm->flags);
+ sysctl_cow_pte_pid = -1;
+}
+
+int cow_pte_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (write && !ret)
+ set_cow_pte_task();
+
+ return ret;
+}
--
2.37.3
On Sep 27, 2022, at 9:29 AM, Chih-En Lin <[email protected]> wrote:
> Add a new sysctl vm.cow_pte to set MMF_COW_PTE_READY flag for enabling
> copy-on-write (COW) to the PTE page table during the next time of fork.
>
> Since it has a time gap between using the sysctl to enable the COW PTE
> and doing the fork, we use two states to determine the task that wants
> to do COW PTE or already doing it.
I don’t get why it is needed in general and certainly why sysctl controls
this behavior.
IIUC, it sounds that you want prctl and not sysctl for such control. But
clearly you think that this control is needed because there is a tradeoff.
Please explain the tradeoff and how users are expected to make a decision
whether to turn the flag or not.
On Tue, Sep 27, 2022 at 05:27:45PM +0000, Nadav Amit wrote:
> On Sep 27, 2022, at 9:29 AM, Chih-En Lin <[email protected]> wrote:
>
> > Add a new sysctl vm.cow_pte to set MMF_COW_PTE_READY flag for enabling
> > copy-on-write (COW) to the PTE page table during the next time of fork.
> >
> > Since it has a time gap between using the sysctl to enable the COW PTE
> > and doing the fork, we use two states to determine the task that wants
> > to do COW PTE or already doing it.
>
> I don’t get why it is needed in general and certainly why sysctl controls
> this behavior.
>
> IIUC, it sounds that you want prctl and not sysctl for such control. But
> clearly you think that this control is needed because there is a tradeoff.
> Please explain the tradeoff and how users are expected to make a decision
> whether to turn the flag or not.
>
If applying COW to the page table, it will has a significantly change
to kernel, this is why I think it uses the sysctl at first.
But, prctl might be better a choice.
For the tradeoff. Since, in some cases (like executing the command in
the terminal), enabling COW to page table only will increase the
overhead due to the page fault (break COW). It doesn't have any benefit
from the COW mechanism. So, we let the users decide which process will
enable COW page table.
The expected user usually will be the process that requires a lot of
memory and want to create a new process for an isolated environment.
(e.g., fuzzer, container, etc) So, expand COW to page table may
improves the startup time and memory usage (on-demand allocate memory).
Thanks,
Chih-En Lin
On 9/27/22 09:29, Chih-En Lin wrote:
> Add a new sysctl vm.cow_pte to set MMF_COW_PTE_READY flag for enabling
> copy-on-write (COW) to the PTE page table during the next time of fork.
>
> Since it has a time gap between using the sysctl to enable the COW PTE
> and doing the fork, we use two states to determine the task that wants
> to do COW PTE or already doing it.
>
> Signed-off-by: Chih-En Lin <[email protected]>
> ---
> include/linux/pgtable.h | 6 ++++++
> kernel/fork.c | 5 +++++
> kernel/sysctl.c | 8 ++++++++
> mm/Makefile | 2 +-
> mm/cow_pte.c | 39 +++++++++++++++++++++++++++++++++++++++
> 5 files changed, 59 insertions(+), 1 deletion(-)
> create mode 100644 mm/cow_pte.c
>
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 014ee8f0fbaab..d03d01aefe989 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -937,6 +937,12 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
> __ptep_modify_prot_commit(vma, addr, ptep, pte);
> }
> #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
> +
> +int cow_pte_handler(struct ctl_table *table, int write, void *buffer,
> + size_t *lenp, loff_t *ppos);
> +
> +extern int sysctl_cow_pte_pid;
So are setting a global value, to a single pid?? Only one pid at a time
can be set up?
I think that tells you already that there is a huge API problem here.
As the other thread with Nadav said, this is not a sysctl. It wants to
be a prctl(), at least the way things look so far.
> +
> #endif /* CONFIG_MMU */
>
> /*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 8a9e92068b150..6981944a7c6ec 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2671,6 +2671,11 @@ pid_t kernel_clone(struct kernel_clone_args *args)
> trace = 0;
> }
>
> + if (current->mm && test_bit(MMF_COW_PTE_READY, ¤t->mm->flags)) {
> + clear_bit(MMF_COW_PTE_READY, ¤t->mm->flags);
> + set_bit(MMF_COW_PTE, ¤t->mm->flags);
> + }
> +
> p = copy_process(NULL, trace, NUMA_NO_NODE, args);
> add_latent_entropy();
>
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 205d605cacc5b..c4f54412ae3a9 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -2360,6 +2360,14 @@ static struct ctl_table vm_table[] = {
> .mode = 0644,
> .proc_handler = mmap_min_addr_handler,
> },
> + {
> + .procname = "cow_pte",
> + .data = &sysctl_cow_pte_pid,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = cow_pte_handler,
> + .extra1 = SYSCTL_ZERO,
> + },
> #endif
> #ifdef CONFIG_NUMA
> {
> diff --git a/mm/Makefile b/mm/Makefile
> index 9a564f8364035..7a568d5066ee6 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -40,7 +40,7 @@ mmu-y := nommu.o
> mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
> mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
> msync.o page_vma_mapped.o pagewalk.o \
> - pgtable-generic.o rmap.o vmalloc.o
> + pgtable-generic.o rmap.o vmalloc.o cow_pte.o
>
>
> ifdef CONFIG_CROSS_MEMORY_ATTACH
> diff --git a/mm/cow_pte.c b/mm/cow_pte.c
> new file mode 100644
> index 0000000000000..4e50aa4294ce7
> --- /dev/null
> +++ b/mm/cow_pte.c
> @@ -0,0 +1,39 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/sysctl.h>
> +#include <linux/pgtable.h>
> +#include <linux/sched.h>
> +#include <linux/sched/coredump.h>
> +#include <linux/pid.h>
> +
> +/* sysctl will write to this variable */
> +int sysctl_cow_pte_pid = -1;
> +
> +static void set_cow_pte_task(void)
> +{
> + struct pid *pid;
> + struct task_struct *task;
> +
> + pid = find_get_pid(sysctl_cow_pte_pid);
This seems to be missing a corresponding call to put_pid().
thanks,
--
John Hubbard
NVIDIA
On Tue, Sep 27, 2022 at 02:22:27PM -0700, John Hubbard wrote:
> > +extern int sysctl_cow_pte_pid;
>
> So are setting a global value, to a single pid?? Only one pid at a time
> can be set up?
>
> I think that tells you already that there is a huge API problem here.
>
> As the other thread with Nadav said, this is not a sysctl. It wants to
> be a prctl(), at least the way things look so far.
I will change it to use the prctl().
Probably it will be under PR_SET_MM or create a new option.
> > +static void set_cow_pte_task(void)
> > +{
> > + struct pid *pid;
> > + struct task_struct *task;
> > +
> > + pid = find_get_pid(sysctl_cow_pte_pid);
>
> This seems to be missing a corresponding call to put_pid().
Thanks.
> thanks,
>
> --
> John Hubbard
> NVIDIA
>
Best regards,
Chih-En Lin