Some of the features (unmap, migrate, device exclusive, mkclean, etc.)
might modify the PTE entry via rmap. Add a new page vma mapped walk
flag, PVMW_BREAK_COW_PTE, to tell the rmap walk to break the COW-ed PTE
table before modifying the entry.
Signed-off-by: Chih-En Lin <[email protected]>
---
include/linux/rmap.h | 2 ++
mm/migrate.c | 3 ++-
mm/page_vma_mapped.c | 2 ++
mm/rmap.c | 12 +++++++-----
mm/vmscan.c | 7 ++++++-
5 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bd3504d11b155..d0f07e5519736 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -368,6 +368,8 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
#define PVMW_SYNC (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION (1 << 1)
+/* Break COW-ed PTE during walking */
+#define PVMW_BREAK_COW_PTE (1 << 2)
struct page_vma_mapped_walk {
unsigned long pfn;
diff --git a/mm/migrate.c b/mm/migrate.c
index dff333593a8ae..a4be7e04c9b09 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -174,7 +174,8 @@ void putback_movable_pages(struct list_head *l)
static bool remove_migration_pte(struct folio *folio,
struct vm_area_struct *vma, unsigned long addr, void *old)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+ DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr,
+ PVMW_SYNC | PVMW_MIGRATION | PVMW_BREAK_COW_PTE);
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 93e13fc17d3cb..5dfc9236dc505 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -251,6 +251,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
step_forward(pvmw, PMD_SIZE);
continue;
}
+ if (pvmw->flags & PVMW_BREAK_COW_PTE)
+ break_cow_pte(vma, pvmw->pmd, pvmw->address);
if (!map_pte(pvmw))
goto next_pte;
this_pte:
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ec925e5fa6a9..b1b7dcbd498be 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -807,7 +807,8 @@ static bool folio_referenced_one(struct folio *folio,
struct vm_area_struct *vma, unsigned long address, void *arg)
{
struct folio_referenced_arg *pra = arg;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ /* it will clear the entry, so we should break COW PTE. */
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_BREAK_COW_PTE);
int referenced = 0;
while (page_vma_mapped_walk(&pvmw)) {
@@ -1012,7 +1013,8 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
+ PVMW_SYNC | PVMW_BREAK_COW_PTE);
int *cleaned = arg;
*cleaned += page_vma_mkclean_one(&pvmw);
@@ -1471,7 +1473,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_BREAK_COW_PTE);
pte_t pteval;
struct page *subpage;
bool anon_exclusive, ret = true;
@@ -1842,7 +1844,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_BREAK_COW_PTE);
pte_t pteval;
struct page *subpage;
bool anon_exclusive, ret = true;
@@ -2195,7 +2197,7 @@ static bool page_make_device_exclusive_one(struct folio *folio,
struct vm_area_struct *vma, unsigned long address, void *priv)
{
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_BREAK_COW_PTE);
struct make_exclusive_args *args = priv;
pte_t pteval;
struct page *subpage;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 026199c047e0e..980d2056adfd1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1781,6 +1781,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
}
}
+ /*
+ * Break COW PTE since checking the folio's references
+ * might modify the PTE.
+ */
if (!ignore_references)
references = folio_check_references(folio, sc);
@@ -1864,7 +1868,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
/*
* The folio is mapped into the page tables of one or more
- * processes. Try to unmap it here.
+ * processes. Try to unmap it here. Also, since unmapping writes
+ * to the page tables, break any COW-ed PTE table first.
*/
if (folio_mapped(folio)) {
enum ttu_flags flags = TTU_BATCH_FLUSH;
--
2.37.3
On Tue, Dec 20, 2022 at 8:25 PM Chih-En Lin <[email protected]> wrote:
>
> [...]
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2ec925e5fa6a9..b1b7dcbd498be 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -807,7 +807,8 @@ static bool folio_referenced_one(struct folio *folio,
> struct vm_area_struct *vma, unsigned long address, void *arg)
> {
> struct folio_referenced_arg *pra = arg;
> - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> + /* it will clear the entry, so we should break COW PTE. */
> + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_BREAK_COW_PTE);
What do you mean by breaking COW PTE? In the memory reclamation case, we
are only checking and clearing the page referenced bit in the PTE; do we
really need to break COW?
Thanks
Barry
On Mon, Dec 26, 2022 at 10:40:49PM +1300, Barry Song wrote:
> On Tue, Dec 20, 2022 at 8:25 PM Chih-En Lin <[email protected]> wrote:
> >
> > [...]
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 2ec925e5fa6a9..b1b7dcbd498be 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -807,7 +807,8 @@ static bool folio_referenced_one(struct folio *folio,
> > struct vm_area_struct *vma, unsigned long address, void *arg)
> > {
> > struct folio_referenced_arg *pra = arg;
> > - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> > + /* it will clear the entry, so we should break COW PTE. */
> > + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_BREAK_COW_PTE);
>
> What do you mean by breaking COW PTE? In the memory reclamation case, we
> are only checking and clearing the page referenced bit in the PTE; do we
> really need to break COW?
Since we might clear the page referenced bit, the walk will write to the
write-protected shared page table (the COW-ed PTE), so we should duplicate
it first.

Actually, I didn't break COW here at first, because this path only
conditionally modifies the table and only clears the referenced bit. So,
if clearing the referenced bit directly in the COW-ed PTE table is fine
and breaking COW PTE is unnecessary here, we can remove it.
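
To make the concern concrete, the write I have in mind is the young-bit
clear in folio_referenced_one(); roughly (paraphrased from mm/rmap.c, not
the exact code):

	while (page_vma_mapped_walk(&pvmw)) {
		if (pvmw.pte) {
			/*
			 * Clearing the hardware accessed (young) bit is a
			 * write to the PTE. With COW PTE that PTE page may
			 * still be shared by several mms, which is why this
			 * patch conservatively breaks COW before map_pte().
			 */
			if (ptep_clear_flush_young_notify(vma, pvmw.address,
							  pvmw.pte))
				referenced++;
		}
	}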
Thanks,
Chih-En Lin
On Mon, Dec 26, 2022 at 11:56 PM Chih-En Lin <[email protected]> wrote:
>
> On Mon, Dec 26, 2022 at 10:40:49PM +1300, Barry Song wrote:
> > On Tue, Dec 20, 2022 at 8:25 PM Chih-En Lin <[email protected]> wrote:
> > >
> > > [...]
> >
> > What do you mean by breaking COW PTE? In the memory reclamation case, we
> > are only checking and clearing the page referenced bit in the PTE; do we
> > really need to break COW?
>
> Since we might clear the page referenced bit, the walk will write to the
> write-protected shared page table (the COW-ed PTE), so we should duplicate
> it first.
>
> Actually, I didn't break COW here at first, because this path only
> conditionally modifies the table and only clears the referenced bit. So,
> if clearing the referenced bit directly in the COW-ed PTE table is fine
> and breaking COW PTE is unnecessary here, we can remove it.
If a page is mapped by 100 processes and any of these 100 processes
accesses the page, we get the referenced bit set in the one shared PTE.
Otherwise, we would have to scan 100 PTE tables to figure out whether the
page has been accessed and should be kept on the LRU.

I don't see a fundamental need to duplicate the PTE table just to clear
the referenced bit; keeping the PTE shared saves a lot of cost in memory
reclamation on those CPUs which set the referenced bit in the PTE in
hardware.
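
In other words (untested, just to illustrate the direction I mean), the
folio_referenced_one() hunk could simply be dropped so that the walk keeps
using the shared table:

-	/* it will clear the entry, so we should break COW PTE. */
-	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_BREAK_COW_PTE);
+	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);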
>
> Thanks,
> Chih-En Lin
Thanks
barry
On Tue, Dec 27, 2022 at 02:15:00PM +1300, Barry Song wrote:
> On Mon, Dec 26, 2022 at 11:56 PM Chih-En Lin <[email protected]> wrote:
> >
> > On Mon, Dec 26, 2022 at 10:40:49PM +1300, Barry Song wrote:
> > > On Tue, Dec 20, 2022 at 8:25 PM Chih-En Lin <[email protected]> wrote:
> > > >
> > > > [...]
> > >
> > > What do you mean by breaking COW PTE? In the memory reclamation case, we
> > > are only checking and clearing the page referenced bit in the PTE; do we
> > > really need to break COW?
> >
> > Since we might clear the page referenced bit, the walk will write to the
> > write-protected shared page table (the COW-ed PTE), so we should duplicate
> > it first.
> >
> > Actually, I didn't break COW here at first, because this path only
> > conditionally modifies the table and only clears the referenced bit. So,
> > if clearing the referenced bit directly in the COW-ed PTE table is fine
> > and breaking COW PTE is unnecessary here, we can remove it.
>
> If a page is mapped by 100 processes and any of these 100 processes
> accesses the page, we get the referenced bit set in the one shared PTE.
> Otherwise, we would have to scan 100 PTE tables to figure out whether the
> page has been accessed and should be kept on the LRU.
>
> I don't see a fundamental need to duplicate the PTE table just to clear
> the referenced bit; keeping the PTE shared saves a lot of cost in memory
> reclamation on those CPUs which set the referenced bit in the PTE in
> hardware.
>
As I understand it, if the page's referenced flag is unset and the
accessed bit of the PTE entry is set, reclamation will clear the accessed
bit and set the referenced flag on the page.

So for such a case with COW PTE the logic is the same as with a normal
PTE. It makes sense. Thanks for helping me clear up this point.
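
For the record, my rough (simplified) reading of folio_check_references()
in mm/vmscan.c, which is what convinced me:

	referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
					   &vm_flags);
	referenced_folio = folio_test_clear_referenced(folio);

	if (referenced_ptes) {
		/*
		 * folio_referenced() has already cleared the accessed bit
		 * in the PTEs, shared or not; a reference through any
		 * sharer is enough, and the result only needs to be
		 * recorded in the folio's software flag here.
		 */
		folio_set_referenced(folio);
	}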
Thanks,
Chih-En Lin