From: David Hildenbrand
To: linux-kernel@vger.kernel.org
Cc: Andrew Morton, Hugh Dickins, Linus Torvalds, David Rientjes,
	Shakeel Butt, John Hubbard, Jason Gunthorpe, Mike Kravetz,
	Mike Rapoport, Yang Shi, "Kirill A . Shutemov", Matthew Wilcox,
	Vlastimil Babka, Jann Horn, Michal Hocko, Nadav Amit,
	Rik van Riel, Roman Gushchin, Andrea Arcangeli, Peter Xu,
	Donald Dutile, Christoph Hellwig, Oleg Nesterov, Jan Kara,
	linux-mm@kvack.org, linux-kselftest@vger.kernel.org,
	linux-doc@vger.kernel.org, David Hildenbrand
Subject: [PATCH v1 09/11] mm: gup: trigger unsharing via FAULT_FLAG_UNSHARE when required (hugetlb)
Date: Fri, 17 Dec 2021 12:30:47 +0100
Message-Id: <20211217113049.23850-10-david@redhat.com>
In-Reply-To: <20211217113049.23850-1-david@redhat.com>
References: <20211217113049.23850-1-david@redhat.com>

Similar to the !hugetlb variant, invoke unsharing for shared anonymous
pages when required during GUP by setting FAULT_FLAG_UNSHARE in hugetlb
code as well.

FAULT_FLAG_UNSHARE will trigger unsharing of shared anonymous pages
during GUP, resulting in a child process no longer being able to
observe memory modifications performed by the parent after fork() to
anonymous shared hugetlb pages.

This commit is based on prototype patches by Andrea.
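For illustration only (not part of this patch): a minimal userspace
sketch of the scenario described above, loosely modeled on the
vmsplice()-based reproducers discussed in this series. vmsplice()
takes a R/O GUP reference on the target page, so it stands in for GUP
here. The 2 MiB hugepage size, the crude sleep()-based synchronization,
and the availability of preallocated hugetlb pages (vm.nr_hugepages)
are assumptions of the sketch, and the exact pre-fix outcome depends on
the kernel's COW reuse logic.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/wait.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumes 2 MiB hugetlb pages */

int main(void)
{
	int pipefd[2];
	char *map;
	char buf;

	if (pipe(pipefd))
		return 1;
	map = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (map == MAP_FAILED)
		return 1;	/* needs preallocated hugetlb pages */
	memset(map, 'A', HPAGE_SIZE);	/* populate the hugetlb page */

	if (fork() == 0) {
		/* Child: vmsplice() takes a R/O GUP reference on the page. */
		struct iovec iov = { .iov_base = map, .iov_len = 1 };

		vmsplice(pipefd[1], &iov, 1, 0);
		/* Drop the child's mapping; only the parent maps it now. */
		munmap(map, HPAGE_SIZE);
		sleep(2);		/* crude: wait for the parent's write */
		read(pipefd[0], &buf, 1);
		/*
		 * On kernels without this change, the pinned page may get
		 * reused by the parent's write fault, making 'B' observable
		 * here; with FAULT_FLAG_UNSHARE, the child pinned its own
		 * exclusive copy and keeps reading 'A'.
		 */
		printf("child observed: %c\n", buf);
		_exit(0);
	}
	sleep(1);			/* crude: let the child pin first */
	map[0] = 'B';			/* parent write after fork() */
	wait(NULL);
	return 0;
}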
Co-developed-by: Andrea Arcangeli
Signed-off-by: Andrea Arcangeli
Reviewed-by: Peter Xu
Signed-off-by: David Hildenbrand
---
 mm/gup.c     |  3 +--
 mm/hugetlb.c | 43 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 2a83388c3fb4..35d1b28e3829 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -67,8 +67,7 @@ bool gup_must_unshare(unsigned int flags, struct page *page, bool is_head)
 	if (PageKsm(page))
 		return false;
 	if (PageHuge(page))
-		/* TODO: handle hugetlb as well. */
-		return false;
+		return __page_mapcount(page) > 1;
 	if (is_head) {
 		VM_BUG_ON(!PageTransHuge(page));
 		return page_trans_huge_mapcount(page, NULL) > 1;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f2863b046ef..dc42018ee1a6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5971,6 +5971,25 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
 	}
 }
 
+static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+					       bool *unshare)
+{
+	pte_t pteval = huge_ptep_get(pte);
+
+	*unshare = false;
+	if (is_swap_pte(pteval))
+		return true;
+	if (huge_pte_write(pteval))
+		return false;
+	if (flags & FOLL_WRITE)
+		return true;
+	if (gup_must_unshare(flags, pte_page(pteval), true)) {
+		*unshare = true;
+		return true;
+	}
+	return false;
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			 struct page **pages, struct vm_area_struct **vmas,
			 unsigned long *position, unsigned long *nr_pages,
@@ -5985,6 +6004,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
 		spinlock_t *ptl = NULL;
+		bool unshare;
 		int absent;
 		struct page *page;
 
@@ -6035,9 +6055,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
		 * both cases, and because we can't follow correct pages
		 * directly from any kind of swap entries.
		 */
-		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
-		    ((flags & FOLL_WRITE) &&
-		      !huge_pte_write(huge_ptep_get(pte)))) {
+		if (absent ||
+		    __follow_hugetlb_must_fault(flags, pte, &unshare)) {
			vm_fault_t ret;
			unsigned int fault_flags = 0;
 
@@ -6045,6 +6064,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
				spin_unlock(ptl);
			if (flags & FOLL_WRITE)
				fault_flags |= FAULT_FLAG_WRITE;
+			else if (unshare)
+				fault_flags |= FAULT_FLAG_UNSHARE;
			if (locked)
				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
					FAULT_FLAG_KILLABLE;
@@ -6734,7 +6755,21 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		goto out;
	pte = huge_ptep_get((pte_t *)pmd);
	if (pte_present(pte)) {
-		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+		struct page *head_page = pmd_page(*pmd);
+
+		/*
+		 * follow_huge_pmd() is only called when coming via
+		 * follow_page(), where we set FOLL_NOUNSHARE. Ordinary GUP
+		 * goes via follow_hugetlb_page(), where we can properly unshare
+		 * if required.
+		 */
+		if (WARN_ON_ONCE(!huge_pte_write(pte) &&
+				 gup_must_unshare(flags, head_page, true))) {
+			page = NULL;
+			goto out;
+		}
+
+		page = head_page + ((address & ~PMD_MASK) >> PAGE_SHIFT);
		/*
		 * try_grab_page() should always succeed here, because: a) we
		 * hold the pmd (ptl) lock, and b) we've just checked that the
-- 
2.31.1