Subject: Re: [RFC v5 02/11] mm: Prepare for FAULT_FLAG_SPECULATIVE
To: Laurent Dufour, paulmck@linux.vnet.ibm.com, peterz@infradead.org, akpm@linux-foundation.org, kirill@shutemov.name, ak@linux.intel.com, mhocko@kernel.org, dave@stgolabs.net, jack@suse.cz, Matthew Wilcox
References: <1497635555-25679-1-git-send-email-ldufour@linux.vnet.ibm.com> <1497635555-25679-3-git-send-email-ldufour@linux.vnet.ibm.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org, haren@linux.vnet.ibm.com, khandual@linux.vnet.ibm.com, npiggin@gmail.com, bsingharora@gmail.com, Tim Chen
From: Anshuman Khandual
Date: Tue, 8 Aug 2017 15:54:01 +0530
In-Reply-To: <1497635555-25679-3-git-send-email-ldufour@linux.vnet.ibm.com>
Message-Id: <7e770060-32b2-c136-5d34-2f078800df21@linux.vnet.ibm.com>

On 06/16/2017 11:22 PM, Laurent Dufour wrote:
> From: Peter Zijlstra
>
> When speculating faults (without holding mmap_sem) we need to validate
> that the vma against which we loaded pages is still valid when we're
> ready to install the new PTE.
>
> Therefore, replace the pte_offset_map_lock() calls that (re)take the
> PTL with pte_map_lock() which can fail in case we find the VMA changed
> since we started the fault.

Where are we checking whether the VMA has changed since the fault started?

>
> Signed-off-by: Peter Zijlstra (Intel)
>
> [Port to 4.12 kernel]
> [Remove the comment about the fault_env structure which has been
> implemented as the vm_fault structure in the kernel]
> Signed-off-by: Laurent Dufour
> ---
>  include/linux/mm.h |  1 +
>  mm/memory.c        | 55 ++++++++++++++++++++++++++++++++++++++----------------
>  2 files changed, 40 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b892e95d4929..6b7ec2a76953 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -286,6 +286,7 @@ extern pgprot_t protection_map[16];
> #define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
> #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
> #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
> +#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not holding mmap_sem */

We are not using this yet; maybe it can wait until later in the series.
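From the commit message I assume a later patch in the series will make
pte_map_lock() actually fail when FAULT_FLAG_SPECULATIVE is set and the
VMA has been modified since the fault started. Something along the lines
of the sketch below is what I have in mind; the vmf->sequence snapshot,
the per-VMA vm_sequence seqcount and the exact re-validation logic are my
guesses, not anything present in this patch:

    /*
     * Rough sketch only -- not part of this patch. Assumes a later patch
     * adds a per-VMA sequence count (called vm_sequence here) which
     * writers bump while modifying the VMA, plus a snapshot taken when
     * the speculative fault starts (vmf->sequence). Both are hypothetical.
     */
    static bool pte_map_lock(struct vm_fault *vmf)
    {
            if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
                    /* Classic path: mmap_sem is held, this cannot fail. */
                    vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                                   vmf->address, &vmf->ptl);
                    return true;
            }

            /*
             * Speculative path: mmap_sem is not held, so re-validate that
             * the VMA has not changed since the fault started before
             * taking the PTL and installing anything.
             */
            if (read_seqcount_retry(&vmf->vma->vm_sequence, vmf->sequence))
                    return false;

            vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                           vmf->address, &vmf->ptl);

            /* Re-check under the PTL in case we raced with a VMA writer. */
            if (read_seqcount_retry(&vmf->vma->vm_sequence, vmf->sequence)) {
                    pte_unmap_unlock(vmf->pte, vmf->ptl);
                    return false;
            }
            return true;
    }

If that is roughly the plan, a forward pointer to it in this patch's
changelog would make the preparatory error paths easier to review.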
>
> #define FAULT_FLAG_TRACE \
> { FAULT_FLAG_WRITE, "WRITE" }, \
> diff --git a/mm/memory.c b/mm/memory.c
> index fd952f05e016..40834444ea0d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2240,6 +2240,12 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> }
>
> +static bool pte_map_lock(struct vm_fault *vmf)
> +{
> + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
> + return true;
> +}

This always returns true? Then we should not need all these
if (!pte_map_lock(vmf)) check blocks below.

> +
> /*
> * Handle the case of a page which we actually need to copy to a new page.
> *
> @@ -2267,6 +2273,7 @@ static int wp_page_copy(struct vm_fault *vmf)
> const unsigned long mmun_start = vmf->address & PAGE_MASK;
> const unsigned long mmun_end = mmun_start + PAGE_SIZE;
> struct mem_cgroup *memcg;
> + int ret = VM_FAULT_OOM;
>

If we remove the check block around pte_map_lock(), initializing ret to
VM_FAULT_OOM here becomes redundant as well.

> if (unlikely(anon_vma_prepare(vma)))
> goto oom;
> @@ -2294,7 +2301,11 @@ static int wp_page_copy(struct vm_fault *vmf)
> /*
> * Re-check the pte - we dropped the lock
> */
> - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
> + if (!pte_map_lock(vmf)) {
> + mem_cgroup_cancel_charge(new_page, memcg, false);
> + ret = VM_FAULT_RETRY;
> + goto oom_free_new;
> + }
> if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> if (old_page) {
> if (!PageAnon(old_page)) {
> @@ -2382,7 +2393,7 @@ static int wp_page_copy(struct vm_fault *vmf)
> oom:
> if (old_page)
> put_page(old_page);
> - return VM_FAULT_OOM;
> + return ret;
> }
>
> /**
> @@ -2403,8 +2414,8 @@ static int wp_page_copy(struct vm_fault *vmf)
> int finish_mkwrite_fault(struct vm_fault *vmf)
> {
> WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
> - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
> - &vmf->ptl);
> + if (!pte_map_lock(vmf))
> + return VM_FAULT_RETRY;

Can't fail.

> /*
> * We might have raced with another page fault while we released the
> * pte_offset_map_lock.
> */
> @@ -2522,8 +2533,11 @@ static int do_wp_page(struct vm_fault *vmf)
> get_page(vmf->page);
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> lock_page(vmf->page);
> - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> - vmf->address, &vmf->ptl);
> + if (!pte_map_lock(vmf)) {
> + unlock_page(vmf->page);
> + put_page(vmf->page);
> + return VM_FAULT_RETRY;
> + }

Same here.

> if (!pte_same(*vmf->pte, vmf->orig_pte)) {
> unlock_page(vmf->page);
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -2681,8 +2695,10 @@ int do_swap_page(struct vm_fault *vmf)
> * Back out if somebody else faulted in this pte
> * while we released the pte lock.
> */
> - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> - vmf->address, &vmf->ptl);
> + if (!pte_map_lock(vmf)) {
> + delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
> + return VM_FAULT_RETRY;
> + }
> if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
> ret = VM_FAULT_OOM;
> delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
> @@ -2738,8 +2754,11 @@ int do_swap_page(struct vm_fault *vmf)
> /*
> * Back out if somebody else already faulted in this pte.
> */
> - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> - &vmf->ptl);
> + if (!pte_map_lock(vmf)) {
> + ret = VM_FAULT_RETRY;
> + mem_cgroup_cancel_charge(page, memcg, false);
> + goto out_page;
> + }
> if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
> goto out_nomap;
>
> @@ -2903,8 +2922,8 @@ static int do_anonymous_page(struct vm_fault *vmf)
> !mm_forbids_zeropage(vma->vm_mm)) {
> entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
> vma->vm_page_prot));
> - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> - vmf->address, &vmf->ptl);
> + if (!pte_map_lock(vmf))
> + return VM_FAULT_RETRY;
> if (!pte_none(*vmf->pte))
> goto unlock;
> /* Deliver the page fault to userland, check inside PT lock */
> @@ -2936,8 +2955,11 @@ static int do_anonymous_page(struct vm_fault *vmf)
> if (vma->vm_flags & VM_WRITE)
> entry = pte_mkwrite(pte_mkdirty(entry));
>
> - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> - &vmf->ptl);
> + if (!pte_map_lock(vmf)) {
> + mem_cgroup_cancel_charge(page, memcg, false);
> + put_page(page);
> + return VM_FAULT_RETRY;
> + }
> if (!pte_none(*vmf->pte))
> goto release;
>
> @@ -3057,8 +3079,9 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
> * pte_none() under vmf->ptl protection when we return to
> * alloc_set_pte().
> */
> - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> - &vmf->ptl);
> + if (!pte_map_lock(vmf))
> + return VM_FAULT_RETRY;
> +
> return 0;

All these 'if' blocks seem redundant, unless I am missing something.
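For what it is worth, here is roughly how I picture the caller side once
the speculative entry point exists, which is the only way I can see these
error paths paying off. handle_speculative_fault() is a placeholder name
for whatever the later patches introduce; the point is just that a
VM_FAULT_RETRY coming out of a failed pte_map_lock() should make the arch
fault handler fall back to the normal, mmap_sem-protected path:

    /*
     * Sketch only, not code from this series. Stack expansion and the
     * FAULT_FLAG_ALLOW_RETRY mmap_sem drop/retry protocol are elided.
     */
    static int fault_with_speculation(struct mm_struct *mm,
                                      unsigned long address,
                                      unsigned int flags)
    {
            struct vm_area_struct *vma;
            int ret;

            /* Lockless attempt; pte_map_lock() failures surface as RETRY. */
            ret = handle_speculative_fault(mm, address,
                                           flags | FAULT_FLAG_SPECULATIVE);
            if (!(ret & VM_FAULT_RETRY))
                    return ret;

            /* Speculation lost the race: fall back to the classic path. */
            down_read(&mm->mmap_sem);
            vma = find_vma(mm, address);
            if (!vma || vma->vm_start > address) {
                    up_read(&mm->mmap_sem);
                    return VM_FAULT_SIGSEGV;
            }
            ret = handle_mm_fault(vma, address, flags);
            up_read(&mm->mmap_sem);
            return ret;
    }

Until something like that lands, pte_map_lock() cannot return false, which
is why all the new if blocks read as dead code in this patch.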