Subject: Re: [HMM-v25 10/19] mm/memcontrol: support MEMORY_DEVICE_PRIVATE v4
To: Jérôme Glisse, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: John Hubbard, Dan Williams, David Nellans, Balbir Singh, Johannes Weiner, Michal Hocko, Vladimir Davydov, cgroups@vger.kernel.org
References: <20170817000548.32038-1-jglisse@redhat.com> <20170817000548.32038-11-jglisse@redhat.com>
From: Laurent Dufour
Date: Tue, 5 Sep 2017 19:13:15 +0200
In-Reply-To: <20170817000548.32038-11-jglisse@redhat.com>

On 17/08/2017 02:05, Jérôme Glisse wrote:
> HMM pages (private or public device pages) are ZONE_DEVICE pages and
> thus need special handling when it comes to LRU or refcount. This
> patch makes sure that memcontrol handles them properly when it
> encounters them. These pages are used like regular pages in a process
> address space, either as anonymous pages or as file-backed pages, so
> from the memcg point of view we want to treat them like regular
> pages, at least for now.
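
A side note for readers of the thread: the "special handling when it comes to refcount" above boils down to ZONE_DEVICE private pages being considered free at a refcount of 1 rather than 0. A minimal sketch of what taking a speculative reference then looks like (illustration only, not part of the patch; the helper name is made up, but it is built on the same page_ref_add_unless() call the patch uses below):

	/*
	 * Illustration only: a ZONE_DEVICE private page is free at
	 * refcount 1, so a speculative get must refuse a page whose
	 * count is exactly 1.
	 */
	static inline bool get_device_private_page_unless_free(struct page *page)
	{
		/* Fails (returns false) when the refcount is 1, i.e. free. */
		return page_ref_add_unless(page, 1, 1);
	}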
> 
> Changed since v3:
>   - remove public support and move those chunks to a separate patch
> Changed since v2:
>   - s/host/public
> Changed since v1:
>   - s/public/host
>   - add comments explaining how device memory behaves and why
> 
> Signed-off-by: Jérôme Glisse
> Acked-by: Balbir Singh
> Cc: Johannes Weiner
> Cc: Michal Hocko
> Cc: Vladimir Davydov
> Cc: cgroups@vger.kernel.org
> ---
>  kernel/memremap.c |  1 +
>  mm/memcontrol.c   | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 398630c1fba3..f42d7483e886 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -492,6 +492,7 @@ void put_zone_device_private_page(struct page *page)
>  		__ClearPageWaiters(page);
>  
>  		page->mapping = NULL;
> +		mem_cgroup_uncharge(page);
>  
>  		page->pgmap->page_free(page, page->pgmap->data);
>  	} else if (!count)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 604fb3ca8028..977d1cf3493a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4407,12 +4407,13 @@ enum mc_target_type {
>  	MC_TARGET_NONE = 0,
>  	MC_TARGET_PAGE,
>  	MC_TARGET_SWAP,
> +	MC_TARGET_DEVICE,
>  };
>  
>  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
>  						unsigned long addr, pte_t ptent)
>  {
> -	struct page *page = vm_normal_page(vma, addr, ptent);
> +	struct page *page = _vm_normal_page(vma, addr, ptent, true);

Hi Jérôme,

Since _vm_normal_page() is only defined later, in patch 18 of the series, this patch breaks bisectability.

Cheers,
Laurent.

> 
>  	if (!page || !page_mapped(page))
>  		return NULL;
> @@ -4429,7 +4430,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
>  	return page;
>  }
>  
> -#ifdef CONFIG_SWAP
> +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
>  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
>  			pte_t ptent, swp_entry_t *entry)
>  {
> @@ -4438,6 +4439,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
>  
>  	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
>  		return NULL;
> +
> +	/*
> +	 * Handle MEMORY_DEVICE_PRIVATE, i.e. ZONE_DEVICE pages belonging to
> +	 * a device; because they are not accessible by the CPU, they are
> +	 * stored as special swap entries in the CPU page table.
> +	 */
> +	if (is_device_private_entry(ent)) {
> +		page = device_private_entry_to_page(ent);
> +		/*
> +		 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE pages, which have
> +		 * a refcount of 1 when free (unlike normal pages).
> +		 */
> +		if (!page_ref_add_unless(page, 1, 1))
> +			return NULL;
> +		return page;
> +	}
> +
>  	/*
>  	 * Because lookup_swap_cache() updates some statistics counter,
>  	 * we call find_get_page() with swapper_space directly.
> @@ -4598,6 +4616,12 @@ static int mem_cgroup_move_account(struct page *page,
>   * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>   *   target for charge migration. if @target is not NULL, the entry is stored
>   *   in target->ent.
> + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
> + *   (so a ZONE_DEVICE page, thus not on the LRU). For now such a page is
> + *   charged like a regular page would be, as for all intents and purposes
> + *   it is just special memory taking the place of a regular page.
> + *
> + * See Documentation/vm/hmm.txt and include/linux/hmm.h
>   *
>   * Called with pte lock held.
>   */
> @@ -4626,6 +4650,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>  		 */
>  		if (page->mem_cgroup == mc.from) {
>  			ret = MC_TARGET_PAGE;
> +			if (is_device_private_page(page))
> +				ret = MC_TARGET_DEVICE;
>  			if (target)
>  				target->page = page;
>  		}
> @@ -4693,6 +4719,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
>  
>  	ptl = pmd_trans_huge_lock(pmd, vma);
>  	if (ptl) {
> +		/*
> +		 * Note there cannot be MC_TARGET_DEVICE for now, as we do not
> +		 * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
> +		 * MEMORY_DEVICE_PRIVATE, but this might change.
> +		 */
>  		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
>  			mc.precharge += HPAGE_PMD_NR;
>  		spin_unlock(ptl);
> @@ -4908,6 +4939,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>  			putback_lru_page(page);
>  		}
>  		put_page(page);
> +	} else if (target_type == MC_TARGET_DEVICE) {
> +		page = target.page;
> +		if (!mem_cgroup_move_account(page, true,
> +					     mc.from, mc.to)) {
> +			mc.precharge -= HPAGE_PMD_NR;
> +			mc.moved_charge += HPAGE_PMD_NR;
> +		}
> +		put_page(page);
>  	}
>  	spin_unlock(ptl);
>  	return 0;
> @@ -4919,12 +4958,16 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  	for (; addr != end; addr += PAGE_SIZE) {
>  		pte_t ptent = *(pte++);
> +		bool device = false;
>  		swp_entry_t ent;
>  
>  		if (!mc.precharge)
>  			break;
>  
>  		switch (get_mctgt_type(vma, addr, ptent, &target)) {
> +		case MC_TARGET_DEVICE:
> +			device = true;
> +			/* fall through */
>  		case MC_TARGET_PAGE:
>  			page = target.page;
>  			/*
> @@ -4935,7 +4978,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>  			 */
>  			if (PageTransCompound(page))
>  				goto put;
> -			if (isolate_lru_page(page))
> +			if (!device && isolate_lru_page(page))
>  				goto put;
>  			if (!mem_cgroup_move_account(page, false,
>  						mc.from, mc.to)) {
> @@ -4943,7 +4986,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>  				/* we uncharge from mc.from later. */
>  				mc.moved_charge++;
>  			}
> -			putback_lru_page(page);
> +			if (!device)
> +				putback_lru_page(page);
> put:			/* get_mctgt_type() gets the page */
>  			put_page(page);
>  			break;
> 
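
To make the bisect concern concrete: one way to keep every step of the series building would be to introduce _vm_normal_page() in this very patch as a trivial wrapper and let patch 18 replace it with the real implementation. A hypothetical stopgap, not something in the posted series; the flag name below is made up, only its position matches the call site above (_vm_normal_page(vma, addr, ptent, true)):

	/*
	 * Hypothetical stopgap for bisectability, not part of the series:
	 * until patch 18 adds the real _vm_normal_page(), ignore the extra
	 * flag and fall back to plain vm_normal_page().
	 */
	static inline struct page *_vm_normal_page(struct vm_area_struct *vma,
						   unsigned long addr, pte_t pte,
						   bool with_public_device)
	{
		return vm_normal_page(vma, addr, pte);
	}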