Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752778AbcKUCGx (ORCPT ); Sun, 20 Nov 2016 21:06:53 -0500 Received: from mail-pf0-f194.google.com ([209.85.192.194]:35189 "EHLO mail-pf0-f194.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752075AbcKUCGv (ORCPT ); Sun, 20 Nov 2016 21:06:51 -0500 Subject: Re: [HMM v13 06/18] mm/ZONE_DEVICE/unaddressable: add special swap for unaddressable To: =?UTF-8?B?SsOpcsO0bWUgR2xpc3Nl?= , akpm@linux-foundation.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org References: <1479493107-982-1-git-send-email-jglisse@redhat.com> <1479493107-982-7-git-send-email-jglisse@redhat.com> Cc: John Hubbard , Dan Williams , Ross Zwisler From: Balbir Singh Message-ID: <3f759fff-fe8d-89c4-5c86-c9f27403bf3b@gmail.com> Date: Mon, 21 Nov 2016 13:06:45 +1100 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.0 MIME-Version: 1.0 In-Reply-To: <1479493107-982-7-git-send-email-jglisse@redhat.com> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 10677 Lines: 341 On 19/11/16 05:18, Jérôme Glisse wrote: > To allow use of device un-addressable memory inside a process add a > special swap type. Also add a new callback to handle page fault on > such entry. 
> > Signed-off-by: Jérôme Glisse > Cc: Dan Williams > Cc: Ross Zwisler > --- > fs/proc/task_mmu.c | 10 +++++++- > include/linux/memremap.h | 5 ++++ > include/linux/swap.h | 18 ++++++++++--- > include/linux/swapops.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ > kernel/memremap.c | 14 ++++++++++ > mm/Kconfig | 12 +++++++++ > mm/memory.c | 24 +++++++++++++++++ > mm/mprotect.c | 12 +++++++++ > 8 files changed, 158 insertions(+), 4 deletions(-) > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > index 6909582..0726d39 100644 > --- a/fs/proc/task_mmu.c > +++ b/fs/proc/task_mmu.c > @@ -544,8 +544,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, > } else { > mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; > } > - } else if (is_migration_entry(swpent)) > + } else if (is_migration_entry(swpent)) { > page = migration_entry_to_page(swpent); > + } else if (is_device_entry(swpent)) { > + page = device_entry_to_page(swpent); > + } So is the reason there is a device swap entry for a page belonging to a user process that the page is in the middle of migration, or does such a swap entry always represent unaddressable memory belonging to a GPU device that is nevertheless tracked in the page table entries of the process? 
> } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap > && pte_none(*pte))) { > page = find_get_entry(vma->vm_file->f_mapping, > @@ -708,6 +711,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, > > if (is_migration_entry(swpent)) > page = migration_entry_to_page(swpent); > + if (is_device_entry(swpent)) > + page = device_entry_to_page(swpent); > } > if (page) { > int mapcount = page_mapcount(page); > @@ -1191,6 +1196,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, > flags |= PM_SWAP; > if (is_migration_entry(entry)) > page = migration_entry_to_page(entry); > + > + if (is_device_entry(entry)) > + page = device_entry_to_page(entry); > } > > if (page && !PageAnon(page)) > diff --git a/include/linux/memremap.h b/include/linux/memremap.h > index b6f03e9..d584c74 100644 > --- a/include/linux/memremap.h > +++ b/include/linux/memremap.h > @@ -47,6 +47,11 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) > */ > struct dev_pagemap { > void (*free_devpage)(struct page *page, void *data); > + int (*fault)(struct vm_area_struct *vma, > + unsigned long addr, > + struct page *page, > + unsigned flags, > + pmd_t *pmdp); > struct vmem_altmap *altmap; > const struct resource *res; > struct percpu_ref *ref; > diff --git a/include/linux/swap.h b/include/linux/swap.h > index 7e553e1..599cb54 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -50,6 +50,17 @@ static inline int current_is_kswapd(void) > */ > > /* > + * Un-addressable device memory support > + */ > +#ifdef CONFIG_DEVICE_UNADDRESSABLE > +#define SWP_DEVICE_NUM 2 > +#define SWP_DEVICE_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + SWP_MIGRATION_NUM) > +#define SWP_DEVICE (MAX_SWAPFILES + SWP_HWPOISON_NUM + SWP_MIGRATION_NUM + 1) > +#else > +#define SWP_DEVICE_NUM 0 > +#endif > + > +/* > * NUMA node memory migration support > */ > #ifdef CONFIG_MIGRATION > @@ -71,7 +82,8 @@ static inline int current_is_kswapd(void) > 
#endif > > #define MAX_SWAPFILES \ > - ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) > + ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ > + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) > > /* > * Magic header for a swap area. The first part of the union is > @@ -442,8 +454,8 @@ static inline void show_swap_cache_info(void) > { > } > > -#define free_swap_and_cache(swp) is_migration_entry(swp) > -#define swapcache_prepare(swp) is_migration_entry(swp) > +#define free_swap_and_cache(e) (is_migration_entry(e) || is_device_entry(e)) > +#define swapcache_prepare(e) (is_migration_entry(e) || is_device_entry(e)) > > static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) > { > diff --git a/include/linux/swapops.h b/include/linux/swapops.h > index 5c3a5f3..d1aa425 100644 > --- a/include/linux/swapops.h > +++ b/include/linux/swapops.h > @@ -100,6 +100,73 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) > return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY); > } > > +#ifdef CONFIG_DEVICE_UNADDRESSABLE > +static inline swp_entry_t make_device_entry(struct page *page, bool write) > +{ > + return swp_entry(write?SWP_DEVICE_WRITE:SWP_DEVICE, page_to_pfn(page)); Code style: please add spaces around the '?' and ':' of the ternary operator here (checkpatch should flag this). > +} > + > +static inline bool is_device_entry(swp_entry_t entry) > +{ > + int type = swp_type(entry); > + return type == SWP_DEVICE || type == SWP_DEVICE_WRITE; > +} > + > +static inline void make_device_entry_read(swp_entry_t *entry) > +{ > + *entry = swp_entry(SWP_DEVICE, swp_offset(*entry)); > +} > + > +static inline bool is_write_device_entry(swp_entry_t entry) > +{ > + return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); > +} > + > +static inline struct page *device_entry_to_page(swp_entry_t entry) > +{ > + return pfn_to_page(swp_offset(entry)); > +} > + > +int device_entry_fault(struct vm_area_struct *vma, > + unsigned long addr, > + swp_entry_t entry, > + unsigned flags, > + pmd_t *pmdp); > +#else /* CONFIG_DEVICE_UNADDRESSABLE */ > +static 
inline swp_entry_t make_device_entry(struct page *page, bool write) > +{ > + return swp_entry(0, 0); > +} > + > +static inline void make_device_entry_read(swp_entry_t *entry) > +{ > +} > + > +static inline bool is_device_entry(swp_entry_t entry) > +{ > + return false; > +} > + > +static inline bool is_write_device_entry(swp_entry_t entry) > +{ > + return false; > +} > + > +static inline struct page *device_entry_to_page(swp_entry_t entry) > +{ > + return NULL; > +} > + > +static inline int device_entry_fault(struct vm_area_struct *vma, > + unsigned long addr, > + swp_entry_t entry, > + unsigned flags, > + pmd_t *pmdp) > +{ > + return VM_FAULT_SIGBUS; > +} > +#endif /* CONFIG_DEVICE_UNADDRESSABLE */ > + > #ifdef CONFIG_MIGRATION > static inline swp_entry_t make_migration_entry(struct page *page, int write) > { > diff --git a/kernel/memremap.c b/kernel/memremap.c > index cf83928..0670015 100644 > --- a/kernel/memremap.c > +++ b/kernel/memremap.c > @@ -18,6 +18,8 @@ > #include > #include > #include > +#include > +#include > > #ifndef ioremap_cache > /* temporary while we convert existing ioremap_cache users to memremap */ > @@ -200,6 +202,18 @@ void put_zone_device_page(struct page *page) > } > EXPORT_SYMBOL(put_zone_device_page); > > +int device_entry_fault(struct vm_area_struct *vma, > + unsigned long addr, > + swp_entry_t entry, > + unsigned flags, > + pmd_t *pmdp) > +{ > + struct page *page = device_entry_to_page(entry); > + > + return page->pgmap->fault(vma, addr, page, flags, pmdp); > +} > +EXPORT_SYMBOL(device_entry_fault); > + > static void pgmap_radix_release(struct resource *res) > { > resource_size_t key, align_start, align_size, align_end; > diff --git a/mm/Kconfig b/mm/Kconfig > index be0ee11..0a21411 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -704,6 +704,18 @@ config ZONE_DEVICE > > If FS_DAX is enabled, then say Y. 
> > +config DEVICE_UNADDRESSABLE > + bool "Un-addressable device memory (GPU memory, ...)" > + depends on ZONE_DEVICE > + > + help > + Allow to create struct page for un-addressable device memory > + ie memory that is only accessible by the device (or group of > + devices). > + > + This allow to migrate chunk of process memory to device memory > + while that memory is use by the device. > + > config FRAME_VECTOR > bool > > diff --git a/mm/memory.c b/mm/memory.c > index 15f2908..a83d690 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -889,6 +889,21 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, > pte = pte_swp_mksoft_dirty(pte); > set_pte_at(src_mm, addr, src_pte, pte); > } > + } else if (is_device_entry(entry)) { > + page = device_entry_to_page(entry); > + > + get_page(page); > + rss[mm_counter(page)]++; Why does rss count go up? > + page_dup_rmap(page, false); > + > + if (is_write_device_entry(entry) && > + is_cow_mapping(vm_flags)) { > + make_device_entry_read(&entry); > + pte = swp_entry_to_pte(entry); > + if (pte_swp_soft_dirty(*src_pte)) > + pte = pte_swp_mksoft_dirty(pte); > + set_pte_at(src_mm, addr, src_pte, pte); > + } > } > goto out_set_pte; > } > @@ -1191,6 +1206,12 @@ again: > > page = migration_entry_to_page(entry); > rss[mm_counter(page)]--; > + } else if (is_device_entry(entry)) { > + struct page *page = device_entry_to_page(entry); > + rss[mm_counter(page)]--; > + > + page_remove_rmap(page, false); > + put_page(page); > } > if (unlikely(!free_swap_and_cache(entry))) > print_bad_pte(vma, addr, ptent, NULL); > @@ -2536,6 +2557,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) > if (unlikely(non_swap_entry(entry))) { > if (is_migration_entry(entry)) { > migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); > + } else if (is_device_entry(entry)) { > + ret = device_entry_fault(vma, fe->address, entry, > + fe->flags, fe->pmd); What does device_entry_fault() actually do here? 
> } else if (is_hwpoison_entry(entry)) { > ret = VM_FAULT_HWPOISON; > } else { > diff --git a/mm/mprotect.c b/mm/mprotect.c > index 1bc1eb3..70aff3a 100644 > --- a/mm/mprotect.c > +++ b/mm/mprotect.c > @@ -139,6 +139,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, > > pages++; > } > + > + if (is_write_device_entry(entry)) { > + pte_t newpte; > + > + make_device_entry_read(&entry); > + newpte = swp_entry_to_pte(entry); > + if (pte_swp_soft_dirty(oldpte)) > + newpte = pte_swp_mksoft_dirty(newpte); > + set_pte_at(mm, addr, pte, newpte); > + > + pages++; > + } Does it make sense to call mprotect() on device memory ranges? > } > } while (pte++, addr += PAGE_SIZE, addr != end); > arch_leave_lazy_mmu_mode(); >