2012-02-16 14:48:48

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 11/18] Basic support (faulting) for huge pages for shmfs

This is basic support for shmfs, allowing huge pages to be bootstrapped
in the user address space.

This patch is only a first step and it still breaks the kernel, because
other page cache requirements are missing, but the huge mapping itself
is established.
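
For orientation, a rough caller-side sketch (not part of this patch) of how
the fault path is expected to use the new hook: try fault_huge first and
fall back to the regular 4K fault when the handler reports VM_FAULT_NOHUGE.
The function name below is made up for illustration.

/* Hypothetical caller-side sketch: try the huge fault path first and
 * fall back to the 4K path when the handler reports VM_FAULT_NOHUGE.
 */
static int do_fault_sketch(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int ret;

	if (vma->vm_ops->fault_huge) {
		ret = vma->vm_ops->fault_huge(vma, vmf);
		if (!(ret & VM_FAULT_NOHUGE))
			return ret;	/* huge mapping established (or error) */
	}
	/* No huge page available or not supported: use the 4K path. */
	return vma->vm_ops->fault(vma, vmf);
}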

Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/fs.h | 4 ++--
include/linux/mm.h | 4 ++--
mm/shmem.c | 30 ++++++++++++++----------------
3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7288166..7afc38b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -595,7 +595,7 @@ struct address_space_operations {

/** Same as \a set_page_dirty but for huge page */
int (*set_page_dirty_huge)(struct page *page);
-
+
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);

@@ -627,7 +627,7 @@ struct address_space_operations {
*/
int (*split_page) (struct file *file, struct address_space *mapping,
loff_t pos, struct page *hueg_page);
-
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 27a10c8..236a6be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -210,7 +210,7 @@ struct vm_operations_struct {
* If function fails, then caller may try again with fault.
*/
int (*fault_huge)(struct vm_area_struct *vma, struct vm_fault *vmf);
-
+
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -218,7 +218,7 @@ struct vm_operations_struct {
/** Same as \a page_mkwrite, but for huge page. */
int (*page_mkwrite_huge)(struct vm_area_struct *vma,
struct vm_fault *vmf);
-
+
/* called by access_process_vm when get_user_pages() fails, typically
* for use by special VMAs that can switch between memory and hardware
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index a834488..97e76b9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -67,6 +67,10 @@ static struct vfsmount *shm_mnt;
#include <asm/uaccess.h>
#include <asm/pgtable.h>

+#ifdef CONFIG_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
+
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

@@ -1119,24 +1123,12 @@ static int shmem_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
}
}

- /* XXX Page & compound lock ordering please... */
-
/* After standard fault page is getted. */
- if (PageCompound(vmf->page)) {
- compound_lock(vmf->page);
- if (!PageHead(vmf->page)) {
- compound_unlock(vmf->page);
- goto no_hugepage;
- }
- }else {
+ if (!compound_get(vmf->page))
goto no_hugepage;
- }
-
- if (!(ret & VM_FAULT_LOCKED))
- lock_page(vmf->page);
-
- ret |= VM_FAULT_LOCKED;
-
+
+ get_page_tails_for_fmap(vmf->page);
+
if (ret & VM_FAULT_MAJOR) {
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
@@ -2381,6 +2373,9 @@ static const struct address_space_operations shmem_aops = {
#endif
.migratepage = migrate_page,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_HUGEPAGECACHE
+ .defragpage = defrag_generic_shm,
+#endif
};

static const struct file_operations shmem_file_operations = {
@@ -2458,6 +2453,9 @@ static const struct super_operations shmem_ops = {

static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
+#ifdef CONFIG_SHMEM_HUGEPAGECACHE
+ .fault_huge = shmem_fault_huge,
+#endif
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
--
1.7.3.4


2012-02-16 14:48:50

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 13/18] Zapping and freeing huge mappings

Changes to the VM subsystem allowing huge pages to be zapped and freed,
plus additional functions for removing mappings.
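
As a distilled illustration of the locking pattern used in zap_huge_pmd()
below (compound_get()/compound_put() come from earlier patches in this
series, and the helper name here is made up): pin the compound page with
the page_table_lock dropped, then re-validate the pmd before touching it.

/* Sketch only: pin the huge page behind a pmd without holding the
 * page_table_lock across compound_get(), then double-check the pmd.
 */
static struct page *pin_huge_pmd_page(struct mm_struct *mm, pmd_t *pmd)
{
	pmd_t pmdval;
	struct page *page;

again:
	spin_lock(&mm->page_table_lock);
	pmdval = *pmd;
	if (!pmd_trans_huge(pmdval)) {
		spin_unlock(&mm->page_table_lock);
		return NULL;
	}
	page = pmd_page(pmdval);
	spin_unlock(&mm->page_table_lock);

	if (!compound_get(page))	/* raced with a split */
		return NULL;

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(pmdval, *pmd))) {
		/* the pmd changed while the lock was dropped: undo, retry */
		spin_unlock(&mm->page_table_lock);
		compound_put(page);
		goto again;
	}
	/* success: page_table_lock held and compound page pinned;
	 * the NULL returns above leave the lock dropped */
	return page;
}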

Signed-off-by: Radosław Smogura <[email protected]>
---
include/asm-generic/tlb.h | 21 ++++++
include/linux/huge_mm.h | 13 ++++-
mm/huge_memory.c | 153 ++++++++++++++++++++++++++++++++++++++++++---
mm/memory.c | 39 +++++++-----
4 files changed, 202 insertions(+), 24 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f96a5b5..f7fc543 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -126,6 +126,27 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
tlb_flush_mmu(tlb);
}

+/** The compound page must be held (frozen) by the caller. */
+static inline void tlb_remove_page_huge(struct mmu_gather *tlb,
+ struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ tlb_remove_page(tlb, head);
+ tlb_remove_page(tlb, head + 1);
+ if (likely(compound_order(head) > 1)) {
+ for (page = head+2; page->__first_page == head; page++) {
+ tlb_remove_page(tlb, page);
+ /* This should not happen; it would mean we mapped a
+ * dangling page.
+ */
+ BUG_ON(!PageAnon(page) && !page->mapping);
+ }
+ }
+}
/**
* tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
*
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c2407e4..c72a849 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -88,12 +88,21 @@ extern int handle_pte_fault(struct mm_struct *mm,
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page(struct page *page);
extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
+extern void __split_huge_page_pmd_vma(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd);
+
#define split_huge_page_pmd(__mm, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
- if (unlikely(pmd_trans_huge(*____pmd))) \
+ if (unlikely(pmd_trans_huge(*____pmd))) \
__split_huge_page_pmd(__mm, ____pmd); \
} while (0)
+#define split_huge_page_pmd_vma(__vma, __addr, __pmd) \
+ do { \
+ pmd_t *____pmd = (__pmd); \
+ if (unlikely(pmd_trans_huge(*____pmd))) \
+ __split_huge_page_pmd_vma(__vma, __addr, ____pmd);\
+ } while (0)
#define wait_split_huge_page(__anon_vma, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
@@ -160,6 +169,8 @@ static inline int split_huge_page(struct page *page)
}
#define split_huge_page_pmd(__mm, __pmd) \
do { } while (0)
+#define split_huge_page_pmd_vma(__vma, __addr, __pmd) do { } while (0)
+
#define wait_split_huge_page(__anon_vma, __pmd) \
do { } while (0)
#define compound_trans_head(page) compound_head(page)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74d2e84..95c9ce7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -807,6 +807,9 @@ pgtable_t get_pmd_huge_pte(struct mm_struct *mm)

/* FIFO */
pgtable = mm->pmd_huge_pte;
+ if (!pgtable)
+ return NULL;
+
if (list_empty(&pgtable->lru))
mm->pmd_huge_pte = NULL;
else {
@@ -1029,27 +1032,56 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
int ret = 0;
+ pmd_t pmd_val;

+ /* We are going to pin the page, but a concurrent split may take the
+ * page_table_lock while we wait in compound_get(), and the split may
+ * in turn wait for the page_table_lock we hold. So... double-checked
+ * locking.
+ */
+again:
spin_lock(&tlb->mm->page_table_lock);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
+ pmd_val = *pmd;
+ if (likely(pmd_trans_huge(pmd_val))) {
+ if (unlikely(pmd_trans_splitting(pmd_val))) {
spin_unlock(&tlb->mm->page_table_lock);
wait_split_huge_page(vma->anon_vma,
pmd);
} else {
struct page *page;
pgtable_t pgtable;
+
pgtable = get_pmd_huge_pte(tlb->mm);
page = pmd_page(*pmd);
+ spin_unlock(&tlb->mm->page_table_lock);
+ if (!compound_get(page))
+ return 0;
+ spin_lock(&tlb->mm->page_table_lock);
+ smp_rmb();
+ if (unlikely(!pmd_same(pmd_val, *pmd))) {
+ spin_unlock(&tlb->mm->page_table_lock);
+ compound_put(page);
+ goto again;
+ }
pmd_clear(pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
+ if (PageAnon(page))
+ page_remove_rmap(page);
+ else
+ page_remove_rmap_huge(page);
+
VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, PageAnon(page) ?
+ MM_ANONPAGES : MM_FILEPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
- pte_free(tlb->mm, pgtable);
+ if (PageAnon(page))
+ tlb_remove_page(tlb, page);
+ else
+ tlb_remove_page_huge(tlb, page);
+ if (pgtable)
+ pte_free(tlb->mm, pgtable);
+ compound_put(page);
}
} else
spin_unlock(&tlb->mm->page_table_lock);
@@ -2368,16 +2400,121 @@ static int khugepaged(void *none)
return 0;
}

-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+/** Splits a huge pmd in place into a normal pmd pointing to a page
+ * table filled with ptes equivalent to the old huge mapping.
+ *
+ * On success the new page table is populated and the TLB is flushed.
+ * Works only for file-backed pmds.
+ *
+ * The allocation logic follows __pte_alloc().
+ */
+int __inplace_split_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
+{
+ unsigned long addr, end_addr;
+ pmd_t pmdv, pmd_fake;
+ pte_t pte, pte_pmd;
+ pte_t *ptep;
+ pgtable_t new;
+ struct page *page;
+
+ address &= HPAGE_PMD_MASK;
+
+ /* TODO Good place to change locking technique for pmds. */
+repeat:
+ addr = address & HPAGE_PMD_MASK;
+
+ smp_mb();
+ if (pmd_none(*pmd) || !pmd_trans_huge(*pmd))
+ return 0;
+
+ new = pte_alloc_one(mm, addr);
+
+ if (!new)
+ return -ENOMEM;
+ pmdv = *pmd;
+
+ pmd_fake = pmdv;
+ pte_pmd = pte_clrhuge(*((pte_t *) &pmd_fake));
+ pmd_fake = *((pmd_t *) &pte_pmd);
+
+ pmd_populate(mm, &pmd_fake, new);
+
+ page = pmd_page(pmdv);
+ end_addr = pmd_addr_end(addr, 0L);
+ for (; addr < end_addr; addr += PAGE_SIZE, page++) {
+ if (!pmd_present(pmdv))
+ continue;
+ /* Copy protection from pmd. */
+ pte = mk_pte(page, vma->vm_page_prot);
+
+ if (pmd_dirty(pmdv))
+ pte = pte_mkdirty(pte);
+ if (pmd_write(pmdv))
+ pte = pte_mkwrite(pte);
+ if (pmd_exec(pmdv))
+ pte = pte_mkexec(pte);
+ if (pmd_young(pmdv))
+ pte = pte_mkyoung(pte);
+
+ ptep = pte_offset_map(&pmd_fake, addr);
+ set_pte_at(mm, addr, ptep, pte);
+ pte_unmap(ptep);
+ }
+
+ /* Ensure everything is visible before populating pmd. */
+ smp_mb();
+
+ spin_lock(&mm->page_table_lock);
+ if (pmd_same(pmdv, *pmd)) {
+ set_pmd(pmd, pmd_fake);
+ mm->nr_ptes++;
+ new = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* The new page table is in place; flush stale TLB entries for the range. */
+ flush_tlb_range(vma, address, address + HPAGE_SIZE);
+
+ if (new) {
+ pte_free(mm, new);
+ goto repeat;
+ }
+
+ return 0;
+}
+
+/** Splits huge page for vma. */
+void __split_huge_page_pmd_vma(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
{
struct page *page;
+ int anonPage;
+ /* XXX Inefficient locking for pmd. */
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (!pmd_trans_huge(*pmd)) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return;
+ }
+ page = pmd_page(*pmd);
+ anonPage = PageAnon(page);
+ spin_unlock(&vma->vm_mm->page_table_lock);

+ if (anonPage)
+ __split_huge_page_pmd(vma->vm_mm, pmd);
+ else
+ __inplace_split_pmd(vma->vm_mm, vma, address, pmd);
+}
+void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct page *page = pmd_page(*pmd);
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(&mm->page_table_lock);
return;
}
- page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
spin_unlock(&mm->page_table_lock);
diff --git a/mm/memory.c b/mm/memory.c
index 7427c9b..539d1f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,22 +572,28 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unlink_file_vma(vma);

if (is_vm_hugetlb_page(vma)) {
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next? next->vm_start: ceiling);
- } else {
- /*
- * Optimization: gather nearby vmas into one call down
- */
- while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
- vma = next;
- next = vma->vm_next;
- unlink_anon_vmas(vma);
- unlink_file_vma(vma);
+ if (vma->vm_file) {
+ if (vma->vm_file->f_mapping->a_ops->defragpage)
+ goto free_normal;
}
- free_pgd_range(tlb, addr, vma->vm_end,
+ hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
+ }
+
+free_normal:
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+ && !is_vm_hugetlb_page(next)) {
+ vma = next;
+ next = vma->vm_next;
+ unlink_anon_vmas(vma);
+ unlink_file_vma(vma);
}
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+
vma = next;
}
}
@@ -1248,8 +1254,11 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next-addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
- split_huge_page_pmd(vma->vm_mm, pmd);
+ /* And now we go again in conflict with, THP...
+ * THP requires semaphore, we require compound
+ * frozen, why...?
+ */
+ split_huge_page_pmd_vma(vma, addr, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
/* fall through */
--
1.7.3.4

2012-02-16 14:48:51

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 14/18] Fixes for proc memory

Fixed smaps so it no longer splits the page, and made it print information
about shared/private huge dirty/clean pages. The change relies only on the
dirty flag from the pmd - this may not be enough, but additionally checking
PageDirty, as is done for ptes, would be too much, because the head of a
huge page may also be mapped by a single pte, not only by a huge pmd.

In pagemap, the splitting was removed and a huge pmd is now reported as one
entry carrying the huge page shift.
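
For reference, a minimal userspace sketch of how the new pagemap entries can
be read back; it assumes the pagemap format of this kernel generation
(bits 0-54 PFN, bits 55-60 page shift, bit 63 present), so with this patch a
huge mapping should report a page shift of HPAGE_SHIFT (21 on x86-64)
instead of 12. The program is only an illustration.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned long vaddr;
	uint64_t entry;
	int fd;

	if (argc < 2)
		return 1;
	vaddr = strtoul(argv[1], NULL, 0);

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;
	/* one 64-bit entry per 4K virtual page */
	if (pread(fd, &entry, sizeof(entry),
		  (vaddr / 4096) * sizeof(entry)) != sizeof(entry))
		return 1;
	printf("present=%d pfn=0x%llx page_shift=%u\n",
	       (int)(entry >> 63),
	       (unsigned long long)(entry & ((1ULL << 55) - 1)),
	       (unsigned)((entry >> 55) & 0x3f));
	close(fd);
	return 0;
}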

Signed-off-by: Radosław Smogura <[email protected]>
---
fs/proc/task_mmu.c | 97 ++++++++++++++++++++++++++++++++++++----------------
1 files changed, 67 insertions(+), 30 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7dcd2a2..111e64c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -333,8 +333,12 @@ struct mem_size_stats {
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
+ unsigned long shared_huge_clean;
+ unsigned long shared_huge_dirty;
unsigned long private_clean;
unsigned long private_dirty;
+ unsigned long private_huge_clean;
+ unsigned long private_huge_dirty;
unsigned long referenced;
unsigned long anonymous;
unsigned long anonymous_thp;
@@ -342,9 +346,8 @@ struct mem_size_stats {
u64 pss;
};

-
static void smaps_pte_entry(pte_t ptent, unsigned long addr,
- unsigned long ptent_size, struct mm_walk *walk)
+ unsigned long ptent_size, struct mm_walk *walk, int huge_file)
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
@@ -368,20 +371,33 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,

mss->resident += ptent_size;
/* Accumulate the size in pages that have been accessed. */
- if (pte_young(ptent) || PageReferenced(page))
+ if (pte_young(ptent) || (!huge_file && PageReferenced(page)))
mss->referenced += ptent_size;
mapcount = page_mapcount(page);
+ /* For a huge file mapping account only via the pte/pmd dirty bit: the
+ * page may be made dirty while the pmd is not (a huge page may be
+ * mapped through ptes, not only through a huge pmd). */
if (mapcount >= 2) {
- if (pte_dirty(ptent) || PageDirty(page))
+ if (pte_dirty(ptent) || (!huge_file && PageDirty(page))) {
mss->shared_dirty += ptent_size;
- else
+ if (huge_file)
+ mss->shared_huge_dirty += ptent_size;
+ } else {
mss->shared_clean += ptent_size;
+ if (huge_file)
+ mss->shared_huge_clean += ptent_size;
+ }
mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
} else {
- if (pte_dirty(ptent) || PageDirty(page))
+ if (pte_dirty(ptent) || (!huge_file && PageDirty(page))) {
mss->private_dirty += ptent_size;
- else
+ if (huge_file)
+ mss->private_huge_dirty += ptent_size;
+ } else {
mss->private_clean += ptent_size;
+ if (huge_file)
+ mss->private_huge_clean += ptent_size;
+ }
mss->pss += (ptent_size << PSS_SHIFT);
}
}
@@ -401,9 +417,10 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
wait_split_huge_page(vma->anon_vma, pmd);
} else {
smaps_pte_entry(*(pte_t *)pmd, addr,
- HPAGE_PMD_SIZE, walk);
+ HPAGE_PMD_SIZE, walk,
+ vma->vm_ops != NULL);
spin_unlock(&walk->mm->page_table_lock);
- mss->anonymous_thp += HPAGE_PMD_SIZE;
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
} else {
@@ -416,7 +433,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
- smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
+ smaps_pte_entry(*pte, addr, PAGE_SIZE, walk, 0);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
@@ -443,20 +460,24 @@ static int show_smap(struct seq_file *m, void *v)
show_map_vma(m, vma);

seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n"
- "Anonymous: %8lu kB\n"
- "AnonHugePages: %8lu kB\n"
- "Swap: %8lu kB\n"
- "KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n"
- "Locked: %8lu kB\n",
+ "Size: %8lu kB\n"
+ "Rss: %8lu kB\n"
+ "Pss: %8lu kB\n"
+ "Shared_Clean: %8lu kB\n"
+ "Shared_Dirty: %8lu kB\n"
+ "Private_Clean: %8lu kB\n"
+ "Private_Dirty: %8lu kB\n"
+ "Shared_Huge_Clean: %8lu kB\n"
+ "Shared_Huge_Dirty: %8lu kB\n"
+ "Private_Huge_Clean: %8lu kB\n"
+ "Private_Huge_Dirty: %8lu kB\n"
+ "Referenced: %8lu kB\n"
+ "Anonymous: %8lu kB\n"
+ "AnonHugePages: %8lu kB\n"
+ "Swap: %8lu kB\n"
+ "KernelPageSize: %8lu kB\n"
+ "MMUPageSize: %8lu kB\n"
+ "Locked: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
mss.resident >> 10,
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -464,6 +485,10 @@ static int show_smap(struct seq_file *m, void *v)
mss.shared_dirty >> 10,
mss.private_clean >> 10,
mss.private_dirty >> 10,
+ mss.shared_huge_clean >> 10,
+ mss.shared_huge_dirty >> 10,
+ mss.private_huge_clean >> 10,
+ mss.private_huge_dirty >> 10,
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
@@ -661,6 +686,15 @@ static u64 pte_to_pagemap_entry(pte_t pte)
return pme;
}

+static u64 pmd_to_pagemap_entry(pmd_t pmd)
+{
+ u64 pme = 0;
+ if (pmd_present(pmd))
+ pme = PM_PFRAME(pmd_pfn(pmd))
+ | PM_PSHIFT(HPAGE_SHIFT) | PM_PRESENT;
+ return pme | PM_PSHIFT(HPAGE_SHIFT);
+}
+
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -669,8 +703,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;

- split_huge_page_pmd(walk->mm, pmd);
-
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
for (; addr != end; addr += PAGE_SIZE) {
@@ -685,10 +717,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
* and that it isn't a huge page vma */
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
- pte = pte_offset_map(pmd, addr);
- pfn = pte_to_pagemap_entry(*pte);
- /* unmap before userspace copy */
- pte_unmap(pte);
+ pmd_t pmd_val = *pmd;
+ if (pmd_trans_huge(pmd_val)) {
+ pfn = pmd_to_pagemap_entry(pmd_val);
+ } else {
+ pte = pte_offset_map(pmd, addr);
+ pfn = pte_to_pagemap_entry(*pte);
+ /* unmap before userspace copy */
+ pte_unmap(pte);
+ }
}
err = add_to_pagemap(addr, pfn, pm);
if (err)
--
1.7.3.4

2012-02-16 14:47:56

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 17/18] [Experimental] Support for huge pages in EXT 4

This is rather experimental, meant to uncover the remaining gaps in the
huge page cache support added for shm, not to provide real huge page
support for the EXT4 file system. It will show whether some of the
concepts were good or bad.

In any case, the target is that some segments of glibc may be mapped as
huge pages, but only if they are aligned to huge page boundaries.
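
To make the alignment requirement concrete, here is an illustrative helper
(not part of this patch; the name is made up) expressing when a file-backed
address can be covered by a huge pmd: the virtual address and the file
offset must describe the same naturally aligned huge-page-sized slice.

/* Sketch: can the huge page containing 'addr' be mapped with a huge pmd? */
static inline int huge_mapping_aligned(struct vm_area_struct *vma,
				       unsigned long addr)
{
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	pgoff_t pgoff;

	/* the whole huge page must lie inside the VMA */
	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return 0;

	/* file offset of haddr, in small pages, must be huge-page aligned */
	pgoff = vma->vm_pgoff + ((haddr - vma->vm_start) >> PAGE_SHIFT);
	return (pgoff & (HPAGE_PMD_NR - 1)) == 0;
}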

Signed-off-by: Radosław Smogura <[email protected]>
---
fs/ext4/Kconfig | 9 ++++
fs/ext4/file.c | 3 +
fs/ext4/inode.c | 15 +++++++
include/linux/defrag-pagecache.h | 4 ++
include/linux/mm.h | 4 ++
mm/defrag-pagecache.c | 19 +++++++++
mm/filemap.c | 82 ++++++++++++++++++++++++++++++++++++++
7 files changed, 136 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9ed1bb1..1a33bb0 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -83,3 +83,12 @@ config EXT4_DEBUG

If you select Y here, then you will be able to turn on debugging
with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+
+config EXT4_HUGEPAGECACHE
+ bool "EXT4 Huge Page Cache Support [Dangerous]"
+ depends on EXT4_FS
+ depends on HUGEPAGECACHE
+ help
+ This is experimental and exists to uncover remaining gaps in the huge
+ page cache support added for shm; it does not provide real huge page
+ support for the EXT4 file system. Enable it only for testing.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cb70f18..57698df 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -143,6 +143,9 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,

static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .fault_huge = filemap_fault_huge,
+#endif
.page_mkwrite = ext4_page_mkwrite,
};

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82f..8bbda5a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -45,6 +45,9 @@

#include <trace/events/ext4.h>

+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
#define MPAGE_DA_EXTENT_TAIL 0x01

static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -3036,6 +3039,9 @@ static const struct address_space_operations ext4_ordered_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};

static const struct address_space_operations ext4_writeback_aops = {
@@ -3051,6 +3057,9 @@ static const struct address_space_operations ext4_writeback_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};

static const struct address_space_operations ext4_journalled_aops = {
@@ -3066,6 +3075,9 @@ static const struct address_space_operations ext4_journalled_aops = {
.direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};

static const struct address_space_operations ext4_da_aops = {
@@ -3082,6 +3094,9 @@ static const struct address_space_operations ext4_da_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};

void ext4_set_aops(struct inode *inode)
diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
index 4ca3468..fb305c8 100644
--- a/include/linux/defrag-pagecache.h
+++ b/include/linux/defrag-pagecache.h
@@ -42,5 +42,9 @@ extern int defrag_generic_shm(struct file *file, struct address_space *mapping,
loff_t pos,
struct page **pagep,
struct defrag_pagecache_ctl *ctl);
+extern int defrag_generic_file(struct file *file, struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ struct defrag_pagecache_ctl *ctl);
#endif /* DEFRAG_PAGECACHE_H */

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4c67555..24c2c6c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1620,6 +1620,10 @@ extern void truncate_inode_pages_range(struct address_space *,
/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);

+#ifdef CONFIG_HUGEPAGECACHE
+extern int filemap_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf);
+#endif
+
/* mm/page-writeback.c */
int write_one_page(struct page *page, int wait);
void task_dirty_inc(struct task_struct *tsk);
diff --git a/mm/defrag-pagecache.c b/mm/defrag-pagecache.c
index 5a14fe8..6a87814 100644
--- a/mm/defrag-pagecache.c
+++ b/mm/defrag-pagecache.c
@@ -104,6 +104,16 @@ struct page *shmem_defrag_get_page(const struct defrag_pagecache_ctl *ctl,
mapping_gfp_mask(inode->i_mapping));
}

+/** Callback for getting a page for a regular file mapping.
+ * Uses the generic {@link read_mapping_page} function to read the
+ * page through the page cache.
+ */
+struct page *file_defrag_get_page(const struct defrag_pagecache_ctl *ctl,
+ struct inode *inode, pgoff_t pageIndex)
+{
+ return read_mapping_page(inode->i_mapping, pageIndex, NULL);
+}
+
static void defrag_generic_mig_result(struct page *oldPage,
struct page *newPage, struct migration_ctl *ctl, int result)
{
@@ -258,6 +268,15 @@ int defrag_generic_shm(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(defrag_generic_shm);

+int defrag_generic_file(struct file *file, struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ struct defrag_pagecache_ctl *ctl)
+{
+ return defrageOneHugePage(file, pos, pagep, ctl, file_defrag_get_page);
+}
+EXPORT_SYMBOL(defrag_generic_file);
+
int defrag_generic_pagecache(struct file *file,
struct address_space *mapping,
loff_t pos,
diff --git a/mm/filemap.c b/mm/filemap.c
index 8363cd9..f050209 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -43,6 +43,9 @@

#include <asm/mman.h>

+#ifdef CONFIG_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -1771,6 +1774,85 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);

+#ifdef CONFIG_HUGEPAGECACHE
+/** DO NOT USE: this method is still experimental. */
+int filemap_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ int error;
+ int ret = VM_FAULT_LOCKED;
+
+ error = vma->vm_ops->fault(vma, vmf);
+ /* XXX Repeatable flags in __do fault etc. */
+ if (error & (VM_FAULT_ERROR | VM_FAULT_NOPAGE
+ | VM_FAULT_RETRY | VM_FAULT_NOHUGE)) {
+ return error;
+ }
+
+ /* Development-only code to force defragmentation, since there is no
+ * external interface (or daemon) to trigger defragmentation yet.
+ */
+ if ((vma->vm_flags & VM_HUGEPAGE) && !PageCompound(vmf->page)) {
+ /* Force defrag - mainly devo code */
+ int defragResult;
+ const loff_t hugeChunkSize = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ const loff_t vmaSizeToMap = (vma->vm_start
+ + ((vmf->pgoff + vma->vm_pgoff + hugeChunkSize)
+ << PAGE_SHIFT) <= vma->vm_end) ?
+ hugeChunkSize : 0;
+
+ const loff_t inodeSizeToMap =
+ (vmf->pgoff + vma->vm_pgoff + hugeChunkSize <
+ inode->i_size) ? hugeChunkSize : 0;
+
+ const struct defrag_pagecache_ctl defragControl = {
+ .fillPages = 1,
+ .requireFillPages = 1,
+ .force = 1
+ };
+
+ if (ret & VM_FAULT_LOCKED) {
+ unlock_page(vmf->page);
+ }
+ put_page(vmf->page);
+
+ defragResult = defragPageCache(vma->vm_file,
+ vmf->pgoff,
+ min(vmaSizeToMap, min(inodeSizeToMap, hugeChunkSize)),
+ &defragControl);
+ printk(KERN_INFO "Page defragmented with result %d\n",
+ defragResult);
+
+ /* Retake page. */
+ error = vma->vm_ops->fault(vma, vmf);
+ if (error & (VM_FAULT_ERROR | VM_FAULT_NOPAGE
+ | VM_FAULT_RETRY | VM_FAULT_NOHUGE)) {
+ return error;
+ }
+ }
+
+ /* The standard fault returned a page; try to pin it as a compound. */
+ if (!compound_get(vmf->page))
+ goto no_hugepage;
+
+ get_page_tails_for_fmap(vmf->page);
+
+ if (ret & VM_FAULT_MAJOR) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ }
+ return ret;
+no_hugepage:
+ if (ret & VM_FAULT_LOCKED)
+ unlock_page(vmf->page);
+ page_cache_release(vmf->page);
+ vmf->page = NULL;
+ return VM_FAULT_NOHUGE;
+}
+EXPORT_SYMBOL(filemap_fault_huge);
+#endif
+
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
};
--
1.7.3.4


2012-02-16 14:48:55

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 18/18] [WIP] Dummy patch for details

I am sending this dummy patch to describe a bit of the work; maybe someone
has additional ideas, concepts, or tips. In any case I am glad that I mapped
a huge EXT4 page and the data was synced to disk.

Some concepts about compounds:
- first_page moved into the lru union to free space for buffers
- refcounting changed - compound pages are "auto managed";
  page recovering exists for backward
  compatibility with 2.6 kernels, which actually allowed
  getting a tail page with count 0, but at first glance moving a few
  times around 0 could cause a dangling pointer bug

Compound view.
In contrast to huge pages and THP, file system
compound pages are treated quite loosely; the main difference is that there
is no implication huge page => huge pmd: a huge page may exist and have no
huge mappings at all.

Each page is managed almost stand-alone and has its own count, mapcount,
dirty bit, etc. It cannot be added to any LRU or list, because the
list_head is shared with the compound metadata. A condensed sketch of how a
fault pins such a compound page follows the list at the end of this message.

Read / write locking of the compound.

Splitting may be deferred ("dequeued"); this is to prevent deadlocks.
"Legacy" code will usually start with the normal page locked and then try
to lock the compound, which for splitting purposes may deadlock (this flag
is not yet checked in faulting or anywhere else, but it should be).

There is still no defragmentation daemon or anything similar; for now this
behaviour is forced by MAP_HUGETLB.

Things not done yet:
* kswapd & co. not tested.
* mlock not fixed; the fix will cover get_user_pages & follow_user_pages.
* fork, page_mkclean, mlock not fixed.
* dropping caches = bug.
* migration not checked.
* shmfs - writeback for reclaim should split; simple to do, but the ext4
  experiments (syncing) should go first.
* no huge COW mappings allowed.
* code still needs cleaning from all the printk's...
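
The condensed sketch mentioned above: how a huge fault is expected to pin a
file-backed compound page under these rules. compound_get(),
get_page_tails_for_fmap() and VM_FAULT_NOHUGE come from this series; the
wrapper name is made up, and this is only an outline of the pattern used by
shmem_fault_huge()/filemap_fault_huge().

static int fault_huge_sketch(struct vm_area_struct *vma,
			     struct vm_fault *vmf)
{
	/* regular 4K lookup first */
	int ret = vma->vm_ops->fault(vma, vmf);

	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
		return ret;

	/* not (or no longer) a usable compound head */
	if (!compound_get(vmf->page))
		goto no_hugepage;

	/* every tail reachable through the huge mapping gets its own
	 * reference, since tails are managed individually */
	get_page_tails_for_fmap(vmf->page);
	return ret;

no_hugepage:
	if (ret & VM_FAULT_LOCKED)
		unlock_page(vmf->page);
	page_cache_release(vmf->page);
	vmf->page = NULL;
	return VM_FAULT_NOHUGE;
}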

Signed-off-by: Radosław Smogura <[email protected]>
---
mm/filemap.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index f050209..7174fff 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1783,7 +1783,7 @@ int filemap_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
int ret = VM_FAULT_LOCKED;

error = vma->vm_ops->fault(vma, vmf);
- /* XXX Repeatable flags in __do fault etc. */
+ /* XXX Repeatable flags in __do fault etc. */
if (error & (VM_FAULT_ERROR | VM_FAULT_NOPAGE
| VM_FAULT_RETRY | VM_FAULT_NOHUGE)) {
return error;
--
1.7.3.4

2012-02-16 14:47:54

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 15/18] Splitting and truncating

This adds support for page splitting. Page splitting should be invoked only
in special situations, i.e. when a contiguous region of a compound page is
about to stop representing the same contiguous region of the mapping (e.g.
some tail pages are going to be removed from the page cache).

We reuse the VMA zapping code for the split. It is not particularly nice,
but it was the fastest path to get working and should be corrected later.

SHM support for this will be added later.
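
In outline, truncation is expected to use the new helpers roughly as below;
this condenses the truncate_inode_page() change at the end of this patch
(compound_freeze()/compound_unfreeze() and split_huge_page_file() come from
this series, and the function name here is made up).

static int truncate_one_page_sketch(struct address_space *mapping,
				    struct page *page)
{
	struct page *head = NULL;
	int ret;

	if (unlikely(PageCompound(page))) {
		head = compound_head(page);
		if (compound_freeze(head)) {
			/* frozen: try the in-place split; on success the
			 * page is no longer compound, nothing to unfreeze */
			if (!split_huge_page_file(head, page))
				head = NULL;
		} else {
			/* someone else holds the compound; the split stays
			 * deferred via PageSplitDeque */
			head = NULL;
		}
	}

	ret = truncate_complete_page(mapping, page);
	if (head)
		compound_unfreeze(head);
	return ret;
}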

Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/huge_mm.h | 21 ++++++
include/linux/mm.h | 20 +++++
mm/filemap.c | 14 ++++-
mm/huge_memory.c | 178 +++++++++++++++++++++++++++++++++++++---------
mm/memory.c | 54 ++++++++++-----
mm/truncate.c | 18 +++++-
6 files changed, 251 insertions(+), 54 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c72a849..8e6bfc7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -87,6 +87,23 @@ extern int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page(struct page *page);
+
+/** Splits a huge file page.
+ * @param head the head of the compound page
+ * @param page the page that is going to be invalidated
+ * @return 0 - split in place, 1 - split newly deferred, 2 - split was already deferred
+ */
+extern int split_huge_page_file(struct page *head, struct page *page);
+
+/** Tries to acquire all possible locks on a compound page, i.e. the
+ * normal page lock on the head and on every tail. The head must be
+ * frozen, and {@code page} must already hold the normal
+ * ({@code lock_page}) lock.
+ *
+ * @param page locked page contained in the compound page; may be head or tail
+ */
+extern int compound_try_lock_all(struct page *page);
+
extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
extern void __split_huge_page_pmd_vma(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd);
@@ -167,6 +184,10 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
+static inline int split_huge_page_file(struct page *head, struct page *page)
+{
+ return 0;
+}
#define split_huge_page_pmd(__mm, __pmd) \
do { } while (0)
#define split_huge_page_pmd_vma(__vma, __addr, __pmd) do { } while (0)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 236a6be..4c67555 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -279,6 +279,19 @@ struct inode;
extern int put_compound_head(struct page *head);
extern int put_compound_tail(struct page *page);

+/** Tries to acquire the compound lock.
+ * @return non-zero on success or when {@code CONFIG_TRANSPARENT_HUGEPAGE}
+ * is not enabled, {@code 0} otherwise
+ */
+static inline int compound_trylock(struct page *head)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ return (likely(!test_and_set_bit_lock(PG_compound_lock, &head->flags)));
+#else
+ return 1;
+#endif
+}
+
static inline void compound_lock(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1058,6 +1071,11 @@ struct zap_details {
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
+
+ /* Instead of unmapping the areas, just split them down to pte level.
+ * Used for splitting pages.
+ */
+ int just_split;
};

struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -1108,6 +1126,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
+void split_mapping_range(struct address_space *mapping, loff_t const holebegin,
+ loff_t const holelen);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/filemap.c b/mm/filemap.c
index b662757..8363cd9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,7 +124,19 @@ void __delete_from_page_cache(struct page *page)
cleancache_put_page(page);
else
cleancache_flush_page(mapping, page);
-
+#ifdef CONFIG_DEBUG_VM
+ /** This is a really strong assertion, but it is useful for finding
+ * problems when a page is truncated. We actually allow parts of a
+ * huge page to remain valid in the page cache, but then the page has
+ * to be marked for split, and marking requires the compound to be
+ * frozen. The VM_BUG_ON below will not just trip, it will also show
+ * a nice stack trace of what went wrong.
+ */
+ if (PageCompound(page)) {
+ struct page *head = compound_head(page);
+ VM_BUG_ON(PageCompound(page) && !PageSplitDeque(head));
+ }
+#endif
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95c9ce7..87fb0b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1256,11 +1256,17 @@ static int __split_huge_page_splitting(struct page *page,
return ret;
}

-static void __split_huge_page_refcount(struct page *page)
+static void __split_huge_page_refcount(struct page *page,
+ struct page *keep_locked)
{
int i;
int tail_counter;
struct zone *zone = page_zone(page);
+ int anon_mode = PageAnon(page);
+ const int pages = (1 << compound_order(page));
+
+ VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(compound_order(page) < 2);

/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -1270,7 +1276,7 @@ static void __split_huge_page_refcount(struct page *page)

tail_counter = compound_elements(page);

- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ for (i = pages - 1; i >= 1; i--) {
struct page *page_tail = page + i;

/* tail_page->_mapcount cannot change */
@@ -1278,8 +1284,10 @@ static void __split_huge_page_refcount(struct page *page)

/*
* tail_page->_count represents actuall number of tail pages
+ * file-backed pages have their own map count.
*/
- atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+ if (anon_mode)
+ atomic_add(page_mapcount(page) + 1, &page_tail->_count);

/* after clearing PageTail the gup refcount can be released */
smp_mb();
@@ -1290,17 +1298,23 @@ static void __split_huge_page_refcount(struct page *page)
* by the memory-failure.
* retain lock, and compound lock
*/
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP
- | __PG_HWPOISON
- | PG_locked
- | PG_compound_lock;
-
- page_tail->flags |= (page->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate)));
- page_tail->flags |= (1L << PG_dirty);
+ if (anon_mode) {
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP
+ | __PG_HWPOISON
+ | PG_locked
+ | PG_compound_lock;
+
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate)));
+ page_tail->flags |= (1L << PG_dirty);
+ } else {
+ /* Retain all flags except PG_tail and PG_head. */
+ int clearFlags = ~((1L << PG_tail) | (1L << PG_head));
+ page_tail->flags = (page_tail->flags & clearFlags);
+ }

/* clear PageTail before overwriting first_page */
smp_wmb();
@@ -1319,26 +1333,31 @@ static void __split_huge_page_refcount(struct page *page)
* status is achieved setting a reserved bit in the
* pmd, not by clearing the present bit.
*/
- page_tail->_mapcount = page->_mapcount;
+ if (anon_mode) {
+ page_tail->_mapcount = page->_mapcount;

- BUG_ON(page_tail->mapping);
- page_tail->mapping = page->mapping;
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;

- page_tail->index = page->index + i;
-
- BUG_ON(!PageAnon(page_tail));
- BUG_ON(!PageUptodate(page_tail));
- BUG_ON(!PageDirty(page_tail));
- BUG_ON(!PageSwapBacked(page_tail));
+ page_tail->index = page->index + i;

+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+ }
+ page_tail->__first_page = NULL;
lru_add_page_tail(zone, page, page_tail);
}
BUG_ON(atomic_read(&page->_count) <= 0);

- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+ if (anon_mode) {
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+ }

ClearPageCompound(page);
+ TestClearPageSplitDeque(page);
compound_unlock(page);
/* Remove additional reference used in compound. */
if (tail_counter)
@@ -1348,17 +1367,25 @@ static void __split_huge_page_refcount(struct page *page)

for (i = 1; i < HPAGE_PMD_NR; i++) {
struct page *page_tail = page + i;
- BUG_ON(page_count(page_tail) <= 0);
- /*
- * Tail pages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- put_page(page_tail);
+ if (anon_mode) {
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ } else {
+ if (page_tail != keep_locked)
+ unlock_page(page_tail);
+ }
}

+ if (!anon_mode && page != keep_locked)
+ unlock_page(page);
+
/*
* Only the head page (now become a regular page) is required
* to be pinned by the caller.
@@ -1473,7 +1500,7 @@ static void __split_huge_page(struct page *page,
mapcount, page_mapcount(page));
BUG_ON(mapcount != page_mapcount(page));

- __split_huge_page_refcount(page);
+ __split_huge_page_refcount(page, NULL);

mapcount2 = 0;
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
@@ -1490,6 +1517,87 @@ static void __split_huge_page(struct page *page,
BUG_ON(mapcount != mapcount2);
}

+int compound_try_lock_all(struct page *page)
+{
+ struct page *head;
+ struct page *p;
+ int processed;
+ int toProcess;
+
+ VM_BUG_ON(!PageLocked(page));
+
+ /* Requirement: the compound must be held so no split can run. */
+ head = compound_head(page);
+ VM_BUG_ON(compound_order(head) < 2);
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) != 0);
+
+ toProcess = 1 << compound_order(head);
+
+ /* The first two pages are locked explicitly; the rest are walked via
+ * __first_page to speed things up. */
+ if (head != page) {
+ if (!trylock_page(head))
+ return 0;
+ }
+
+ if ((head + 1) != page) {
+ if (!trylock_page(head + 1)) {
+ unlock_page(head);
+ return 0;
+ }
+ }
+
+ processed = 2;
+ /* Lock ordering page lock, then compound lock */
+ for (p = head + 2; p->__first_page == head; p++, processed++) {
+ if (p != page) {
+ if (!trylock_page(p))
+ break;
+ }
+ }
+ if (processed == toProcess)
+ return 1;
+
+ /** Rollback - reverse order */
+ do {
+ p--;
+ if (p != page)
+ unlock_page(p);
+ if (p == head)
+ return 0;
+ } while (1);
+}
+/** Splits a huge file page.
+ * @param head the head of the compound page
+ * @param page the page that is going to be invalidated
+ * @return 0 - split in place, 1 - split newly deferred, 2 - split was already deferred
+ */
+int split_huge_page_file(struct page *head, struct page *page)
+{
+ VM_BUG_ON(compound_order(head) < 2);
+ VM_BUG_ON(atomic_read(&compound_head(head)[2]._compound_usage));
+ VM_BUG_ON(PageAnon(head));
+
+ if (PageSplitDeque(head))
+ return 2;
+
+ /* Split all vma's. */
+ split_mapping_range(page_mapping(head),
+ (loff_t)page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE * (1 << compound_order(head)));
+
+ if (compound_try_lock_all(page)) {
+ /* Do in place split. */
+ __split_huge_page_refcount(head, page);
+ return 0;
+ } else {
+ /* We can't lock all tail pages; mark the head so the split is deferred. */
+ if (TestSetPageSplitDeque(head))
+ return 2;
+ else
+ return 1;
+ }
+}
int split_huge_page(struct page *page)
{
struct anon_vma *anon_vma;
diff --git a/mm/memory.c b/mm/memory.c
index 539d1f4..2b43661 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1253,12 +1253,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
- if (next-addr != HPAGE_PMD_SIZE) {
+ if (unlikely(details && details->just_split) ||
+ next - addr != HPAGE_PMD_SIZE) {
/* And now we go again in conflict with, THP...
* THP requires semaphore, we require compound
* frozen, why...?
*/
split_huge_page_pmd_vma(vma, addr, pmd);
+ if (unlikely(details && details->just_split))
+ continue;
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
/* fall through */
@@ -2826,22 +2829,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
}
}

-/**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
- * @mapping: the address space containing mmaps to be unmapped.
- * @holebegin: byte in first page to unmap, relative to the start of
- * the underlying file. This will be rounded down to a PAGE_SIZE
- * boundary. Note that this is different from truncate_pagecache(), which
- * must keep the partial page. In contrast, we must get rid of
- * partial pages.
- * @holelen: size of prospective hole in bytes. This will be rounded
- * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
- * end of the file.
- * @even_cows: 1 when truncating a file, unmap even private COWed pages;
- * but 0 when invalidating pagecache, don't throw away private data.
- */
-void unmap_mapping_range(struct address_space *mapping,
- loff_t const holebegin, loff_t const holelen, int even_cows)
+static void _unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows,
+ int just_split)
{
struct zap_details details;
pgoff_t hba = holebegin >> PAGE_SHIFT;
@@ -2859,6 +2849,8 @@ void unmap_mapping_range(struct address_space *mapping,
details.nonlinear_vma = NULL;
details.first_index = hba;
details.last_index = hba + hlen - 1;
+ details.just_split = just_split;
+
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;

@@ -2870,8 +2862,36 @@ void unmap_mapping_range(struct address_space *mapping,
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
mutex_unlock(&mapping->i_mmap_mutex);
}
+/**
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * @mapping: the address space containing mmaps to be unmapped.
+ * @holebegin: byte in first page to unmap, relative to the start of
+ * the underlying file. This will be rounded down to a PAGE_SIZE
+ * boundary. Note that this is different from truncate_pagecache(), which
+ * must keep the partial page. In contrast, we must get rid of
+ * partial pages.
+ * @holelen: size of prospective hole in bytes. This will be rounded
+ * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
+ * end of the file.
+ * @even_cows: 1 when truncating a file, unmap even private COWed pages;
+ * but 0 when invalidating pagecache, don't throw away private data.
+ */
+void unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows)
+{
+ _unmap_mapping_range(mapping, holebegin, holelen, even_cows, false);
+}
EXPORT_SYMBOL(unmap_mapping_range);

+void split_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ _unmap_mapping_range(mapping, holebegin, holelen, false, true);
+#endif
+}
+EXPORT_SYMBOL(split_mapping_range);
+
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
diff --git a/mm/truncate.c b/mm/truncate.c
index 632b15e..6112a76 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -140,12 +140,28 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
+ struct page *head = NULL;
+ int result;
+
+ if (unlikely(PageCompound(page))) {
+ head = compound_head(page);
+ if (compound_freeze(head)) {
+ if (!split_huge_page_file(head, page))
+ head = NULL;
+ } else {
+ head = NULL;
+ }
+ }
+
if (page_mapped(page)) {
unmap_mapping_range(mapping,
(loff_t)page->index << PAGE_CACHE_SHIFT,
PAGE_CACHE_SIZE, 0);
}
- return truncate_complete_page(mapping, page);
+ result = truncate_complete_page(mapping, page);
+ if (head)
+ compound_unfreeze(head);
+ return result;
}

/*
--
1.7.3.4


2012-02-16 14:47:51

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 12/18] Additional macros for pmd operations

Macros for operating on a pmd in a similar way as for a pte.
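
As a usage illustration (a sketch, not part of this patch - the helper name
is made up), the new accessors let the protection bits of a huge pmd be
carried over to individual ptes, which is how __inplace_split_pmd() in a
later patch of this series uses them:

static pte_t pte_from_huge_pmd(struct page *page, pmd_t pmd,
			       struct vm_area_struct *vma)
{
	/* start from the vma's default protection */
	pte_t pte = mk_pte(page, vma->vm_page_prot);

	/* copy the state bits from the huge pmd */
	if (pmd_dirty(pmd))
		pte = pte_mkdirty(pte);
	if (pmd_write(pmd))
		pte = pte_mkwrite(pte);
	if (pmd_exec(pmd))
		pte = pte_mkexec(pte);
	if (pmd_young(pmd))
		pte = pte_mkyoung(pte);
	return pte;
}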

Signed-off-by: Radosław Smogura <[email protected]>
---
arch/x86/include/asm/pgtable.h | 21 +++++++++++++++++++++
1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 49afb3f..38fd008 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -265,6 +265,11 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_RW);
}

+static inline int pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY;
+}
+
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DIRTY);
@@ -285,6 +290,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_RW);
}

+static inline pmd_t pmd_writeprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_PRESENT);
@@ -731,6 +741,17 @@ static inline int pmd_write(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_RW;
}

+#define __HAVE_ARCH_PMD_EXEC
+static inline int pmd_exec(pmd_t pmd)
+{
+ return !(pmd_flags(pmd) & _PAGE_NX);
+}
+
+static inline pmd_t pmd_mkexec(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_NX);
+}
+
#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
--
1.7.3.4


2012-02-16 14:48:53

by Radosław Smogura

[permalink] [raw]
Subject: [WIP 16/18] SHM: Support for splitting on truncation

Writeback will be added in later patches, after the experimental support
for huge pages in EXT4.

Signed-off-by: Radosław Smogura <[email protected]>
---
mm/shmem.c | 39 ++++++++++++++++++++++++++++++++++++++-
1 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 97e76b9..db377bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -454,6 +454,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
+ struct page *head = NULL;

index = indices[i];
if (index > end)
@@ -464,12 +465,32 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
index, page);
continue;
}
-
if (!trylock_page(page))
continue;
+ if (PageCompound(page)) {
+ head = compound_head(page);
+ switch (compound_try_freeze(head, false)) {
+ case -1:
+ head = NULL;
+ break;
+ case 1:
+ unlock_page(page);
+ continue;
+ case 0:
+ if (!split_huge_page_file(head, page))
+ head = NULL;
+ break;
+ }
+ }
+ /* truncate_inode_page() may itself try to freeze, so unfreeze first. */
if (page->mapping == mapping) {
VM_BUG_ON(PageWriteback(page));
+ if (head != NULL)
+ compound_unfreeze(head);
truncate_inode_page(mapping, page);
+ } else {
+ if (head != NULL)
+ compound_unfreeze(head);
}
unlock_page(page);
}
@@ -511,6 +532,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
+ struct page *head = NULL;

index = indices[i];
if (index > end)
@@ -523,9 +545,24 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
}

lock_page(page);
+ if (PageCompound(page)) {
+ head = compound_head(page);
+ if (compound_freeze(head)) {
+ if (!split_huge_page_file(head, page))
+ head = NULL;
+ } else {
+ head = NULL;
+ }
+ }
+ /* truncate_inode_page() may itself try to freeze, so unfreeze first. */
if (page->mapping == mapping) {
VM_BUG_ON(PageWriteback(page));
+ if (head != NULL)
+ compound_unfreeze(head);
truncate_inode_page(mapping, page);
+ } else {
+ if (head != NULL)
+ compound_unfreeze(head);
}
unlock_page(page);
}
--
1.7.3.4

2012-02-16 23:42:38

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [WIP 11/18] Basic support (faulting) for huge pages for shmfs

OK, stupid question... where are patches 1 through 10? I'm guessing
linux-ext4 wasn't cc'ed on them?

- Ted

2012-02-17 14:12:47

by Radosław Smogura

[permalink] [raw]
Subject: Re: [WIP 11/18] Basic support (faulting) for huge pages for shmfs

On Thu, 16 Feb 2012 18:42:33 -0500, Ted Ts'o wrote:
> OK, stupid question... where are patches 1 through 10? I'm guessing
> linux-ext4 wasn't cc'ed on them?
>
> - Ted
Actually, I added those with --cc (I checked the command history). I think
the problem was on the mail server side: it first sent the first 10 patches
and then no more, so after 20 minutes I resent the patches starting from 11,
but after a few hours some of the "not sent" patches went out anyway (I had
put myself on cc, so I know). Now I see them at
http://www.spinics.net/lists/linux-ext4/.

Really sorry, I had not anticipated such behaviour.

Regards,
Radek


2012-02-17 14:41:23

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [WIP 11/18] Basic support (faulting) for huge pages for shmfs

On Fri, Feb 17, 2012 at 03:12:44PM +0100, Radosław Smogura wrote:
> On Thu, 16 Feb 2012 18:42:33 -0500, Ted Ts'o wrote:
> >OK, stupid question... where are patches 1 through 10? I'm guessing
> >linux-ext4 wasn't cc'ed on them?
> >
> > - Ted
> Actually, I added those for --cc (checked in command history). I
> think problems went from mail server, it first sent 10 first
> patches, then no more, after 20 min I resented patches from 11, but
> after few hours some "not sent" patches were sent (I putted self to
> cc, so I know). Now, I see those at
> http://www.spinics.net/lists/linux-ext4/.

Ok, so there's no difference between the patches sent yesterday with
"WIP" in the subject line, and the ones which didn't? It was just a
resend, then, correct?

- Ted

2012-02-17 14:51:11

by Radosław Smogura

[permalink] [raw]
Subject: Re: [WIP 11/18] Basic support (faulting) for huge pages for shmfs

On Fri, 17 Feb 2012 09:41:18 -0500, Ted Ts'o wrote:
> On Fri, Feb 17, 2012 at 03:12:44PM +0100, Radosław Smogura wrote:
>> On Thu, 16 Feb 2012 18:42:33 -0500, Ted Ts'o wrote:
>> >OK, stupid question... where are patches 1 through 10? I'm
>> guessing
>> >linux-ext4 wasn't cc'ed on them?
>> >
>> > - Ted
>> Actually, I added those for --cc (checked in command history). I
>> think problems went from mail server, it first sent 10 first
>> patches, then no more, after 20 min I resented patches from 11, but
>> after few hours some "not sent" patches were sent (I putted self to
>> cc, so I know). Now, I see those at
>> http://www.spinics.net/lists/linux-ext4/.
>
> Ok, so there's no difference between the patches sent yesterday with
> "WIP" in the subject line, and the ones which didn't? It was just a
> resend, then, correct?
>
> - Ted
Yes, indeed.
Regards
