From: [email protected] <[email protected]>
This patch adds the ability to pass selected "-fno-..." options to GCC
and to force -O1 optimization. The supporting files (Kconfig and Makefile
fragments) are auto-generated because of the large number of available
options. The patch is meant to help with debugging the kernel.
---
Makefile | 11 ++++
lib/Kconfig.debug | 2 +
lib/Kconfig.debug.optim | 102 ++++++++++++++++++++++++++++++++++++
scripts/Makefile.optim.inc | 23 ++++++++
scripts/debug/make_config_optim.sh | 88 +++++++++++++++++++++++++++++++
5 files changed, 226 insertions(+), 0 deletions(-)
create mode 100644 lib/Kconfig.debug.optim
create mode 100644 scripts/Makefile.optim.inc
create mode 100644 scripts/debug/make_config_optim.sh
diff --git a/Makefile b/Makefile
index 7c44b67..bc9a961 100644
--- a/Makefile
+++ b/Makefile
@@ -558,12 +558,23 @@ endif # $(dot-config)
# Defaults to vmlinux, but the arch makefile usually adds further targets
all: vmlinux
+ifdef CONFIG_HACK_OPTIM_FORCE_O1_LEVEL
+KBUILD_CFLAGS += -O1
+else
+
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
KBUILD_CFLAGS += -Os
else
KBUILD_CFLAGS += -O2
endif
+endif
+
+# Include makefile for optimization override
+ifdef CONFIG_HACK_OPTIM
+include $(srctree)/scripts/Makefile.optim.inc
+endif
+
include $(srctree)/arch/$(SRCARCH)/Makefile
ifneq ($(CONFIG_FRAME_WARN),0)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8745ac7..928265e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1274,5 +1274,7 @@ source "lib/Kconfig.kgdb"
source "lib/Kconfig.kmemcheck"
+source "lib/Kconfig.debug.optim"
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
diff --git a/lib/Kconfig.debug.optim b/lib/Kconfig.debug.optim
new file mode 100644
index 0000000..09b1012
--- /dev/null
+++ b/lib/Kconfig.debug.optim
@@ -0,0 +1,102 @@
+# This file was auto-generated; it is a helper configuration file.
+# Distributed under GPL v2 License
+
+menuconfig HACK_OPTIM
+ bool "Allows to override GCC optimization"
+ depends on DEBUG_KERNEL && EXPERIMENTAL
+ help
+ If you say Y here you will be able to override
+ how GCC optimize kernel code. This will create
+ more debug friendly, but with not guarentee
+ about same runi, like production, kernel.
+
+ If you say Y here probably You will want say
+ for all suboptions
+
+if HACK_OPTIM
+
+config HACK_OPTIM_FORCE_O1_LEVEL
+ bool "Forces -O1 optimization level"
+ ---help---
+ This will change how GCC optimizes code. Code
+ may be slower and larger but will be more debug
+ "friendly".
+
+ In some cases there is a low chance that the kernel
+ will run differently than normal, reporting or hiding
+ some bugs or errors. Refer to the GCC manual for
+ more details.
+
+ You SHOULD say N here.
+
+config HACK_OPTIM__fno_inline_functions_called_once
+ bool "Adds -fno-inline-functions-called-once parameter to gcc invoke line."
+ ---help---
+ This will change how GCC optimize code. Code
+ may be slower and larger but will be more debug
+ "friendly".
+
+ In some cases there is low chance that kernel
+ will run different then normal, reporting or not
+ some bugs or errors. Refere to GCC manual for
+ more details.
+
+ You SHOULD say N here.
+
+config HACK_OPTIM__fno_combine_stack_adjustments
+ bool "Adds -fno-combine-stack-adjustments parameter to gcc invoke line."
+ ---help---
+ This will change how GCC optimize code. Code
+ may be slower and larger but will be more debug
+ "friendly".
+
+ In some cases there is low chance that kernel
+ will run different then normal, reporting or not
+ some bugs or errors. Refere to GCC manual for
+ more details.
+
+ You SHOULD say N here.
+
+config HACK_OPTIM__fno_tree_dce
+ bool "Adds -fno-tree-dce parameter to gcc invoke line."
+ ---help---
+ This will change how GCC optimize code. Code
+ may be slower and larger but will be more debug
+ "friendly".
+
+ In some cases there is low chance that kernel
+ will run different then normal, reporting or not
+ some bugs or errors. Refere to GCC manual for
+ more details.
+
+ You SHOULD say N here.
+
+config HACK_OPTIM__fno_tree_dominator_opts
+ bool "Adds -fno-tree-dominator-opts parameter to gcc invoke line."
+ ---help---
+ This will change how GCC optimize code. Code
+ may be slower and larger but will be more debug
+ "friendly".
+
+ In some cases there is low chance that kernel
+ will run different then normal, reporting or not
+ some bugs or errors. Refere to GCC manual for
+ more details.
+
+ You SHOULD say N here.
+
+config HACK_OPTIM__fno_dse
+ bool "Adds -fno-dse parameter to gcc invoke line."
+ ---help---
+ This will change how GCC optimize code. Code
+ may be slower and larger but will be more debug
+ "friendly".
+
+ In some cases there is low chance that kernel
+ will run different then normal, reporting or not
+ some bugs or errors. Refere to GCC manual for
+ more details.
+
+ You SHOULD say N here.
+
+endif #HACK_OPTIM
diff --git a/scripts/Makefile.optim.inc b/scripts/Makefile.optim.inc
new file mode 100644
index 0000000..e78cc92
--- /dev/null
+++ b/scripts/Makefile.optim.inc
@@ -0,0 +1,23 @@
+# This file was auto-generated; it is a helper configuration file.
+# Distributed under GPL v2 License
+
+ifdef CONFIG_HACK_OPTIM__fno_inline_functions_called_once
+ KBUILD_CFLAGS += -fno-inline-functions-called-once
+endif
+
+ifdef CONFIG_HACK_OPTIM__fno_combine_stack_adjustments
+ KBUILD_CFLAGS += -fno-combine-stack-adjustments
+endif
+
+ifdef CONFIG_HACK_OPTIM__fno_tree_dce
+ KBUILD_CFLAGS += -fno-tree-dce
+endif
+
+ifdef CONFIG_HACK_OPTIM__fno_tree_dominator_opts
+ KBUILD_CFLAGS += -fno-tree-dominator-opts
+endif
+
+ifdef CONFIG_HACK_OPTIM__fno_dse
+ KBUILD_CFLAGS += -fno-dse
+endif
+
diff --git a/scripts/debug/make_config_optim.sh b/scripts/debug/make_config_optim.sh
new file mode 100644
index 0000000..26865923
--- /dev/null
+++ b/scripts/debug/make_config_optim.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+## Utility script for generating optimization override options
+## for kernel compilation.
+##
+## Distributed under GPL v2 license
+## (c) Radosław Smogura, 2011
+
+# Prefix added to generated config variable names
+CFG_PREFIX="HACK_OPTIM"
+
+KCFG="Kconfig.debug.optim"
+MKFI="Makefile.optim.inc"
+
+OPTIMIZATIONS_PARAMS="-fno-inline-functions-called-once \
+ -fno-combine-stack-adjustments \
+ -fno-tree-dce \
+ -fno-tree-dominator-opts \
+ -fno-dse "
+
+echo "# This file was auto generated. It's utility configuration" > $KCFG
+echo "# Distributed under GPL v2 License" >> $KCFG
+echo >> $KCFG
+echo "menuconfig ${CFG_PREFIX}" >> $KCFG
+echo -e "\tbool \"Allows to override GCC optimization\"" >> $KCFG
+echo -e "\tdepends on DEBUG_KERNEL && EXPERIMENTAL" >> $KCFG
+echo -e "\thelp" >> $KCFG
+echo -e "\t If you say Y here you will be able to override" >> $KCFG
+echo -e "\t how GCC optimize kernel code. This will create" >> $KCFG
+echo -e "\t more debug friendly, but with not guarentee" >> $KCFG
+echo -e "\t about same runi, like production, kernel." >> $KCFG
+echo >> $KCFG
+echo -e "\t If you say Y here probably You will want say" >> $KCFG
+echo -e "\t for all suboptions" >> $KCFG
+echo >> $KCFG
+echo "if ${CFG_PREFIX}" >> $KCFG
+echo >> $KCFG
+
+echo "# This file was auto generated. It's utility configuration" > $MKFI
+echo "# Distributed under GPL v2 License" >> $MKFI
+echo >> $MKFI
+
+# Insert standard override optimization level
+# This is an exception: this value is not included in the
+# auto-generated makefile. Support for this value is
+# hard-coded in the main Makefile.
+echo -e "config ${CFG_PREFIX}_FORCE_O1_LEVEL" >> $KCFG
+echo -e "\tbool \"Forces -O1 optimization level\"" >> $KCFG
+echo -e "\t---help---" >> $KCFG
+echo -e "\t This will change how GCC optimize code. Code" >> $KCFG
+echo -e "\t may be slower and larger but will be more debug" >> $KCFG
+echo -e "\t \"friendly\"." >> $KCFG
+echo >> $KCFG
+echo -e "\t In some cases there is low chance that kernel" >> $KCFG
+echo -e "\t will run different then normal, reporting or not" >> $KCFG
+echo -e "\t some bugs or errors. Refere to GCC manual for" >> $KCFG
+echo -e "\t more details." >> $KCFG
+echo >> $KCFG
+echo -e "\t You SHOULD say N here." >> $KCFG
+echo >> $KCFG
+
+for o in $OPTIMIZATIONS_PARAMS ; do
+ cfg_o="${CFG_PREFIX}_${o//-/_}";
+ echo "Processing param ${o} config variable will be $cfg_o";
+
+ # Generate kconfig entry
+ echo -e "config ${cfg_o}" >> $KCFG
+ echo -e "\tbool \"Adds $o parameter to gcc invoke line.\"" >> $KCFG
+ echo -e "\t---help---" >> $KCFG
+ echo -e "\t This will change how GCC optimize code. Code" >> $KCFG
+ echo -e "\t may be slower and larger but will be more debug" >> $KCFG
+ echo -e "\t \"friendly\"." >> $KCFG
+ echo >> $KCFG
+ echo -e "\t In some cases there is low chance that kernel" >> $KCFG
+ echo -e "\t will run different then normal, reporting or not" >> $KCFG
+ echo -e "\t some bugs or errors. Refere to GCC manual for" >> $KCFG
+ echo -e "\t more details." >> $KCFG
+ echo >> $KCFG
+ echo -e "\t You SHOULD say N here." >> $KCFG
+ echo >> $KCFG
+
+ # Generate the Makefile include fragment
+ echo "ifdef CONFIG_${cfg_o}" >> $MKFI
+ echo -e "\tKBUILD_CFLAGS += $o" >> $MKFI
+ echo "endif" >> $MKFI
+ echo >> $MKFI
+done;
+echo "endif #${CFG_PREFIX}" >> $KCFG
--
1.7.3.4
Compound pages are now refcounted in a way that allows tracking of tail
pages and automatically frees the compound page when all references (the
counter) fall to zero. In addition, this makes get_page and
get_page_unless_zero behave similarly, as well as put_page and
put_page_unless_zero, and makes the procedures friendlier overall. One
thing a developer must keep in mind is to be careful when a page is put
or gotten while the compound lock is held, to avoid deadlocks. Locking is
used to prevent a concurrent compound split, and only when the page
refcount goes from 0 to 1 or vice versa.

Technically, the implementation uses the 3rd element of the compound page
to store a "tail usage counter". This counter is decremented when a tail
page's count drops to zero, and bumped when a tail page is gotten from
zero usage (recovered); this keeps the usage of tail pages backward
compatible. If the "tail usage counter" falls to zero the head counter is
decremented; if the "tail usage counter" is increased to one the head
count is increased, too. For compound pages without a 3rd element (order
1, two pages) the 2nd page's _count is used in a similar way as
_tail_count is for higher-order pages.

The previous memory barrier logic already made this safe for getting the
page head, but assumed that the tail bit had been cleared.
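
For orientation, the layout described above can be restated with the field
names this patch introduces (head[1]._compound_order, head[1]._dtor,
head[3]._tail_count). The snippet below is only an illustrative sketch,
not part of the patch, and the helper name is made up:

#include <linux/mm.h>
#include <linux/printk.h>

/* Illustrative sketch: where this series keeps compound-page metadata. */
static void dump_compound_layout(struct page *head)
{
	/* head[1] holds the compound order and the destructor. */
	unsigned long order = (unsigned long)head[1]._compound_order;
	compound_page_dtor *dtor = head[1]._dtor;

	/* For order > 1, head[3]._tail_count counts the pages of the
	 * compound (head and tails) whose _count is non-zero; when it
	 * drops to zero, the extra reference held on the head is dropped.
	 */
	if (order > 1)
		pr_info("order=%lu dtor=%p pages_in_use=%d\n", order,
			(void *)dtor, atomic_read(&head[3]._tail_count));
}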
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/huge_mm.h | 21 ++--
include/linux/mm.h | 147 +++++++++++++++---------
include/linux/mm_types.h | 72 +++++++++---
include/linux/page-flags.h | 1 +
include/linux/pagemap.h | 1 -
mm/huge_memory.c | 40 +++----
mm/hugetlb.c | 3 +-
mm/internal.h | 46 --------
mm/memory.c | 2 +-
mm/page_alloc.c | 13 ++-
mm/swap.c | 275 +++++++++++++++++++++++++++++---------------
11 files changed, 373 insertions(+), 248 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1b92129..c2407e4 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -130,18 +130,17 @@ static inline int hpage_nr_pages(struct page *page)
}
static inline struct page *compound_trans_head(struct page *page)
{
- if (PageTail(page)) {
- struct page *head;
- head = page->first_page;
+ if (unlikely(PageTail(page))) {
+ void *result = page->_compound_order;
smp_rmb();
- /*
- * head may be a dangling pointer.
- * __split_huge_page_refcount clears PageTail before
- * overwriting first_page, so if PageTail is still
- * there it means the head pointer isn't dangling.
- */
- if (PageTail(page))
- return head;
+ if (PageTail(page)) {
+ if (((unsigned long) result) < 64)
+ return page - 1;
+ else
+ return (struct page *) result;
+ } else {
+ return page;
+ }
}
return page;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17b27cd..bacb023 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -267,14 +267,55 @@ struct inode;
* Also, many kernel routines increase the page count before a critical
* routine so they can be sure the page doesn't go away from under them.
*/
+extern int put_compound_head(struct page *head);
+extern int put_compound_tail(struct page *page);
-/*
+static inline void compound_lock(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bit_spin_lock(PG_compound_lock, &page->flags);
+#endif
+}
+
+static inline void compound_unlock(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bit_spin_unlock(PG_compound_lock, &page->flags);
+#endif
+}
+
+/** Gets the head of a compound page; if the page is not a tail, returns {@code page}.
+ * This function uses a memory barrier to ensure the page was not split.
+ */
+static inline struct page *compound_head(struct page *page)
+{
+ if (unlikely(PageTail(page))) {
+ void *result = page->_compound_order;
+ smp_rmb();
+ if (PageTail(page)) {
+ if (((unsigned long) result) < 64)
+ return page - 1;
+ else
+ return (struct page *) result;
+ } else {
+ return page;
+ }
+ }
+ return page;
+}
+/**
* Drop a ref, return true if the refcount fell to zero (the page has no users)
*/
static inline int put_page_testzero(struct page *page)
{
- VM_BUG_ON(atomic_read(&page->_count) == 0);
- return atomic_dec_and_test(&page->_count);
+ if (unlikely(PageCompound(page))) {
+ if (likely(PageTail(page)))
+ return put_compound_tail(page);
+ else
+ return put_compound_head(page);
+ } else {
+ return atomic_dec_and_test(&page->_count);
+ }
}
/*
@@ -317,20 +358,6 @@ static inline int is_vmalloc_or_module_addr(const void *x)
}
#endif
-static inline void compound_lock(struct page *page)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bit_spin_lock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline void compound_unlock(struct page *page)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bit_spin_unlock(PG_compound_lock, &page->flags);
-#endif
-}
-
static inline unsigned long compound_lock_irqsave(struct page *page)
{
unsigned long uninitialized_var(flags);
@@ -350,13 +377,6 @@ static inline void compound_unlock_irqrestore(struct page *page,
#endif
}
-static inline struct page *compound_head(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return page->first_page;
- return page;
-}
-
/*
* The atomic page->_mapcount, starts from -1: so that transitions
* both from it and to it can be tracked, using atomic_inc_and_test
@@ -374,33 +394,35 @@ static inline int page_mapcount(struct page *page)
static inline int page_count(struct page *page)
{
- return atomic_read(&compound_head(page)->_count);
+ return atomic_read(&page->_count);
}
-static inline void get_huge_page_tail(struct page *page)
+extern void __recover_compound(struct page *page);
+
+static inline void get_page(struct page *page)
{
- /*
- * __split_huge_page_refcount() cannot run
- * from under us.
+ /* Disallow getting any page (even a tail) if its refcount fell
+ * to zero.
*/
- VM_BUG_ON(page_mapcount(page) < 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- atomic_inc(&page->_mapcount);
+ if (likely(!PageCompound(page) || PageHead(page))) {
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_inc(&page->_count);
+ } else {
+ /* PageCompound(page) && !PageHead(page) == tail */
+ if (!get_page_unless_zero(page))
+ __recover_compound(page);
+ }
}
-extern bool __get_page_tail(struct page *page);
-
-static inline void get_page(struct page *page)
+static inline void get_huge_page_tail(struct page *page)
{
- if (unlikely(PageTail(page)))
- if (likely(__get_page_tail(page)))
- return;
/*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
+ * __split_huge_page_refcount() cannot run
+ * from under us. Hopefully the caller does not hold the compound_lock.
*/
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
- atomic_inc(&page->_count);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
+ get_page(page);
}
static inline struct page *virt_to_head_page(const void *x)
@@ -452,29 +474,22 @@ void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
int split_free_page(struct page *page);
-/*
- * Compound pages have a destructor function. Provide a
- * prototype for that function and accessor functions.
- * These are _only_ valid on the head of a PG_compound page.
- */
-typedef void compound_page_dtor(struct page *);
-
static inline void set_compound_page_dtor(struct page *page,
compound_page_dtor *dtor)
{
- page[1].lru.next = (void *)dtor;
+ page[1]._dtor = (void *)dtor;
}
static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
{
- return (compound_page_dtor *)page[1].lru.next;
+ return page[1]._dtor;
}
static inline int compound_order(struct page *page)
{
if (!PageHead(page))
return 0;
- return (unsigned long)page[1].lru.prev;
+ return (unsigned long)page[1]._compound_order;
}
static inline int compound_trans_order(struct page *page)
@@ -493,9 +508,33 @@ static inline int compound_trans_order(struct page *page)
static inline void set_compound_order(struct page *page, unsigned long order)
{
- page[1].lru.prev = (void *)order;
+ page[1]._compound_order = (void *)order;
+}
+/** Returns number of used tails (not including head). The tail is used when
+ * its {@code _count > 0}.
+ * <p>
+ * <b>Warning!</b> This operation is not atomic and does not involve any page
+ * or compound page locks. In certain cases the page may be concurrently split,
+ * so the returned number may be invalid, or may be read from a freed page.
+ * </p>
+ */
+static inline int compound_elements(struct page *page)
+{
+ if (likely(PageCompound(page))) {
+ struct page *head = compound_head(page);
+ if (likely(compound_order(head) > 1)) {
+ return atomic_add_return(0, &head[3]._tail_count);
+ } else {
+ /* This BUG flags operations racing under us. It is not
+ * a desired situation in any way :)
+ */
+ VM_BUG_ON(compound_order(head) == 0);
+ return !!atomic_add_return(0, &head[1]._count);
+ }
+ } else {
+ return page_count(page);
+ }
}
-
#ifdef CONFIG_MMU
/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cc3062..05fefae 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -24,6 +24,9 @@ struct address_space;
#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
+/** Type describing destructor of compound page. */
+typedef void compound_page_dtor(struct page *);
+
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
@@ -67,16 +70,6 @@ struct page {
* mms, to show when page is
* mapped & limit reverse map
* searches.
- *
- * Used also for tail pages
- * refcounting instead of
- * _count. Tail pages cannot
- * be mapped and keeping the
- * tail page _count zero at
- * all times guarantees
- * get_page_unless_zero() will
- * never succeed on tail
- * pages.
*/
atomic_t _mapcount;
@@ -93,9 +86,61 @@ struct page {
/* Third double word block */
union {
- struct list_head lru; /* Pageout list, eg. active_list
- * protected by zone->lru_lock !
- */
+ /** Pageout list, e.g. active_list, protected by
+ * {@code zone->lru_lock} !
+ * Valid on head pages and "single" pages.
+ */
+ struct list_head lru;
+
+ /** Represents special structures for a compound page's tail
+ * pages. Some values are specific to higher-order pages, so if
+ * a page has order e.g. 1 (two pages) then there are no fields
+ * such as head[2].
+ */
+ struct {
+ /** First union of compound page, overlaps first pointer
+ * in list_head.
+ */
+ union {
+ /* This holds an int (cast it when reading), but it must be a
+ * pointer type to keep alignment and size with the other members.
+ * <b>Valid only on head[1].</b>
+ */
+ void *_compound_order;
+
+ /** Pointer to the first page of the compound.
+ * Distinguishing a first-page pointer from a
+ * valid order relies on the observation that a
+ * page struct pointer can't take certain
+ * values. Where the 1st page's struct may live
+ * is architecture specific, but it is above
+ * address 64L. So if we see here a value
+ * less than 64L we are sure it's the 2nd page of
+ * the compound (so the first page is "this - 1").
+ * <b>Valid only on 3rd and next elements</b>
+ */
+ struct page *__first_page;
+ };
+
+ /** 2nd union of compound page, overlaps the second pointer
+ * in list_head.
+ */
+ union {
+ /** Destructor of compound page, stored in
+ * head[1].
+ */
+ compound_page_dtor *_dtor;
+
+ /** Number of pages in compound page(including
+ * head and tails) that are used (having
+ * {@code _count > 0}). If this number fell to
+ * zero, then compound page may be freed by
+ * kernel. This is stored in head[3].
+ */
+ atomic_t _tail_count;
+ };
+ };
+
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */
#ifdef CONFIG_64BIT
@@ -121,7 +166,6 @@ struct page {
spinlock_t ptl;
#endif
struct kmem_cache *slab; /* SLUB: Pointer to slab */
- struct page *first_page; /* Compound tail pages */
};
/*
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e90a673..393b8af 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -105,6 +105,7 @@ enum pageflags {
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /** For a page head it is raised to protect the page from splitting */
PG_compound_lock,
#endif
__NR_PAGEFLAGS,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cfaaa69..8ee9d13 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -159,7 +159,6 @@ static inline int page_cache_get_speculative(struct page *page)
return 0;
}
#endif
- VM_BUG_ON(PageTail(page));
return 1;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 91d3efb..e3b4c38 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1019,7 +1019,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON(!PageCompound(page));
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
out:
return page;
@@ -1050,7 +1050,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spin_unlock(&tlb->mm->page_table_lock);
tlb_remove_page(tlb, page);
pte_free(tlb->mm, pgtable);
- ret = 1;
}
} else
spin_unlock(&tlb->mm->page_table_lock);
@@ -1228,8 +1227,8 @@ static int __split_huge_page_splitting(struct page *page,
static void __split_huge_page_refcount(struct page *page)
{
int i;
+ int tail_counter;
struct zone *zone = page_zone(page);
- int tail_count = 0;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -1237,30 +1236,18 @@ static void __split_huge_page_refcount(struct page *page)
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(page);
+ tail_counter = compound_elements(page);
+
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
struct page *page_tail = page + i;
/* tail_page->_mapcount cannot change */
BUG_ON(page_mapcount(page_tail) < 0);
- tail_count += page_mapcount(page_tail);
- /* check for overflow */
- BUG_ON(tail_count < 0);
- BUG_ON(atomic_read(&page_tail->_count) != 0);
+
/*
- * tail_page->_count is zero and not changing from
- * under us. But get_page_unless_zero() may be running
- * from under us on the tail_page. If we used
- * atomic_set() below instead of atomic_add(), we
- * would then run atomic_set() concurrently with
- * get_page_unless_zero(), and atomic_set() is
- * implemented in C not using locked ops. spin_unlock
- * on x86 sometime uses locked ops because of PPro
- * errata 66, 92, so unless somebody can guarantee
- * atomic_set() here would be safe on all archs (and
- * not only on x86), it's safer to use atomic_add().
+ * tail_page->_count represents the actual number of tail pages
*/
- atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
- &page_tail->_count);
+ atomic_add(page_mapcount(page) + 1, &page_tail->_count);
/* after clearing PageTail the gup refcount can be released */
smp_mb();
@@ -1269,8 +1256,13 @@ static void __split_huge_page_refcount(struct page *page)
* retain hwpoison flag of the poisoned tail page:
* fix for the unsuitable process killed on Guest Machine(KVM)
* by the memory-failure.
+ * retain lock, and compound lock
*/
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP
+ | __PG_HWPOISON
+ | PG_locked
+ | PG_compound_lock;
+
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
@@ -1307,10 +1299,8 @@ static void __split_huge_page_refcount(struct page *page)
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
-
lru_add_page_tail(zone, page, page_tail);
}
- atomic_sub(tail_count, &page->_count);
BUG_ON(atomic_read(&page->_count) <= 0);
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
@@ -1318,6 +1308,10 @@ static void __split_huge_page_refcount(struct page *page)
ClearPageCompound(page);
compound_unlock(page);
+ /* Remove additional reference used in compound. */
+ if (tail_counter)
+ put_page(page);
+
spin_unlock_irq(&zone->lru_lock);
for (i = 1; i < HPAGE_PMD_NR; i++) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f34bd8..d3f3f30 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -577,7 +577,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
set_page_count(p, 0);
- p->first_page = page;
+ if (order > 1)
+ p->__first_page = page;
}
}
diff --git a/mm/internal.h b/mm/internal.h
index 2189af4..d071d38 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,52 +37,6 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
-static inline void __get_page_tail_foll(struct page *page,
- bool get_page_head)
-{
- /*
- * If we're getting a tail page, the elevated page->_count is
- * required only in the head page and we will elevate the head
- * page->_count and tail page->_mapcount.
- *
- * We elevate page_tail->_mapcount for tail pages to force
- * page_tail->_count to be zero at all times to avoid getting
- * false positives from get_page_unless_zero() with
- * speculative page access (like in
- * page_cache_get_speculative()) on tail pages.
- */
- VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- VM_BUG_ON(page_mapcount(page) < 0);
- if (get_page_head)
- atomic_inc(&page->first_page->_count);
- atomic_inc(&page->_mapcount);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
- if (unlikely(PageTail(page)))
- /*
- * This is safe only because
- * __split_huge_page_refcount() can't run under
- * get_page_foll() because we hold the proper PT lock.
- */
- __get_page_tail_foll(page, true);
- else {
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
- atomic_inc(&page->_count);
- }
-}
-
extern unsigned long highest_memmap_pfn;
/*
diff --git a/mm/memory.c b/mm/memory.c
index fa2f04e..a0ab73c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1522,7 +1522,7 @@ split_fallthrough:
}
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d2186ec..b48e313 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -345,15 +345,20 @@ void prep_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- set_compound_page_dtor(page, free_compound_page);
- set_compound_order(page, order);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
__SetPageTail(p);
set_page_count(p, 0);
- p->first_page = page;
+ if (order > 1)
+ p->__first_page = page;
}
+
+ /* Order, dtor was replaced in for loop, set it correctly. */
+ set_compound_order(page, order);
+ set_compound_page_dtor(page, free_compound_page);
+ if (order > 1)
+ atomic_set(&page[3]._tail_count, 0);
}
/* update __split_huge_page_refcount if you change this function */
@@ -374,7 +379,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageTail(p) || (p->first_page != page))) {
+ if (unlikely(!PageTail(p) || (compound_head(p) != page))) {
bad_page(page);
bad++;
}
diff --git a/mm/swap.c b/mm/swap.c
index fff1ff7..365363c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
+
#include "internal.h"
/* How many pages do we try to swap or page in/out together? */
@@ -64,123 +65,211 @@ static void __put_single_page(struct page *page)
free_hot_cold_page(page, 0);
}
-static void __put_compound_page(struct page *page)
+static void __free_compound_page(struct page *head)
{
compound_page_dtor *dtor;
+ VM_BUG_ON(PageTail(head));
+ VM_BUG_ON(!PageCompound(head));
- __page_cache_release(page);
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+#ifdef CONFIG_DEBUG_VM
+ /* Debug check that all tails have a zero refcount - we do not hold the
+ * lock, but we should have no refcount left, so no one should split us!
+ */
+ do {
+ unsigned long toCheck = 1 << compound_order(head);
+ unsigned long i;
+ for (i = 0; i < toCheck; i++) {
+ if (atomic_read(&head[i]._count))
+ VM_BUG_ON(atomic_read(&head[i]._count));
+ }
+ } while (0);
+#endif
+ __page_cache_release(head);
+ dtor = get_compound_page_dtor(head);
+ (*dtor)(head);
}
-static void put_compound_page(struct page *page)
+int put_compound_head(struct page *head)
{
- if (unlikely(PageTail(page))) {
- /* __split_huge_page_refcount can run under us */
- struct page *page_head = compound_trans_head(page);
+ VM_BUG_ON(PageTail(head));
- if (likely(page != page_head &&
- get_page_unless_zero(page_head))) {
- unsigned long flags;
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
- VM_BUG_ON(PageHead(page_head));
- if (put_page_testzero(page_head))
- __put_single_page(page_head);
- out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON(page_head != page->first_page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on
- * the compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON(page_mapcount(page) <= 0);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- compound_unlock_irqrestore(page_head, flags);
- if (put_page_testzero(page_head)) {
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
+ if (atomic_dec_and_test(&head->_count)) {
+ /* We have put the head and its refcount fell to zero.
+ *
+ * head->_count may be bumped only in the following situations:
+ * 1. get_page - this should not happen; there is a VM_BUG_ON
+ * for this situation.
+ * 2. __recover_page - bumps head->_count, but only after
+ * get_page_unless_zero, so only one can be the winner, because
+ * __recover_page bumps if head->_count > 0, then at this
+ * point head->_count will be 1 - contradiction.
+ */
+ if (PageCompound(head))
+ __free_compound_page(head);
+ else
+ __put_single_page(head);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(put_compound_head);
+
+int put_compound_tail(struct page *page)
+{
+ unsigned long flags;
+ VM_BUG_ON(PageHead(page));
+
+ /* We first need to test whether we would drop the reference to zero.
+ * If we dropped it to zero (e.g. via atomic_dec_and_test), the split
+ * code could take the compound lock before us while we have already
+ * decreased _tail_count. With an improper _tail_count, "split" may not
+ * decrease the head refcount, and the head page would leak.
+ */
+ if (__atomic_add_unless(&page->_count, -1, 1) == 1) {
+ struct page *head = compound_head(page);
+
+ VM_BUG_ON(!atomic_read(&page->_count));
+
+ if (!get_page_unless_zero(head)) {
+ /* Page was split or freed - nothing to do */
+ __put_single_page(page);
+ return 1;
+ }
+
+ flags = compound_lock_irqsave(head);
+
+ /* Holding the exclusive lock, check whether we put the page to 0;
+ * meanwhile others could have called get_page. This is double-checked locking.
+ */
+ if (!atomic_dec_and_test(&page->_count)) {
+ compound_unlock_irqrestore(head, flags);
+ put_page(head);
+ return 0;
+ }
+
+ if (!PageCompound(page)) {
+ /* Page was split. */
+ compound_unlock_irqrestore(head, flags);
+ put_page(head);
+ __put_single_page(page);
+ return 1;
+ }
+
+ /* Page is compound. */
+ if (compound_order(head) > 1) {
+ if (atomic_dec_and_test(
+ (atomic_t *) &head[3]._tail_count)) {
+ /* Tail count has fallen to zero. No one may
+ * concurrently recover the page, because we hold
+ * the compound_lock, so &head[3]._tail_count
+ * is managed only by us; because of this
+ * no one may recover a tail page.
+ *
+ * This drops usage count for tail pages.
+ */
+ atomic_dec(&head->_count);
+
+ /* At least one ref should exist. */
+ VM_BUG_ON(!atomic_read(&head->_count));
+
+ /* and this one for get_page_unless_zero(head)*/
+ if (atomic_dec_and_test(&head->_count)) {
+ /* Put the last ref - now no one may get
+ * head. Details in put_compound_head
+ */
+ compound_unlock_irqrestore(head, flags);
+ __free_compound_page(head);
+ return 1;
+ } else {
+ compound_unlock_irqrestore(head, flags);
+ return 1;
+ }
}
} else {
- /* page_head is a dangling pointer */
- VM_BUG_ON(PageTail(page));
- goto out_put_single;
+ /* Almost same as for order >= 2. */
+ if (atomic_dec_and_test(&head->_count)) {
+ compound_unlock_irqrestore(head, flags);
+ __free_compound_page(head);
+ }
}
- } else if (put_page_testzero(page)) {
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
+ /* One ref is "managed by" _tail_count, so head->_count >= 2. */
+ atomic_dec(&head->_count);
+ compound_unlock_irqrestore(head, flags);
+ return 1;
}
+ return 1;
}
+EXPORT_SYMBOL(put_compound_tail);
void put_page(struct page *page)
{
- if (unlikely(PageCompound(page)))
- put_compound_page(page);
- else if (put_page_testzero(page))
+ if (unlikely(PageCompound(page))) {
+ if (likely(PageTail(page)))
+ put_compound_tail(page);
+ else
+ put_compound_head(page);
+ } else if (put_page_testzero(page)) {
__put_single_page(page);
+ }
}
EXPORT_SYMBOL(put_page);
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
+void __recover_compound(struct page *page)
{
- /*
- * This takes care of get_page() if run on a tail page
- * returned by one of the get_user_pages/follow_page variants.
- * get_user_pages/follow_page itself doesn't need the compound
- * lock because it runs __get_page_tail_foll() under the
- * proper PT lock that already serializes against
- * split_huge_page().
- */
unsigned long flags;
- bool got = false;
- struct page *page_head = compound_trans_head(page);
+ struct page *head = compound_head(page);
+
+ if (get_page_unless_zero(head)) {
+ flags = compound_lock_irqsave(head);
+ if (!PageCompound(head)) {
+ /* Page was splitted under us. */
+ compound_unlock_irqrestore(head, flags);
+ put_page(head);
+ return;
+ }
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
+ /* Now the page can't be split, because we hold the lock; we
+ * exclusively manage _tail_count, too. Head->_count >= 2.
*/
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
+ if (likely(compound_order(head) > 1)) {
+ /* If put_page is called here, we may bump
+ * _tail_count, but that tail count will be dropped
+ * again by put_page, because it waits for
+ * the compound_lock.
+ */
+ if (atomic_add_return(1, &page->_count) > 1) {
+ /* Page was recovered by someone else,
+ * before we took the compound lock.
+ * Nothing to do.
+ */
+ } else {
+ /* If put_page was called here, then it waits
+ * for the compound_lock, and will immediately
+ * decrease _tail_count.
+ */
+ if (atomic_add_return(1,
+ &head[3]._tail_count) == 1) {
+ /* _tail_count was 0, bump head. */
+ atomic_inc(&head->_count);
+ }
+ }
+ } else {
+ if (!(atomic_add_return(1, &page->_count) > 1)) {
+ /* Page wasn't recovered by someone else,
+ * before we took the compound lock.
+ */
+ atomic_inc(&head->_count);
+ }
}
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
+ compound_unlock_irqrestore(head, flags);
+ put_page(head);
+ } else {
+ /* If the compound head fell to zero, the whole page was
+ * split - fall back to a normal get_page. */
+ get_page(page);
}
- return got;
}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__recover_compound);
/**
* put_pages_list() - release a list of pages
@@ -598,7 +687,7 @@ void release_pages(struct page **pages, int nr, int cold)
spin_unlock_irqrestore(&zone->lru_lock, flags);
zone = NULL;
}
- put_compound_page(page);
+ put_page(page);
continue;
}
--
1.7.3.4
Adds a compound usage count for pages of higher order. This change is
required to add faster locking techniques than compound_lock, and to
prevent deadlocks while operating on a compound page.
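
Roughly, the intended usage pattern of the helpers added below is the
following (an illustrative sketch only; error paths and the caller's
existing page reference are assumed):

#include <linux/mm.h>

/* Illustrative sketch: a reader pins the compound page as a whole, while
 * a splitter freezes it, which acts as a write barrier against readers.
 */
static void compound_usage_example(struct page *head)
{
	/* Reader side: pin the compound page "as a whole". */
	if (compound_get(head)) {
		/* ... the page cannot be split while the usage is held ... */
		compound_put(head);
	}

	/* Splitter side: freeze before splitting. */
	if (compound_freeze(head)) {
		/* ... split or rearrange the compound page here ... */
		compound_unfreeze(head);	/* only if it was NOT split */
	}
}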
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/mm.h | 127 +++++++++++++++++++++++++++++++++++++++++++---
include/linux/mm_types.h | 15 +++++-
mm/page_alloc.c | 4 +-
mm/swap.c | 58 +++++++++++++++++++--
4 files changed, 191 insertions(+), 13 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bacb023..72f6a50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -284,6 +284,126 @@ static inline void compound_unlock(struct page *page)
#endif
}
+static inline int compound_order(struct page *page)
+{
+ if (!PageHead(page))
+ return 0;
+ return (unsigned long)page[1]._compound_order;
+}
+
+/** Gets the usage count for a compound page.
+ * This involves the compound_lock, so do not call it with the compound lock
+ * already held.
+ * @return 1 - success, 0 - page was split.
+ */
+static inline int compound_get(struct page *head)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(PageTail(head));
+repeat:
+ if (!PageHead(head))
+ return 0;
+
+ VM_BUG_ON(!atomic_read(&head->_count));
+ VM_BUG_ON(compound_order(head) < 2);
+
+ compound_lock(head);
+ if (unlikely(!PageHead(head))) {
+ compound_unlock(head);
+ return 0;
+ }
+
+ if (atomic_inc_not_zero(&head[2]._compound_usage)) {
+ compound_unlock(head);
+ return 1;
+ } else {
+ compound_unlock(head);
+ goto repeat;
+ }
+#else
+ return 0;
+#endif
+}
+
+/** Decreases the compound usage count.
+ * This involves the compound_lock, so do not call it with the compound lock
+ * already held.
+ */
+extern void compound_put(struct page *head);
+
+
+
+/** Tries to freeze a compound page. If upgrade_lock is true the function tries
+ * to <b>exchange</b> a "gotten" page for a "frozen" one (so after unfreeze the
+ * page will be "not used"); the caller must hold the page exactly once. If
+ * upgrade_lock is false then the page must be "not gotten".
+ *
+ * @return 0 - success, -1 - split, 1 - can't freeze, but not split
+ */
+static inline int compound_try_freeze(struct page *head, int upgrade_lock)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int expected_usage;
+
+ VM_BUG_ON(PageTail(head));
+ VM_BUG_ON(compound_order(head) < 2);
+ VM_BUG_ON(!atomic_read(&head->_count));
+ VM_BUG_ON(upgrade_lock && atomic_read(&head[2]._compound_usage) == 1);
+
+ if (!PageHead(head))
+ return 0;
+
+ compound_lock(head);
+ if (!upgrade_lock) {
+ /* Not needed when upgrading: a gotten page cannot be split,
+ * so GCC can make that path faster.
+ */
+ if (unlikely(!PageHead(head))) {
+ compound_unlock(head);
+ return -1;
+ }
+ }
+
+ expected_usage = upgrade_lock ? 2 : 1;
+ if (atomic_cmpxchg(&head[2]._compound_usage, expected_usage, 0) == 1) {
+ compound_unlock(head);
+ return 0;
+ } else {
+ compound_unlock(head);
+ return 1;
+ }
+#else
+ return 0;
+#endif
+}
+
+/** Freezes a compound page (acts like a write barrier).
+ * This involves the compound_lock, so do not call it with the compound lock
+ * already held.
+ *
+ * @return 1 - success, 0 - page was split.
+ */
+static inline int compound_freeze(struct page *head)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+repeat:
+ switch (compound_try_freeze(head, false)) {
+ case 0:
+ return 1;
+ case -1:
+ return 0;
+ default:
+ goto repeat;
+ }
+#else
+ return 1;
+#endif
+}
+
+/** Unfreezes compound page.
+ * Do not call this after you have split the page or you may corrupt memory.
+ */
+extern void compound_unfreeze(struct page *head);
+
/** Gets the head of a compound page; if the page is not a tail, returns {@code page}.
* This function uses a memory barrier to ensure the page was not split.
*/
@@ -485,13 +605,6 @@ static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
return page[1]._dtor;
}
-static inline int compound_order(struct page *page)
-{
- if (!PageHead(page))
- return 0;
- return (unsigned long)page[1]._compound_order;
-}
-
static inline int compound_trans_order(struct page *page)
{
int order;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 05fefae..7649722 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -117,7 +117,8 @@ struct page {
* address 64L. So if we see here a value
* less than 64L we are sure it's the 2nd page of
* the compound (so the first page is "this - 1").
- * <b>Valid only on 3rd and next elements</b>
+ * <b>Valid only on 3rd and next elements,
+ * head[2], head[3]...</b>
*/
struct page *__first_page;
};
@@ -131,6 +132,18 @@ struct page {
*/
compound_page_dtor *_dtor;
+ /** Usage count of the compound page "as a whole".
+ * This is more of a split barrier than anything
+ * else. A compound page with order greater
+ * than 1 should start with this value set to
+ * {@code 1} - meaning no lock; locking the page
+ * for reading is done by bumping the counter if
+ * it is not zero, locking for splitting by setting
+ * it to zero when the counter's value is {@code 1}.
+ * <b>Valid only on the 3rd element (head[2]).</b>
+ */
+ atomic_t _compound_usage;
+
/** Number of pages in compound page(including
* head and tails) that are used (having
* {@code _count > 0}). If this number fell to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b48e313..bbdd94e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,8 +357,10 @@ void prep_compound_page(struct page *page, unsigned long order)
/* Order, dtor was replaced in for loop, set it correctly. */
set_compound_order(page, order);
set_compound_page_dtor(page, free_compound_page);
- if (order > 1)
+ if (order > 1) {
atomic_set(&page[3]._tail_count, 0);
+ atomic_set(&page[2]._compound_usage, 1);
+ }
}
/* update __split_huge_page_refcount if you change this function */
diff --git a/mm/swap.c b/mm/swap.c
index 365363c..ded81c9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -104,10 +104,17 @@ int put_compound_head(struct page *head)
* __recover_page bumps if head->_count > 0, then at this
* point head->_count will be 1 - contradiction.
*/
- if (PageCompound(head))
- __free_compound_page(head);
- else
+ smp_rmb();
+ if (PageCompound(head)) {
+ if (compound_order(head) > 1) {
+ if (atomic_read(&head[2]._compound_usage) == 1)
+ __free_compound_page(head);
+ } else {
+ __free_compound_page(head);
+ }
+ } else {
__put_single_page(head);
+ }
return 1;
}
return 0;
@@ -173,7 +180,9 @@ int put_compound_tail(struct page *page)
VM_BUG_ON(!atomic_read(&head->_count));
/* and this one for get_page_unless_zero(head)*/
- if (atomic_dec_and_test(&head->_count)) {
+ if (atomic_dec_and_test(&head->_count) &&
+ (atomic_read(&head[2]._compound_usage)
+ == 1)) {
/* Put the last ref - now no one may get
* head. Details in put_compound_head
*/
@@ -201,6 +210,47 @@ int put_compound_tail(struct page *page)
}
EXPORT_SYMBOL(put_compound_tail);
+extern void compound_put(struct page *head)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(PageTail(head));
+ /* Bug if the page was split. */
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(!atomic_read(&head[2]._compound_usage));
+ VM_BUG_ON(compound_order(head) < 2);
+ compound_lock(head);
+ if (atomic_add_return(-1, &head[2]._compound_usage) == 1) {
+ if (!atomic_read(&head->_count)) {
+ compound_unlock(head);
+ __free_compound_page(head);
+ return;
+ }
+ }
+ compound_unlock(head);
+#endif
+}
+
+extern void compound_unfreeze(struct page *head)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(PageTail(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage));
+ VM_BUG_ON(compound_order(head) < 2);
+
+ /* It's quite important to check, during the "experimental" phase, that
+ * unfreeze is not called on a split page (the counter overlaps lru, so
+ * this may cause problems).
+ */
+ BUG_ON(!PageCompound(head));
+ compound_lock(head);
+ atomic_set(&head[2]._compound_usage, 1);
+ if (!atomic_read(&head->_count)) {
+ compound_unlock(head);
+ __free_compound_page(head);
+ return;
+ }
+ compound_unlock(head);
+#endif
+}
+
void put_page(struct page *page)
{
if (unlikely(PageCompound(page))) {
--
1.7.3.4
This flag informs the caller that the page is about to be split; the
caller should not depend on the page being compound while mapping it.

In the first approach we assumed the page may be split concurrently
during operations that make compound cache pages incoherent with the
mapping indices - e.g. when someone replaces the 1st page in the page
cache while a huge page exists at that place.

The problem with the above is that if two threads make such a change,
they may deadlock against each other. It is quite likely that both will
hold the page lock and then need to acquire the compound lock (the
compound lock will be acquired after the page lock in many situations,
probably during split), while the split method should acquire the page
lock for each tail page as it changes some important flags.

This is mainly dictated by the LRU: we can't use tail->lru as it stores
compound data; on the other hand some code depends on PageLRU, which
should evolve too. Even though it is quite reasonable for tail pages to
have the LRU flag set, isolate_lru_pages in particular should BUG on a
tail page.
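
The intended caller-side check is roughly the following (an illustrative
sketch only; the helper name is hypothetical):

#include <linux/mm.h>

/* Illustrative sketch: do not rely on compound layout once a split is queued. */
static bool can_map_as_huge(struct page *head)
{
	if (PageSplitDeque(head))
		return false;	/* split queued or in progress: map pages one by one */

	return PageTransCompound(head);
}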
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/page-flags.h | 31 +++++++++++++++++++++++++++++++
1 files changed, 31 insertions(+), 0 deletions(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 393b8af..0d17a6f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -108,6 +108,18 @@ enum pageflags {
/** For a page head it is raised to protect the page from splitting */
PG_compound_lock,
#endif
+#ifdef CONFIG_HUGEPAGECACHE
+ /** Set on the head to inform that the page is subject to a split, but
+ * the split has not started yet or is in progress. When this flag is
+ * set the caller should not believe that this compound page represents
+ * "continuous data".<br/>
+ * It is currently used for the huge page cache and file-based mappings,
+ * and it informs that the compound page doesn't represent a continuous
+ * region of a file; in particular some pages, including the head, may be
+ * removed or replaced in the cache by other pages, or may be "invalid".
+ */
+ PG_split_deque,
+#endif
__NR_PAGEFLAGS,
/* Filesystems */
@@ -433,6 +445,25 @@ static inline int PageTransCompound(struct page *page)
}
#endif
+#ifdef CONFIG_HUGEPAGECACHE
+TESTPAGEFLAG(SplitDeque, split_deque);
+TESTSETFLAG(SplitDeque, split_deque);
+TESTCLEARFLAG(SplitDeque, split_deque);
+#else
+static inline int PageSplitDeque(struct page *page)
+{
+ return 0;
+}
+static inline int TestClearPageSplitDeque(struct page *page)
+{
+ return 0;
+}
+static inline int TestSetPageSplitDeque(struct page *page)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_MMU
#define __PG_MLOCKED (1 << PG_mlocked)
#else
--
1.7.3.4
Signed-off-by: Radosław Smogura <[email protected]>
---
mm/vmscan.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c52b235..7299b71 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -636,7 +636,7 @@ void putback_lru_page(struct page *page)
int was_unevictable = PageUnevictable(page);
VM_BUG_ON(PageLRU(page));
-
+ VM_BUG_ON(PageTail(page));
redo:
ClearPageUnevictable(page);
@@ -1177,6 +1177,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
prefetchw_prev_lru_page(page, src, flags);
VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON(PageTail(page));
switch (__isolate_lru_page(page, mode, file)) {
case 0:
@@ -1239,6 +1240,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
unsigned int isolated_pages;
+ VM_BUG_ON(PageTail(cursor_page));
+
mem_cgroup_lru_del(cursor_page);
list_move(&cursor_page->lru, dst);
isolated_pages = hpage_nr_pages(cursor_page);
--
1.7.3.4
Changes migrate_pages to a more flexible form, allowing more complex
usage than an LRU list and more advanced page management during
migration.

These changes are designed for the Huge Page Cache, to safely pass and
migrate pages to a new place, in particular allowing locked and
already-gotten pages to be passed in.

The new implementation uses a configuration structure with various
"life-cycle" methods, used as callbacks for getting the next page,
getting the new (target) page, and notifying the result.
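
A caller would wire the callbacks up roughly as follows (an illustrative
sketch only; the callback bodies and alloc_target_page() are hypothetical
placeholders):

#include <linux/migrate.h>

/* Illustrative sketch of driving migrate_pages_cb() through migration_ctl. */
static struct page *my_next_page(struct migration_ctl *ctl, page_mode *mode)
{
	*mode = PAGE_LOCKED;	/* pages are handed over already locked */
	return NULL;		/* return each source page in turn; NULL ends the loop */
}

static struct page *my_new_page(struct page *old, struct migration_ctl *ctl)
{
	return alloc_target_page(old);	/* hypothetical target allocator */
}

static void my_result(struct page *old, struct page *new,
		      struct migration_ctl *ctl, int result)
{
	/* put back or free the old and new pages here; result != 0 means failure */
}

static void migrate_example(void)
{
	struct migration_ctl ctl = {
		.getNextPage	= my_next_page,
		.getNewPage	= my_new_page,
		.notifyResult	= my_result,
	};

	migrate_pages_cb(&ctl, false, MIGRATE_SYNC | MIGRATE_SRC_GETTED);
}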
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/migrate.h | 52 ++++++++++++++++++++++++++++++++++++++++++
include/linux/migrate_mode.h | 8 ++++--
mm/migrate.c | 48 ++++++++++++++++++++++++++++++++++++--
3 files changed, 102 insertions(+), 6 deletions(-)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 05ed282..0438aff 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,8 +5,42 @@
#include <linux/mempolicy.h>
#include <linux/migrate_mode.h>
+struct migration_ctl;
+
+typedef enum {
+ PAGE_LOCKED = (1 << 0)
+} page_mode;
+
+/** Kept for the simplified, backward-compatible, list-based migrate_pages */
typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *mig_page_new_t(struct page *, struct migration_ctl *);
+
+typedef struct page *mig_page_next_t(struct migration_ctl *, page_mode *mode);
+
+typedef void mig_page_result_t(struct page *oldPage, struct page *newPage,
+ struct migration_ctl *ctl, int result);
+
+/** Control for extended migration support. */
+struct migration_ctl {
+ /** Attach some private data if you need one. */
+ unsigned long privateData;
+
+ /** Will be called to get the next page for migration; {@code NULL} means
+ * the end of the migration. In certain cases the function may return the
+ * same page twice or more, depending on migration success.
+ */
+ mig_page_next_t *getNextPage;
+
+ /** Will be called after getNextPage to get target page. */
+ mig_page_new_t *getNewPage;
+
+ /** Called after migration of a page has ended, regardless of success or
+ * failure. This function is responsible for the cleanup, etc.
+ */
+ mig_page_result_t *notifyResult;
+};
+
#ifdef CONFIG_MIGRATION
#define PAGE_MIGRATION 1
@@ -16,6 +50,24 @@ extern int migrate_page(struct address_space *,
extern int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
enum migrate_mode mode);
+
+/** Callback version of migrate_pages.
+ *
+ * Instead of taking pages from a passed list, callbacks are used
+ * to get the next page and the new page, and to notify the result. If the
+ * obtained old page came with the PAGE_LOCKED flag it will not be unlocked.<br/>
+ * The caller is responsible for cleaning up (putting back, if desired) the old
+ * and new pages.<br/>
+ * The function has the following pseudo call flow:
+ * while ({@link migration_ctl.getNextPage}) <br/>
+ * if ({@link migration_ctl.getNewPage} != null) {
+ * internal_processing(...);
+ * {@link migration_ctl.notifyResult};
+ * }
+ */
+extern void migrate_pages_cb(struct migration_ctl *ctl, bool offlining,
+ enum migrate_mode mode);
+
extern int migrate_huge_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
enum migrate_mode mode);
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index ebf3d89..3256eda 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -8,9 +8,11 @@
* MIGRATE_SYNC will block when migrating pages
*/
enum migrate_mode {
- MIGRATE_ASYNC,
- MIGRATE_SYNC_LIGHT,
- MIGRATE_SYNC,
+ MIGRATE_ASYNC = 1 << 0,
+ MIGRATE_SYNC_LIGHT = 1 << 1,
+ MIGRATE_SYNC = 1 << 2,
+ /** The source page has already been gotten (referenced) by the caller. */
+ MIGRATE_SRC_GETTED = 1 << 3
};
#endif /* MIGRATE_MODE_H_INCLUDED */
diff --git a/mm/migrate.c b/mm/migrate.c
index df141f6..456f680 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -273,6 +273,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
* 1 for anonymous pages without a mapping
* 2 for pages with a mapping
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
+ * {@code +1} if mode has MIGRATE_SRC_GETTED set
*/
static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
@@ -294,6 +295,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
page_index(page));
expected_count = 2 + page_has_private(page);
+ if (mode & MIGRATE_SRC_GETTED)
+ expected_count++;
+
if (page_count(page) != expected_count ||
radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
@@ -675,6 +679,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
}
static int __unmap_and_move(struct page *page, struct page *newpage,
+ page_mode pageMode, struct migration_ctl *ctl,
int force, bool offlining, enum migrate_mode mode)
{
int rc = -EAGAIN;
@@ -683,6 +688,9 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
+ if (pageMode & PAGE_LOCKED)
+ goto skip_lock;
+
if (!trylock_page(page)) {
if (!force || mode == MIGRATE_ASYNC)
goto out;
@@ -706,6 +714,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
lock_page(page);
}
+skip_lock:
/*
* Only memory hotplug's offline_pages() caller has locked out KSM,
* and can safely migrate a KSM page. The other cases have skipped
@@ -830,11 +839,17 @@ out:
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
struct page *page, int force, bool offlining,
+ page_mode pageMode, struct migration_ctl *ctl,
enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ struct page *newpage;
+
+ if (ctl)
+ newpage = ctl->getNewPage(page, ctl);
+ else
+ newpage = get_new_page(page, private, &result);
if (!newpage)
return -ENOMEM;
@@ -850,7 +865,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (unlikely(split_huge_page(page)))
goto out;
- rc = __unmap_and_move(page, newpage, force, offlining, mode);
+ rc = __unmap_and_move(page, newpage, pageMode, ctl,
+ force, offlining, mode);
+
+ if (ctl) {
+ ctl->notifyResult(page, newpage, ctl, rc);
+ goto skip_self_clean;
+ }
out:
if (rc != -EAGAIN) {
/*
@@ -875,6 +896,8 @@ out:
else
*result = page_to_nid(newpage);
}
+
+skip_self_clean:
return rc;
}
@@ -987,7 +1010,7 @@ int migrate_pages(struct list_head *from,
rc = unmap_and_move(get_new_page, private,
page, pass > 2, offlining,
- mode);
+ 0, NULL, mode);
switch(rc) {
case -ENOMEM:
@@ -1015,6 +1038,25 @@ out:
return nr_failed + retry;
}
+extern void migrate_pages_cb(struct migration_ctl *ctl, bool offlining,
+ enum migrate_mode migrationMode)
+{
+ struct page *page;
+ page_mode pageMode;
+ const int swapwrite = current->flags & PF_SWAPWRITE;
+
+ if (!swapwrite)
+ current->flags |= PF_SWAPWRITE;
+
+ while ((page = ctl->getNextPage(ctl, &pageMode)))
+ unmap_and_move(NULL, 0, page, 0, offlining, pageMode, ctl,
+ migrationMode);
+
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+}
+
int migrate_huge_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
enum migrate_mode mode)
--
1.7.3.4
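A usage sketch (not part of the patch): the example below shows how a caller
could drive the callback interface added above, handing out its own isolated
source pages and preallocated destination pages. The struct migration_ctl
definition comes from an earlier patch in this series; the field names
(getNextPage, getNewPage, notifyResult, privateData), the page_mode type and
the designated-initializer layout are assumed here from their use in
mm/defrag-pagecache.c later in the thread.

#include <linux/migrate.h>
#include <linux/mm.h>

struct my_mig_state {
	struct page **src;	/* isolated source pages, extra reference held */
	struct page **dst;	/* preallocated destination pages */
	unsigned long nr;
	unsigned long next;
};

/* Hand out the next source page; pages here are not pre-locked. */
static struct page *my_get_next(struct migration_ctl *ctl, page_mode *mode)
{
	struct my_mig_state *st = (struct my_mig_state *) ctl->privateData;

	if (st->next >= st->nr)
		return NULL;
	*mode = 0;
	return st->src[st->next];
}

/* Supply the destination page matching the current source page. */
static struct page *my_get_new(struct page *old, struct migration_ctl *ctl)
{
	struct my_mig_state *st = (struct my_mig_state *) ctl->privateData;

	return st->dst[st->next];
}

/* Own the cleanup of both pages, as the comment on migrate_pages_cb requires. */
static void my_notify(struct page *old, struct page *new,
		      struct migration_ctl *ctl, int rc)
{
	struct my_mig_state *st = (struct my_mig_state *) ctl->privateData;

	if (!rc)
		st->next++;	/* advance only on success */
	/* Same pattern as mm/defrag-pagecache.c uses for its source pages:
	 * drop the isolation reference and the caller's own reference. */
	putback_lru_page(old);
	put_page(old);
}

static void my_migrate(struct my_mig_state *st)
{
	struct migration_ctl ctl = {
		.getNextPage	= my_get_next,
		.getNewPage	= my_get_new,
		.notifyResult	= my_notify,
		.privateData	= (unsigned long) st,
	};

	/* MIGRATE_SRC_GETTED because we hold our own reference to each page */
	migrate_pages_cb(&ctl, false, MIGRATE_SYNC | MIGRATE_SRC_GETTED);
}

migrate_pages_cb() keeps calling getNextPage() until it returns NULL, so the
state behind privateData is the natural place to track progress; notifyResult()
owns the cleanup of both pages, matching the comment on the declaration.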
Adds config options for enabling the huge page cache and for enabling
it in shmfs (tmpfs).
Signed-off-by: Radosław Smogura <[email protected]>
---
init/Kconfig | 6 ++++++
mm/Kconfig | 11 +++++++++++
2 files changed, 17 insertions(+), 0 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index 3f42cd6..a58b622 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1135,6 +1135,12 @@ config SHMEM
option replaces shmem and tmpfs with the much simpler ramfs code,
which may be appropriate on small systems without swap.
+config SHMEM_HUGEPAGECACHE
+ bool "Allow usage of transparent huge pages"
+ depends on HUGEPAGECACHE && SHMEM
+ help
+ This allows usage of huge pages in shmfs (tmpfs).
+
config AIO
bool "Enable AIO support" if EXPERT
default y
diff --git a/mm/Kconfig b/mm/Kconfig
index e338407..494122d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -349,6 +349,17 @@ choice
benefit.
endchoice
+config HUGEPAGECACHE
+ bool "Support for huge pages in page cache"
+ depends on TRANSPARENT_HUGEPAGE
+ select COMPACTION
+ help
+ Huge pages in the page cache allow transparent use of huge
+ pages in file-mapped regions. This option just exports the
+ required interfaces; you will need to enable huge page cache
+ support for particular filesystems.
+ Currently only shmfs supports huge pages in the page cache.
+
#
# UP and nommu archs use km based percpu allocator
#
--
1.7.3.4
These are generic routines with support for SHMFS (TMPFS).
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/defrag-pagecache.h | 62 +++++
include/linux/fs.h | 23 ++
mm/Makefile | 1 +
mm/defrag-pagecache.c | 489 ++++++++++++++++++++++++++++++++++++++
4 files changed, 575 insertions(+), 0 deletions(-)
create mode 100644 include/linux/defrag-pagecache.h
create mode 100644 mm/defrag-pagecache.c
diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
new file mode 100644
index 0000000..46793de
--- /dev/null
+++ b/include/linux/defrag-pagecache.h
@@ -0,0 +1,62 @@
+/*
+ * linux/include/linux/defrag-pagecache.h
+ *
+ * Defragments pagecache into compound pages
+ *
+ * (c) 2011 Radosław Smogura
+ */
+
+#ifndef DEFRAG_PAGECACHE_H
+#define DEFRAG_PAGECACHE_H
+#include <linux/fs.h>
+
+/* XXX Split this file into two parts, public and protected - comments below.
+ * The protected part will contain
+ * declarations of generic and helper methods for filesystem developers,
+ * the public part just general structures and controls.
+ */
+struct file;
+struct inode;
+struct defrag_pagecache_ctl;
+struct address_space;
+
+typedef struct page *defrag_generic_get_page(
+ const struct defrag_pagecache_ctl *ctl, struct inode *inode,
+ pgoff_t pageIndex);
+
+/** Passes additional information and controls to page defragmentation. */
+struct defrag_pagecache_ctl {
+ /** If set, defragmentation will try to fill the page cache. */
+ char fillPages:1;
+
+ /** If filling a page fails, defragmentation will fail too. Setting
+ * this requires {@link #fillPages} to be set as well.
+ */
+ char requireFillPages:1;
+
+ /** If set, defragmentation will use force in many respects; this may
+ * cause the operation to run longer, but with a greater probability of
+ * success. */
+ char force:1;
+};
+
+/** Defragments the page cache of the specified file and migrates it to huge pages.
+ *
+ * @param f
+ * @param offset
+ * @param size
+ * @return
+ */
+extern int defragPageCache(struct file *f, unsigned long offset,
+ unsigned long size, const struct defrag_pagecache_ctl *defragCtl);
+
+/** Tries to fix up huge page mappings by walking through the given Transparent
+ * Huge Page. */
+extern int thpFixMappings(struct page *hugePage);
+
+extern int defrag_generic_shm(struct file *file, struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ struct defrag_pagecache_ctl *ctl);
+#endif /* DEFRAG_PAGECACHE_H */
+
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 386da09..bfd9122 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -11,6 +11,10 @@
#include <linux/blk_types.h>
#include <linux/types.h>
+#ifdef CONFIG_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
+
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
* the file limit at runtime and only root can increase the per-process
@@ -602,6 +606,25 @@ struct address_space_operations {
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
+#ifdef CONFIG_HUGEPAGECACHE
+ /** Used to defrag (migrate) pages at position {@code pos}
+ * to huge pages. Having this not {@code NULL} will indicate that the
+ * address space, generally, supports huge pages (a transparent
+ * huge page may be established).
+ *
+ * It's like migrate pages, but different :)
+ *
+ * @param pagep on success will be set to the established huge page
+ *
+ * @returns TODO What to return?
+ * {@code 0} on success, a value less than {@code 0} on error
+ */
+ int (*defragpage) (struct file *, struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ const struct defrag_pagecache_ctl *ctl);
+#endif
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00e..75389c8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_HUGEPAGECACHE) += defrag-pagecache.o
\ No newline at end of file
diff --git a/mm/defrag-pagecache.c b/mm/defrag-pagecache.c
new file mode 100644
index 0000000..5a14fe8
--- /dev/null
+++ b/mm/defrag-pagecache.c
@@ -0,0 +1,489 @@
+/*
+ * linux/mm/defrag-pagecache.c
+ *
+ * Defragments pagecache into compound pages
+ *
+ * (c) 2011 Radosław Smogura
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <asm/pgtable.h>
+#include <linux/migrate.h>
+#include <linux/defrag-pagecache.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/rmap.h>
+#include <linux/page-flags.h>
+#include <linux/shmem_fs.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+/*#include <linux/pgtable_helper.h>*/
+
+struct migration_private {
+ loff_t startIndex;
+ pgoff_t nextIndex;
+ pgoff_t pagesToMigrateCount;
+
+ struct page *hugePage;
+ struct inode *inode;
+
+ const struct defrag_pagecache_ctl *defragCtl;
+
+ int stop;
+ int result;
+ int stoppedCompoundFound;
+
+ /** Callback method used to obtain next page. */
+ defrag_generic_get_page *getNextPage;
+};
+
+static const struct defrag_pagecache_ctl defaultDefragCtl = {
+ .fillPages = 0,
+ .requireFillPages = 0,
+ .force = 0
+};
+
+#define HUGEPAGE_ALLOC_GFP (GFP_HIGHUSER | __GFP_COMP \
+ | __GFP_REPEAT | __GFP_NOWARN | __GFP_WAIT)
+
+static int defrageOneHugePage(struct file *file, loff_t offset,
+ struct page **pagep,
+ const struct defrag_pagecache_ctl *defragCtl,
+ defrag_generic_get_page *getPage);
+
+int defragPageCache(struct file *f, unsigned long offset, unsigned long size,
+ const struct defrag_pagecache_ctl *defragCtl)
+{
+ /* Calculate requested huge page order.
+ * XXX Is the calculation below multiplatform?
+ */
+ const int hugePageOrder = (PMD_SHIFT - PAGE_SHIFT);
+ const int chunkSize = 1 << hugePageOrder;
+ unsigned long offsetIdx = offset;
+ unsigned long chunksToProceed;
+
+ struct inode *inode = f->f_path.dentry->d_inode;
+
+ const struct address_space_operations *aops =
+ inode->i_mapping->a_ops;
+
+ /* TODO: Use hugepage state or something better instead of hardcoded...
+ * value. */
+ if ((offset != ((offset >> hugePageOrder) << hugePageOrder) ||
+ size != ((size >> hugePageOrder) << hugePageOrder))
+ /* && (size != (1 << hugePageOrder))*/) {
+ /* Start and length must be huge page "aligned". */
+ return -EINVAL;
+ }
+
+ offsetIdx = offset;
+ chunksToProceed = size >> hugePageOrder;
+ for (; chunksToProceed; chunksToProceed--, offsetIdx += chunkSize) {
+ struct page *pagep;
+ int result = aops->defragpage(f, inode->i_mapping, offsetIdx,
+ &pagep,
+ defragCtl);
+ if (result)
+ return result;
+ }
+
+ return 0;
+}
+
+/** Callback for getting a page for tmpfs.
+ * Tmpfs uses the {@link shmem_read_mapping_page_gfp} function to read
+ * a page from the page cache.
+ */
+struct page *shmem_defrag_get_page(const struct defrag_pagecache_ctl *ctl,
+ struct inode *inode, pgoff_t pageIndex)
+{
+
+ return shmem_read_mapping_page_gfp(
+ inode->i_mapping, pageIndex,
+ mapping_gfp_mask(inode->i_mapping));
+}
+
+static void defrag_generic_mig_result(struct page *oldPage,
+ struct page *newPage, struct migration_ctl *ctl, int result)
+{
+ struct migration_private *prv =
+ (struct migration_private *) ctl->privateData;
+
+ if (!result) {
+ /* Update index only on success; on fail, index will be used to
+ * clean up. */
+ prv->nextIndex++;
+
+ if (!PageTail(newPage))
+ putback_lru_page(newPage);
+ else
+ put_page(newPage);
+ } else {
+ prv->stop = 1;
+ }
+
+ /* XXX No isolated zone status update! */
+ putback_lru_page(oldPage);
+ put_page(oldPage);
+/*
+ unlock_page(oldPage);
+*/
+
+ prv->result = result;
+}
+
+static struct page *defrag_generic_mig_page_new(struct page *oldPage,
+ struct migration_ctl *ctl)
+{
+ struct migration_private *prv =
+ (struct migration_private *) ctl->privateData;
+
+ return prv->hugePage + prv->nextIndex;
+}
+
+static struct page *defrag_generic_mig_page_next(struct migration_ctl *ctl,
+ page_mode *mode)
+{
+ struct migration_private *prv =
+ (struct migration_private *) ctl->privateData;
+ const struct defrag_pagecache_ctl *defragCtl;
+
+ /** Holds the current page cache page we are going to migrate. */
+ struct page *filePage;
+
+ struct inode *inode;
+
+ pgoff_t pageIndex;
+
+ if (!(prv->nextIndex < prv->pagesToMigrateCount))
+ return NULL;
+
+ if (prv->result || prv->stop)
+ return NULL;
+
+ inode = prv->inode;
+ pageIndex = prv->startIndex + prv->nextIndex;
+ defragCtl = prv->defragCtl;
+
+repeat_find:
+ filePage = find_lock_page(inode->i_mapping, pageIndex);
+
+ if (filePage)
+ if (PageUptodate(filePage))
+ goto skip_fill_pages;
+
+ /* Try to read the page up to date, if this was the caller's intention;
+ * we don't need to check if the page is under writeback, migrate_pages does it. */
+ if (!defragCtl->fillPages) {
+ prv->result = 0;
+ prv->stop = 1;
+ return NULL;
+ }
+
+ filePage = prv->getNextPage(prv->defragCtl, inode, pageIndex);
+
+ if (IS_ERR(filePage)) {
+ prv->result = PTR_ERR(filePage);
+ prv->stop = 1;
+ return NULL;
+ }
+
+ lock_page(filePage);
+ /* Validate page */
+ if (!filePage->mapping
+ || filePage->index != pageIndex
+ || !PageUptodate(filePage)) {
+ unlock_page(filePage);
+ goto repeat_find;
+ }
+
+skip_fill_pages:
+ if (/* ??? !defragCtl->fillPages && */ PageCompound(filePage)) {
+ /* Here I am thinking about supporting the case where the page
+ * cache contains a huge page that is not up to date as a whole.
+ *
+ * Currently this idea is suspended, due to many
+ * complications.
+ */
+ prv->stoppedCompoundFound = 1;
+ goto out_unlock_and_stop;
+ }
+
+ /* Prepare page for isolation, check if it can be isolated. */
+ if (!PageLRU(filePage)) {
+ if (defragCtl->force) {
+ /* Isolation requires the page to be on the LRU; we may need to
+ * drain it if it is not present. */
+ lru_add_drain();
+ if (!PageLRU(filePage)) {
+ lru_add_drain_all();
+ if (!PageLRU(filePage)) {
+ prv->result = -EBUSY;
+ goto out_unlock_and_stop;
+ }
+ }
+ } else {
+ prv->result = -EBUSY;
+ goto out_unlock_and_stop;
+ }
+ }
+
+ /* Isolate pages. */
+ if (isolate_lru_page(filePage)) {
+ prv->result = -EBUSY;
+ goto putback_page_and_stop;
+ }
+
+ *mode = PAGE_LOCKED;
+ return filePage;
+
+putback_page_and_stop:
+ putback_lru_page(filePage);
+
+out_unlock_and_stop:
+ unlock_page(filePage);
+ put_page(filePage);
+
+ return NULL;
+
+}
+
+int defrag_generic_shm(struct file *file, struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ struct defrag_pagecache_ctl *ctl)
+{
+ return defrageOneHugePage(file, pos, pagep, ctl, shmem_defrag_get_page);
+}
+EXPORT_SYMBOL(defrag_generic_shm);
+
+int defrag_generic_pagecache(struct file *file,
+ struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ struct defrag_pagecache_ctl *ctl)
+{
+ /* We do not support generic page cache defragmentation yet. */
+ BUG();
+ return 0;
+}
+/** Internal method for defragmenting one chunk of the page cache.
+ *
+ * This is, in some way, common logic for operating on the page cache.
+ * It is highly probable that this
+ * method will be exposed as "generic" to add support for transparent
+ * huge pages for the page cache.
+ */
+static int defrageOneHugePage(struct file *file, loff_t offset,
+ struct page **pagep,
+ const struct defrag_pagecache_ctl *defragCtl,
+ defrag_generic_get_page *getPage)
+{
+ const int hugePageOrder = (PMD_SHIFT - PAGE_SHIFT);
+
+ /** Huge page we migrate to. */
+ struct page *hugePage;
+
+ /** Private migration data. */
+ struct migration_private migrationPrv;
+
+ struct migration_ctl migration_ctl;
+
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ const int size = 1 << hugePageOrder;
+
+ /** Helpers */
+ pgoff_t i;
+
+ /* Over here we do callback-based migration. */
+ /* READ.
+ *
+ * This code is in the development stage, and the following problems
+ * must be resolved:
+ * - the page is read from the page cache, but the lock is dropped; in
+ * the meantime the page may no longer be up to date, or may be removed
+ * from the page cache. This will be resolved by changing the migrate function.
+ */
+ /* Allocate one huge page. */
+ hugePage = alloc_pages(HUGEPAGE_ALLOC_GFP, hugePageOrder);
+ if (!hugePage)
+ return -ENOMEM;
+
+ migrationPrv.nextIndex = 0;
+ migrationPrv.pagesToMigrateCount = size;
+ migrationPrv.hugePage = hugePage;
+ migrationPrv.stop = 0;
+ migrationPrv.result = 0;
+ migrationPrv.stoppedCompoundFound = 0;
+ migrationPrv.getNextPage = getPage;
+ migrationPrv.startIndex = offset;
+ migrationPrv.inode = inode;
+ migrationPrv.defragCtl =
+ (const struct defrag_pagecache_ctl *) defragCtl;
+ /* Elevate page counts */
+ for (i = 1; i < size; i++) {
+ struct page *p = hugePage + i;
+ /* Elevate page counters. */
+ get_page(p);
+ }
+
+ migration_ctl.getNextPage = defrag_generic_mig_page_next;
+ migration_ctl.getNewPage = defrag_generic_mig_page_new;
+ migration_ctl.notifyResult = defrag_generic_mig_result;
+ migration_ctl.privateData = (unsigned long) &migrationPrv;
+
+ /* Acquire the compound lock. */
+ compound_lock(hugePage);
+
+ /* Migrate pages. Currently page migration will automatically put back
+ * pages, and may fail and repeat; we need an array of pages to match
+ * each subpage. This behaviour isn't good.
+ */
+ migrate_pages_cb(&migration_ctl, true,
+ MIGRATE_SYNC | MIGRATE_SRC_GETTED);
+ if (migrationPrv.nextIndex < migrationPrv.pagesToMigrateCount) {
+ /* XXX Simulate various bugs, at least do it hardcoded. */
+ /* XXX Everything here is a BUG, because splitting still needs to be coded.
+ */
+ if (migrationPrv.stoppedCompoundFound) {
+ /* If any page has been migrated it's a BUG */
+ BUG_ON(migrationPrv.nextIndex);
+ goto compound_unlock_end;
+ }
+ /* Not all pages have been migrated, split the target page. */
+ /* Downgrade counts of tail pages - may cause deadlock. */
+ VM_BUG_ON(1);
+ } else {
+ goto compound_unlock_end;
+ }
+
+compound_unlock_end:
+ compound_unlock(hugePage);
+/*
+ put_page(hugePage);
+*/
+
+ /* All file pages are unlocked, and should be freed. The huge page
+ * should be on the Unevictable list.
+ */
+ return migrationPrv.result;
+}
+
+static int thpFixMappingsRmapWalk(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, void *prvData) {
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+
+ int i;
+
+/*
+ printk(KERN_INFO "Starting address is %lx", addr);
+*/
+ if (vma->vm_flags & VM_NONLINEAR || (addr & ~HPAGE_PMD_MASK)) {
+ /* Skip nonlinear VMAs and unaligned addresses. */
+ return SWAP_AGAIN;
+ }
+
+ /* We will set the pmd only if all tail pages meet the following requirements:
+ * - all pages are up to date
+ * - all pages have the same protection bits
+ * - ???
+ */
+ pgd = pgd_offset(vma->vm_mm, addr);
+ if (!pgd_present(*pgd))
+ return SWAP_AGAIN;
+
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ return SWAP_AGAIN;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return SWAP_AGAIN;
+
+ pte = (pte_t *) pmd;
+ if (pte_huge(*pte))
+ return SWAP_AGAIN;
+
+
+ /*printk(KERN_INFO "Checking head flags"); */
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ /* printk(KERN_INFO "Pte not present."); */
+ pte_unmap(pte);
+ return SWAP_AGAIN;
+ }
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *tail_page;
+
+ addr += PAGE_SIZE;
+
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ /*
+ * printk(KERN_INFO "No %d pte returning.", i);
+ */
+ pte_unmap(pte);
+ return SWAP_AGAIN;
+ }
+
+ tail_page = pte_page(*pte);
+ if (!tail_page) {
+ /* printk(KERN_INFO "Page +%d not present.", i); */
+ goto unmap_out;
+ }
+
+ /* We check the index, however we do not allow nonlinear mapping :)
+ */
+ /* smp_mb(); */
+ int i1 = tail_page->mapping == page->mapping;
+ int i2 = tail_page->index == (page->index + i);
+ if (i1 && i2) {
+ /*
+ printk(KERN_INFO "Page +%d present mappings and"
+ " indices ok", i);
+ */
+ } else {
+ printk(KERN_INFO "Page +%d has good mapping %d, and"
+ " good index %d (%lu, %lu).",
+ i,
+ i1,
+ i2,
+ tail_page->index,
+ page->index);
+ goto unmap_out;
+ }
+ pte_unmap(pte);
+ }
+ pmd_clear(pmd);
+ _pmd = pmd_mkhuge(pmd_modify(*pmd, vma->vm_page_prot));
+
+ set_pmd_at(vma->vm_mm, addr, pmd, _pmd);
+ /* Everything is ok. */
+
+ /* TODO Do not flush all :) */
+ flush_tlb_mm(vma->vm_mm);
+ printk(KERN_INFO "Replaced by pmd");
+ return SWAP_AGAIN;
+unmap_out:
+ pte_unmap(pte);
+
+ return SWAP_AGAIN;
+}
+
+int thpFixMappings(struct page *hugePage)
+{
+ BUG_ON(PageAnon(hugePage));
+ /* lock_page(hugePage); */
+ BUG_ON(!PageTransHuge(hugePage));
+ rmap_walk(hugePage, thpFixMappingsRmapWalk, NULL);
+ /* unlock_page(hugePage); */
+
+ return 0;
+}
--
1.7.3.4
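A sketch of how a filesystem could hook into the interface above (not part of
the patch; the real shmem wiring is done elsewhere in this series and the
myfs_* names are made up). On x86 with 4 KiB pages, PMD_SHIFT - PAGE_SHIFT is
9, so the offset and size passed to defragPageCache() must be multiples of
512 small pages, i.e. one 2 MiB chunk per unit.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/defrag-pagecache.h>

#ifdef CONFIG_HUGEPAGECACHE
/* Thin wrapper: defrag_generic_shm() takes a non-const ctl in this version. */
static int myfs_defragpage(struct file *file, struct address_space *mapping,
			   loff_t pos, struct page **pagep,
			   const struct defrag_pagecache_ctl *ctl)
{
	return defrag_generic_shm(file, mapping, pos, pagep,
				  (struct defrag_pagecache_ctl *) ctl);
}
#endif

static const struct address_space_operations myfs_aops = {
	/* ... the usual readpage/write_begin/write_end hooks ... */
#ifdef CONFIG_HUGEPAGECACHE
	.defragpage	= myfs_defragpage,
#endif
};

/* Ask for the first huge-page-sized chunk of an already opened file. */
static int myfs_defrag_first_chunk(struct file *filp)
{
	const struct defrag_pagecache_ctl ctl = {
		.fillPages		= 1,	/* read missing pages in */
		.requireFillPages	= 1,	/* fail if that is impossible */
		.force			= 1,	/* drain the LRU etc. if needed */
	};

	/* offset and size are expressed in small pages, huge page aligned */
	return defragPageCache(filp, 0, 1 << (PMD_SHIFT - PAGE_SHIFT), &ctl);
}

The control flags mirror what the fault path in a later patch uses:
fillPages, requireFillPages and force are all set when defragmentation is
forced from shmem_fault_huge().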
Patch adds support for mapping a file with MAP_HUGETLB and checks
whether the filesystem supports the huge page cache.
Signed-off-by: Radosław Smogura <[email protected]>
---
mm/mmap.c | 24 +++++++++++++++-
mm/shmem.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 106 insertions(+), 2 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f758c7..19f3016 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -992,6 +992,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ if (flags & MAP_HUGETLB) {
+ vm_flags &= ~VM_NOHUGEPAGE;
+ vm_flags |= VM_HUGEPAGE;
+ printk(KERN_INFO "Set huge page mapping in do_mmap_pgoff.");
+ }
+
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
@@ -1086,11 +1092,25 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB))
- return -EINVAL;
file = fget(fd);
if (!file)
goto out;
+
+ if (unlikely(flags & MAP_HUGETLB)) {
+#ifdef CONFIG_HUGEPAGECACHE
+ if (!(file->f_mapping->a_ops->defragpage)) {
+ fput(file);
+ retval = -EINVAL;
+ goto out;
+ } else {
+ printk(KERN_INFO "Called to mmap huge with"
+ " good fs type.\n");
+ }
+#else
+ fput(file);
+ return -EINVAL;
+#endif
+ }
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 269d049..a834488 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1065,6 +1065,90 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return ret;
}
+static int shmem_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ int error;
+ int ret = VM_FAULT_LOCKED;
+
+ error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
+ if (error)
+ return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
+
+ /* Just a portion of developer code, to force defragmentation, as we have
+ * no external interface to trigger defragmentation (or a daemon to do it).
+ */
+ if ((vma->vm_flags & VM_HUGEPAGE) && !PageCompound(vmf->page)) {
+ /* Force defrag - mainly devo code */
+ int defragResult;
+ const loff_t hugeChunkSize = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ const loff_t vmaSizeToMap = (vma->vm_start
+ + ((vmf->pgoff + vma->vm_pgoff + hugeChunkSize)
+ << PAGE_SHIFT) <= vma->vm_end) ?
+ hugeChunkSize : 0;
+
+ const loff_t inodeSizeToMap =
+ (vmf->pgoff + vma->vm_pgoff + hugeChunkSize <
+ inode->i_size) ? hugeChunkSize : 0;
+
+ const struct defrag_pagecache_ctl defragControl = {
+ .fillPages = 1,
+ .requireFillPages = 1,
+ .force = 1
+ };
+
+ if (ret & VM_FAULT_LOCKED) {
+ unlock_page(vmf->page);
+ }
+ put_page(vmf->page);
+
+ defragResult = defragPageCache(vma->vm_file,
+ vmf->pgoff,
+ min(vmaSizeToMap, min(inodeSizeToMap, hugeChunkSize)),
+ &defragControl);
+ printk(KERN_INFO "Page defragmented with result %d\n",
+ defragResult);
+
+ /* Retake page. */
+ error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+ &ret);
+ if (error) {
+ return ((error == -ENOMEM) ?
+ VM_FAULT_OOM : VM_FAULT_SIGBUS);
+ }
+ }
+
+ /* XXX Page & compound lock ordering please... */
+
+ /* After the standard fault, a reference to the page is held. */
+ if (PageCompound(vmf->page)) {
+ compound_lock(vmf->page);
+ if (!PageHead(vmf->page)) {
+ compound_unlock(vmf->page);
+ goto no_hugepage;
+ }
+ } else {
+ goto no_hugepage;
+ }
+
+ if (!(ret & VM_FAULT_LOCKED))
+ lock_page(vmf->page);
+
+ ret |= VM_FAULT_LOCKED;
+
+ if (ret & VM_FAULT_MAJOR) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ }
+ return ret;
+no_hugepage:
+ if (ret & VM_FAULT_LOCKED)
+ unlock_page(vmf->page);
+ page_cache_release(vmf->page);
+ vmf->page = NULL;
+ return VM_FAULT_NOHUGE;
+}
#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
--
1.7.3.4
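A userspace sketch of the intended use (not part of the patch; the mount
point and the 2 MiB length are assumptions). Before this change MAP_HUGETLB on
a regular file descriptor always returned EINVAL; with it, the request is
accepted when the backing filesystem provides the defragpage hook, and the VMA
is marked VM_HUGEPAGE so the fault path may serve it from the huge page cache.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN (2UL * 1024 * 1024)		/* one PMD-sized chunk on x86 */

int main(void)
{
	int fd = open("/dev/shm/huge-test", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, LEN) < 0) {
		perror("open/ftruncate");
		return 1;
	}

	p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_HUGETLB, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");	/* EINVAL here means no huge page cache */
		return 1;
	}

	memset(p, 0xab, LEN);		/* fault the mapping in */
	munmap(p, LEN);
	close(fd);
	return 0;
}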
Adds some basic vm routines and macros to operate on the huge page
cache, designed for proper faulting of huge pages.
1. __do_fault - made common for huge and small pages.
2. Simple wrappers for huge pages for rmapping.
3. Other changes.
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/defrag-pagecache.h | 18 +--
include/linux/fs.h | 19 +-
include/linux/mm.h | 28 ++
include/linux/mm_types.h | 2 +-
include/linux/rmap.h | 9 +
mm/huge_memory.c | 42 +++
mm/memory.c | 528 +++++++++++++++++++++++++++++++-------
mm/page-writeback.c | 31 +++
mm/rmap.c | 29 ++
9 files changed, 582 insertions(+), 124 deletions(-)
diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
index 46793de..4ca3468 100644
--- a/include/linux/defrag-pagecache.h
+++ b/include/linux/defrag-pagecache.h
@@ -8,7 +8,7 @@
#ifndef DEFRAG_PAGECACHE_H
#define DEFRAG_PAGECACHE_H
-#include <linux/fs.h>
+#include <linux/defrag-pagecache.h>
/* XXX Split this file into two public and protected - comments below
* Protected will contain
@@ -24,22 +24,6 @@ typedef struct page *defrag_generic_get_page(
const struct defrag_pagecache_ctl *ctl, struct inode *inode,
pgoff_t pageIndex);
-/** Passes additional information and controls to page defragmentation. */
-struct defrag_pagecache_ctl {
- /** If yes defragmentation will try to fill page caches. */
- char fillPages:1;
-
- /** If filling of page fails, defragmentation will fail too. Setting
- * this requires {@link #fillPages} will be setted.
- */
- char requireFillPages:1;
-
- /** If yes defragmentation will try to force in many aspects, this may
- * cause, operation to run longer, but with greater probability of
- * success. */
- char force:1;
-};
-
/** Defragments page cache of specified file and migrates it's to huge pages.
*
* @param f
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bfd9122..7288166 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,10 +10,7 @@
#include <linux/ioctl.h>
#include <linux/blk_types.h>
#include <linux/types.h>
-
-#ifdef CONFIG_HUGEPAGECACHE
-#include <linux/defrag-pagecache.h>
-#endif
+#include <linux/defrag-pagecache-base.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -596,6 +593,9 @@ struct address_space_operations {
/* Set a page dirty. Return true if this dirtied it */
int (*set_page_dirty)(struct page *page);
+ /** Same as \a set_page_dirty but for huge page */
+ int (*set_page_dirty_huge)(struct page *page);
+
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
@@ -606,7 +606,6 @@ struct address_space_operations {
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-#ifdef CONFIG_HUGEPAGECACHE
/** Used to defrag (migrate) pages at position {@code pos}
* to huge pages. Having this not {@code NULL} will indicate that
* address space, generally, supports huge pages (a transparent
@@ -616,15 +615,19 @@ struct address_space_operations {
*
* @param pagep on success will be setted to established huge page
*
- * @returns TODO What to return?
- * {@code 0} on success, value less then {@code 0} on error
+ * @returns {@code 0} on success, a value less than {@code 0} on error
*/
int (*defragpage) (struct file *, struct address_space *mapping,
loff_t pos,
struct page **pagep,
const struct defrag_pagecache_ctl *ctl);
-#endif
+ /** Used to split a page; this method may be called under memory
+ * pressure. Actually, you should not split the page.
+ */
+ int (*split_page) (struct file *file, struct address_space *mapping,
+ loff_t pos, struct page *huge_page);
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 72f6a50..27a10c8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,10 +206,19 @@ struct vm_operations_struct {
void (*close)(struct vm_area_struct * area);
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ /** Same as \a fault but should return a huge page instead of a single one.
+ * If the function fails, the caller may try again with fault.
+ */
+ int (*fault_huge)(struct vm_area_struct *vma, struct vm_fault *vmf);
+
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ /** Same as \a page_mkwrite, but for huge page. */
+ int (*page_mkwrite_huge)(struct vm_area_struct *vma,
+ struct vm_fault *vmf);
+
/* called by access_process_vm when get_user_pages() fails, typically
* for use by special VMAs that can switch between memory and hardware
*/
@@ -534,6 +543,16 @@ static inline void get_page(struct page *page)
}
}
+/** Bumps the tail pages' usage count. If there is at least one page that does
+ * not have a valid mapping, the page count is left untouched.
+ */
+extern void get_page_tails_for_fmap(struct page *head);
+
+/** Decreases the tail pages' usage count.
+ * This function assumes the compound page is held or frozen.
+ */
+extern void put_page_tails_for_fmap(struct page *head);
+
static inline void get_huge_page_tail(struct page *page)
{
/*
@@ -996,6 +1015,7 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
+#define VM_FAULT_NOHUGE 0x0800 /* ->fault_huge, no huge page available .*/
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
@@ -1161,6 +1181,14 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
void account_page_dirtied(struct page *page, struct address_space *mapping);
void account_page_writeback(struct page *page);
int set_page_dirty(struct page *page);
+
+/** Sets a huge page dirty; this will lock all tails, the head should be locked.
+ * The compound page should be held or frozen. Skips all pages that have no mapping.
+ *
+ * @param head
+ * @return number of successful set_page_dirty calls
+ */
+int set_page_dirty_huge(struct page *page);
int set_page_dirty_lock(struct page *page);
int clear_page_dirty_for_io(struct page *page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7649722..7d2c09d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -296,7 +296,7 @@ struct vm_area_struct {
/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;
-
+
/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units, *not* PAGE_CACHE_SIZE */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1cdd62a..bc547cb 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -142,8 +142,17 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
+
+/** Adds rmap for a huge page; the compound page must be held or frozen.
+ */
+extern void page_add_file_rmap_huge(struct page *head);
+
void page_remove_rmap(struct page *);
+/** Removes rmap for a huge page; the compound page must be held or frozen.
+ */
+void page_remove_rmap_huge(struct page *);
+
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e3b4c38..74d2e84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2455,3 +2455,45 @@ void __vma_adjust_trans_huge(struct vm_area_struct *vma,
split_huge_page_address(next->vm_mm, nstart);
}
}
+
+/** Bumps the tail pages' usage count. This function assumes the compound
+ * page is held or frozen.
+ */
+void get_page_tails_for_fmap(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+ VM_BUG_ON(compound_order(head) < 2);
+
+ get_page(head + 1);
+ /* We may use __first_page, because we hold the compound page as a whole. */
+ for (page = head + 2; page->__first_page == head; page++) {
+ VM_BUG_ON(!atomic_read(&page->_count));
+ VM_BUG_ON(!page->mapping);
+ VM_BUG_ON(!PageTail(page));
+ get_page(page);
+ }
+}
+
+/** Decreases the tail pages' usage count.
+ * This function assumes the compound page is held or frozen.
+ */
+void put_page_tails_for_fmap(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+ VM_BUG_ON(compound_order(head) < 2);
+
+ put_page(head + 1);
+ /* We may use __first_page, because we hold the compound page as a whole. */
+ for (page = head + 2; page->__first_page == head; page++) {
+ VM_BUG_ON(!atomic_read(&page->_count));
+ VM_BUG_ON(!page->mapping);
+ VM_BUG_ON(!PageTail(page));
+ put_page(page);
+ }
+}
diff --git a/mm/memory.c b/mm/memory.c
index a0ab73c..7427c9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3148,7 +3148,137 @@ oom:
return VM_FAULT_OOM;
}
-/*
+/** Level 0 check whether it is possible to establish a huge pmd in the
+ * process address space.
+ */
+static int check_if_hugemapping_is_possible0(
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pgoff_t pgoff,
+ pmd_t pmdVal /* Keep pmd for THP for Private Mapping. */)
+{
+ if (vma->vm_ops) {
+ /* This is the base check. */
+ if (!vma->vm_ops->fault_huge)
+ return 0;
+ } else {
+ return 0;
+ }
+
+ if (vma->vm_flags & VM_SHARED && !(vma->vm_flags & VM_NONLINEAR)) {
+ /* Check if VMA address is pmd aligned */
+ if ((address & ~PMD_MASK) != 0)
+ return 0;
+
+ /* Check if pgoff is huge page aligned */
+ /* XXX This should be exported as it's reused in defrag. */
+ if ((pgoff & ((1 << (PMD_SHIFT - PAGE_SHIFT)) - 1)) != 0)
+ return 0;
+
+ /* Check if the huge pmd will fit inside the VMA.
+ * pmd_addr_end returns the first byte after the end, not the last byte!
+ */
+ if (!(pmd_addr_end(address, (unsigned long) -1) <= vma->vm_end))
+ return 0;
+
+ /* WIP [Private THP], check if the pmd is marked as "do not make THP",
+ * e.g. because it has COWs (COWs give milk).
+ * We need to add such a flag because
+ */
+
+ /* Check if the file has enough length - not needed if there is a
+ * huge page in the page cache; this implies the file has enough length.
+ * TODO Think on the above. If true, make it a requirement for THP
+ * support in the page cache (put it in the documentation).
+ * This may break the concept that the page cache may hold a huge
+ * page that is not fully up to date, too.
+ */
+ } else {
+ /* Anonymous VMA - not coded yet. */
+ return 0;
+ }
+
+ /* All tests passed */
+ printk(KERN_INFO "Chk - All passed");
+ return 1;
+}
+
+
+/** Common function for performing the fault checks, with support for huge
+ * pages. This method is designed to be wrapped by facades.
+ *
+ * TODO Still need to consider the locking order, to prevent deadlocks...
+ * it looks like compound_lock -> page_lock will be better.
+ *
+ * @param page loaded head page, locked iff compound_lock, reference held
+ *
+ * @return {@code 0} on success
+ */
+static /*inline*/ int __huge_lock_check(
+ struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pud_t *pud,
+ pmd_t pmd,
+ pgoff_t pgoff,
+ unsigned int flags,
+ struct page *head)
+{
+ struct page *workPage;
+ unsigned long workAddress;
+ unsigned int processedPages;
+
+ int result = 0;
+
+ VM_BUG_ON(!check_if_hugemapping_is_possible0(vma, address, pgoff,
+ pmd));
+ VM_BUG_ON(atomic_read(&head->_count) <= 2);
+ VM_BUG_ON(!PageHead(head));
+
+ /* TODO [Documentation] expose the rules below, from code.
+ *
+ * XXX Is it possible, with the tests in the loop, to map not-uptodate pages?
+ *
+ * It looks like, with the following design, removing the page uptodate
+ * flag for compound pages may require the compound lock or
+ * something else.
+ */
+
+ /* Check if the tail pages are uptodate; this should not happen,
+ * as we have the compound_lock, but I can't guarantee linear ordering.
+ */
+ processedPages = 0;
+ workAddress = address;
+ /** XXX [Performance] compound_head is rather slow; make a new macro for
+ * when we hold the compound page.
+ */
+ for (workPage = head; compound_head(workPage) == head; workPage++) {
+ if (!PageUptodate(workPage)
+ || !workPage->mapping
+ || (workPage->index - processedPages != pgoff)) {
+ result = -EINVAL;
+ goto exit_processing;
+ }
+ /* We don't check ptes, because we have a shared mapping,
+ * so all ptes should be (or could be, in the future) the same, meaning
+ * mainly protection flags. This check will be required for
+ * private mappings.
+ */
+ processedPages++;
+ workAddress += PAGE_SIZE;
+ }
+ if (processedPages != (1 << (PMD_SHIFT - PAGE_SHIFT))) {
+ /* Not enough processed pages, why? */
+ return processedPages + 1;
+ }
+
+exit_processing:
+ printk(KERN_INFO "Processed %u", processedPages);
+
+ return result;
+}
+
+/**
* __do_fault() tries to create a new page mapping. It aggressively
* tries to share with existing pages, but makes a separate copy if
* the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
@@ -3160,28 +3290,45 @@ oom:
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte neither mapped nor locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * This method shares the same concepts for single and huge pages.
+ *
+ * @param pud pud entry; if NULL the method operates in single page mode,
+ * otherwise it operates in huge page mode.
*/
-static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static inline int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pud_t *pud, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags,
+ pmd_t orig_pmd, pte_t orig_pte)
{
pte_t *page_table;
+ pmd_t *huge_table;
+
+ pte_t entry;
+ pmd_t hentry;
+
spinlock_t *ptl;
struct page *page;
struct page *cow_page;
- pte_t entry;
+
int anon = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
+ const struct vm_operations_struct *vm_ops = vma->vm_ops;
int ret;
int page_mkwrite = 0;
+ VM_BUG_ON((!!pmd) == (!!pud));
+
/*
* If we do COW later, allocate page befor taking lock_page()
* on the file cache page. This will reduce lock holding time.
*/
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-
+ if (pud) {
+ /* Private mapping write not supported yet. */
+ BUG();
+ }
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -3196,14 +3343,20 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else
cow_page = NULL;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.virtual_address = (void __user *)
+ (address & (pud ? HPAGE_MASK : PAGE_MASK));
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
- ret = vma->vm_ops->fault(vma, &vmf);
+ /** XXX Tails should be held too. */
+ if (pud)
+ ret = vm_ops->fault_huge(vma, &vmf);
+ else
+ ret = vm_ops->fault(vma, &vmf);
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
- VM_FAULT_RETRY)))
+ VM_FAULT_RETRY | VM_FAULT_NOHUGE)))
goto uncharge_out;
if (unlikely(PageHWPoison(vmf.page))) {
@@ -3213,21 +3366,36 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto uncharge_out;
}
- /*
- * For consistency in subsequent calls, make the faulted page always
- * locked.
+ /* For consistency in subsequent calls, make the faulted page
+ * always locked.
*/
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf.page);
else
VM_BUG_ON(!PageLocked(vmf.page));
+ page = vmf.page;
+ if (pud) {
+ /* Check consistency of the page, whether it is applicable for huge
+ * mapping.
+ */
+ if (__huge_lock_check(mm, vma, address, pud, orig_pmd, pgoff,
+ flags, vmf.page)) {
+ unlock_page(page);
+ goto unwritable_page;
+ }
+ }
+
/*
* Should we do an early C-O-W break?
*/
- page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
+ if (pud) {
+ /* Private cowing not supported yet for huge. */
+ BUG();
+ }
+
page = cow_page;
anon = 1;
copy_user_highpage(page, vmf.page, address, vma);
@@ -3238,89 +3406,156 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* address space wants to know that the page is about
* to become writable
*/
- if (vma->vm_ops->page_mkwrite) {
+ if ((!pud && vm_ops->page_mkwrite) ||
+ (pud && vm_ops->page_mkwrite_huge)) {
int tmp;
-
unlock_page(page);
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+ tmp = vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ if (pud)
+ BUG();
lock_page(page);
if (!page->mapping) {
ret = 0; /* retry the fault */
- unlock_page(page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(page));
- page_mkwrite = 1;
+ page_mkwrite = 1 << (PMD_SHIFT - PAGE_SHIFT);
}
}
}
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- /*
- * This silly early PAGE_DIRTY setting removes a race
- * due to the bad i386 page protection. But it's valid
- * for other architectures too.
- *
- * Note that if FAULT_FLAG_WRITE is set, we either now have
- * an exclusive copy of the page, or this is a shared mapping,
- * so we can make it writable and dirty to avoid having to
- * handle that later.
+ /* The following if is almost the same for the pud and non-pud cases; just
+ * the specific methods change. Keep it synchronized as far as possible.
*/
- /* Only go through if we didn't race with anybody else... */
- if (likely(pte_same(*page_table, orig_pte))) {
- flush_icache_page(vma, page);
- entry = mk_pte(page, vma->vm_page_prot);
- if (flags & FAULT_FLAG_WRITE)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (anon) {
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- } else {
- inc_mm_counter_fast(mm, MM_FILEPAGES);
- page_add_file_rmap(page);
+ if (pud) {
+ huge_table = pmd_offset(pud, address);
+ /* During pte allocation, pte_alloc uses the mm's page table lock;
+ * it is not the best solution, but we reuse it here.
+ */
+ ptl = &mm->page_table_lock;
+ spin_lock(ptl);
+ if (likely(pmd_same(*huge_table, orig_pmd))) {
+ flush_icache_page(vma, page);/* TODO Arch specific? */
+ hentry = mk_pmd(page, vma->vm_page_prot);
+ hentry = pmd_mkhuge(hentry);
+
if (flags & FAULT_FLAG_WRITE) {
- dirty_page = page;
- get_page(dirty_page);
+ hentry = pmd_mkdirty(hentry);
+ /* TODO make it pmd_maybe_mkwrite*/
+ if (likely(vma->vm_flags & VM_WRITE))
+ hentry = pmd_mkwrite(hentry);
}
- }
- set_pte_at(mm, address, page_table, entry);
+ if (anon) {
+ BUG();
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ /* TODO Inc of huge pages counter...*/
+ add_mm_counter_fast(mm, MM_FILEPAGES,
+ HPAGE_PMD_NR);
+ page_add_file_rmap_huge(page);
+ if (flags & FAULT_FLAG_WRITE) {
+ dirty_page = page;
+ get_page(dirty_page);
+ get_page_tails_for_fmap(dirty_page);
+ }
+ }
+ set_pmd_at(mm, address, huge_table, hentry);
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, address, page_table);
+ /* no need to invalidate: a not-present page won't be
+ * cached */
+ update_mmu_cache(vma, address, (pte_t *) huge_table);
+ } else {
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
+ if (anon)
+ page_cache_release(page);
+ else
+ anon = 1; /* no anon but release faulted_page */
+ }
+ spin_unlock(ptl);
} else {
- if (cow_page)
- mem_cgroup_uncharge_page(cow_page);
- if (anon)
- page_cache_release(page);
- else
- anon = 1; /* no anon but release faulted_page */
- }
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ /*
+ * This silly early PAGE_DIRTY setting removes a race
+ * due to the bad i386 page protection. But it's valid
+ * for other architectures too.
+ *
+ * Note that if FAULT_FLAG_WRITE is set, we either now have
+ * an exclusive copy of the page, or this is a shared mapping,
+ * so we can make it writable and dirty to avoid having to
+ * handle that later.
+ */
+ /* Only go through if we didn't race with anybody else... */
+ if (likely(pte_same(*page_table, orig_pte))) {
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (flags & FAULT_FLAG_WRITE)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (anon) {
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
+ page_add_file_rmap(page);
+ if (flags & FAULT_FLAG_WRITE) {
+ dirty_page = page;
+ get_page(dirty_page);
+ }
+ }
+ set_pte_at(mm, address, page_table, entry);
- pte_unmap_unlock(page_table, ptl);
+ /* no need to invalidate: a not-present page won't be
+ * cached */
+ update_mmu_cache(vma, address, page_table);
+ } else {
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
+ if (anon)
+ page_cache_release(page);
+ else
+ anon = 1; /* no anon but release faulted_page */
+ }
+ pte_unmap_unlock(page_table, ptl);
+ }
if (dirty_page) {
struct address_space *mapping = page->mapping;
- if (set_page_dirty(dirty_page))
- page_mkwrite = 1;
- unlock_page(dirty_page);
+ if (pud) {
+ int dirtied;
+ dirtied = set_page_dirty_huge(dirty_page);
+ unlock_page(dirty_page);
+ if (dirtied)
+ page_mkwrite = dirtied;
+ } else {
+ if (set_page_dirty(dirty_page))
+ page_mkwrite = 1;
+ unlock_page(dirty_page);
+ }
+
+ if (pud) {
+ put_page_tails_for_fmap(dirty_page);
+ compound_put(page);
+ }
+
put_page(dirty_page);
if (page_mkwrite && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
- balance_dirty_pages_ratelimited(mapping);
+ balance_dirty_pages_ratelimited_nr(mapping,
+ page_mkwrite);
}
/* file_update_time outside page_lock */
@@ -3328,6 +3563,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
file_update_time(vma->vm_file);
} else {
unlock_page(vmf.page);
+ if (pud)
+ compound_put(page);
if (anon)
page_cache_release(vmf.page);
}
@@ -3335,6 +3572,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
unwritable_page:
+ if (pud) {
+ compound_put(page);
+ put_page_tails_for_fmap(page);
+ }
page_cache_release(page);
return ret;
uncharge_out:
@@ -3346,6 +3587,33 @@ uncharge_out:
return ret;
}
+/** Facade for {@link __do_fault} to fault "huge" pages.
+ * GCC will strip the unneeded code based on the parameters passed.
+ */
+static int __do_fault_huge(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pud_t *pud,
+ pgoff_t pgoff, unsigned int flags,
+ pmd_t orig_pmd)
+{
+ pte_t pte_any;
+ return __do_fault(
+ mm, vma, address, pud, NULL, pgoff, flags, orig_pmd, pte_any);
+}
+
+/** Facade for {@link __do_fault} to fault "normal", pte level pages.
+ * GCC will strip the unneeded code based on the parameters passed.
+ */
+static int __do_fault_normal(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ pmd_t pmd_any;
+ return __do_fault(
+ mm, vma, address, NULL, pmd, pgoff, flags, pmd_any, orig_pte);
+}
+
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
@@ -3354,7 +3622,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
@@ -3386,7 +3654,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
pgoff = pte_to_pgoff(orig_pte);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
@@ -3455,6 +3723,105 @@ unlock:
return 0;
}
+/** Handles a fault at the pmd level. */
+int handle_pmd_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pud_t *pud, pmd_t *pmd, unsigned int flags)
+{
+ pte_t *pte;
+ pgoff_t pgoff;
+ pmd_t pmdVal;
+ int faultResult;
+
+ if (!vma->vm_file) {
+ /* Anonymous THP handling */
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ if (!vma->vm_ops) {
+ return do_huge_pmd_anonymous_page(mm, vma,
+ address, pmd, flags);
+ }
+ } else {
+ pmd_t orig_pmd = *pmd;
+ barrier();
+ if (pmd_trans_huge(orig_pmd)) {
+ if (flags & FAULT_FLAG_WRITE &&
+ !pmd_write(orig_pmd) &&
+ !pmd_trans_splitting(orig_pmd))
+ return do_huge_pmd_wp_page(mm, vma,
+ address, pmd, orig_pmd);
+ return 0;
+ }
+ goto handle_pte_level;
+ }
+ }
+ /***************************
+ * Page cache THP handling *
+ ***************************/
+ pmdVal = *pmd;
+ if (pmd_present(pmdVal) && !pmd_trans_huge(pmdVal))
+ goto handle_pte_level;
+
+ if ((address & HPAGE_MASK) < vma->vm_start)
+ goto handle_pte_level;
+
+ /* Even if possible, we currently support this only for SHARED VMAs.
+ *
+ * We support this only for shmem fs, but everyone is encouraged
+ * to add a few simple methods and test it for other filesystems.
+ * Notes, warnings etc. are always welcome.
+ */
+ if (!(vma->vm_flags & VM_SHARED))
+ goto handle_pte_level;
+
+ /* Handle fault of possible vma with huge page. */
+ pgoff = (((address & HPAGE_MASK) - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff;
+
+ if (!pmd_present(pmdVal)) {
+ /* No page at all. */
+ if (!check_if_hugemapping_is_possible0(vma, address, pgoff,
+ pmdVal))
+ goto handle_pte_level;
+ } else {
+ /* TODO Jump to make page writable. If not for regular
+ * filesystems, full fault path will be reused.
+ */
+ }
+
+ faultResult = __do_fault_huge(mm, vma, address, pud, pgoff, flags,
+ pmdVal);
+ if (!(faultResult & (VM_FAULT_ERROR | VM_FAULT_NOHUGE))) {
+ printk(KERN_INFO "Set huge pmd");
+ return faultResult;
+ }
+
+handle_pte_level:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+ return VM_FAULT_OOM;
+ /* Page cache THP uses mm->page_table_lock to check if the pmd is still
+ * none just before setting the new huge pmd; if __pte_alloc succeeded
+ * then the pmd may be huge or "normal" with a pte page.
+ *
+ * If a huge pmd materialized from under us, just retry later. */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*/
@@ -3464,7 +3831,6 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte;
__set_current_state(TASK_RUNNING);
@@ -3484,42 +3850,8 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- if (!vma->vm_ops)
- return do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
- } else {
- pmd_t orig_pmd = *pmd;
- barrier();
- if (pmd_trans_huge(orig_pmd)) {
- if (flags & FAULT_FLAG_WRITE &&
- !pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd))
- return do_huge_pmd_wp_page(mm, vma, address,
- pmd, orig_pmd);
- return 0;
- }
- }
- /*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
- * run pte_offset_map on the pmd, if an huge pmd could
- * materialize from under us from a different thread.
- */
- if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
- return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd)))
- return 0;
- /*
- * A regular pmd is established and it can't morph into a huge pmd
- * from under us anymore at this point because we hold the mmap_sem
- * read mode and khugepaged takes it in write mode. So now it's
- * safe to run pte_offset_map().
- */
- pte = pte_offset_map(pmd, address);
-
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ return handle_pmd_fault(mm, vma, address, pud, pmd, flags);
}
#ifndef __PAGETABLE_PUD_FOLDED
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 363ba70..ff32b5d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2072,6 +2072,37 @@ int set_page_dirty(struct page *page)
}
EXPORT_SYMBOL(set_page_dirty);
+int set_page_dirty_huge(struct page *head)
+{
+ struct page *work;
+ int result = 0;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(!PageLocked(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ if (head->mapping)
+ result += set_page_dirty(head);
+ else
+ BUG_ON(!PageSplitDeque(head));
+
+ for (work = head+1; compound_head(work) == head; work++) {
+ VM_BUG_ON(page_has_private(work));
+ VM_BUG_ON(page_has_buffers(work));
+
+ lock_page(work);
+ if (work->mapping) {
+ result += set_page_dirty(work);
+ } else {
+ /* Bug if there is no mapping and split is not
+ * dequeued.
+ */
+ BUG_ON(!PageSplitDeque(head));
+ }
+ unlock_page(work);
+ }
+ return result;
+}
/*
* set_page_dirty() is racy if the caller has no reference against
* page->mapping->host, and if the page is unlocked. This is because another
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e0..11f54e0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1157,6 +1157,21 @@ void page_add_file_rmap(struct page *page)
}
}
+void page_add_file_rmap_huge(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ page_add_file_rmap(head);
+ page_add_file_rmap(head + 1);
+ if (likely(compound_order(head) > 1)) {
+ for (page = head+2; page->__first_page == head; page++)
+ page_add_file_rmap(page);
+ }
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
@@ -1207,6 +1222,20 @@ void page_remove_rmap(struct page *page)
*/
}
+void page_remove_rmap_huge(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ page_remove_rmap(head);
+ page_remove_rmap(head + 1);
+ if (likely(compound_order(head) > 1)) {
+ for (page = head+2; page->__first_page == head; page++)
+ page_remove_rmap(page);
+ }
+}
/*
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
--
1.7.3.4
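A sketch of the shmem-side wiring these hooks expect (not part of the patch;
the actual hookup lives in mm/shmem.c hunks not shown here, and
shmem_page_mkwrite_huge is purely illustrative). handle_pmd_fault() only
attempts the huge path when the VMA is shared, the address and pgoff are
PMD-aligned and vm_ops->fault_huge is non-NULL; when fault_huge returns
VM_FAULT_NOHUGE the fault falls back to the ordinary pte-level path.

#ifdef CONFIG_SHMEM_HUGEPAGECACHE
static const struct vm_operations_struct shmem_huge_vm_ops = {
	.fault			= shmem_fault,		/* small-page fallback */
	.fault_huge		= shmem_fault_huge,	/* added in the previous patch */
	/* a write fault on a shared huge mapping would use this hook */
	.page_mkwrite_huge	= shmem_page_mkwrite_huge,	/* illustrative only */
#ifdef CONFIG_NUMA
	.set_policy		= shmem_set_policy,
	.get_policy		= shmem_get_policy,
#endif
};
#endif

On the dirty side, the pud branch of __do_fault() calls set_page_dirty_huge()
and then balance_dirty_pages_ratelimited_nr() with the number of dirtied
subpages, so a filesystem that opts in gets its set_page_dirty hook called for
the head and for every tail page that still has a mapping.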
On 02/16/2012 06:31 AM, Radosław Smogura wrote:
> From: [email protected] <[email protected]>
>
> This patch gives ability for add some "-fno-..." options for GCC
> and to force -O1 optimization. Supporting files, like Kconfig, Makefile
> are auto-generated due to large amount of available options.
>
> Patch helps to debug kernel.
Note: I only see patches 1-10.
I fix a few typos below (comments in the generated files only).
> ---
> Makefile | 11 ++++
> lib/Kconfig.debug | 2 +
> lib/Kconfig.debug.optim | 102 ++++++++++++++++++++++++++++++++++++
> scripts/Makefile.optim.inc | 23 ++++++++
> scripts/debug/make_config_optim.sh | 88 +++++++++++++++++++++++++++++++
> 5 files changed, 226 insertions(+), 0 deletions(-)
> create mode 100644 lib/Kconfig.debug.optim
> create mode 100644 scripts/Makefile.optim.inc
> create mode 100644 scripts/debug/make_config_optim.sh
>
> diff --git a/Makefile b/Makefile
> index 7c44b67..bc9a961 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -558,12 +558,23 @@ endif # $(dot-config)
> # Defaults to vmlinux, but the arch makefile usually adds further targets
> all: vmlinux
>
> +ifdef CONFIG_HACK_OPTIM_FORCE_O1_LEVEL
> +KBUILD_CFLAGS += -O1
> +else
> +
> ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
> KBUILD_CFLAGS += -Os
> else
> KBUILD_CFLAGS += -O2
> endif
>
> +endif
> +
> +# Include makefile for optimization override
> +ifdef CONFIG_HACK_OPTIM
> +include $(srctree)/scripts/Makefile.optim.inc
> +endif
> +
> include $(srctree)/arch/$(SRCARCH)/Makefile
>
> ifneq ($(CONFIG_FRAME_WARN),0)
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 8745ac7..928265e 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1274,5 +1274,7 @@ source "lib/Kconfig.kgdb"
>
> source "lib/Kconfig.kmemcheck"
>
> +source "lib/Kconfig.debug.optim"
> +
> config TEST_KSTRTOX
> tristate "Test kstrto*() family of functions at runtime"
> diff --git a/lib/Kconfig.debug.optim b/lib/Kconfig.debug.optim
> new file mode 100644
> index 0000000..09b1012
> --- /dev/null
> +++ b/lib/Kconfig.debug.optim
> @@ -0,0 +1,102 @@
> +# This file was auto generated. It's utility configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^
That sentence(?) (fragment?) is not helpful. I would drop it.
> +# Distributed under GPL v2 License
> +
> +menuconfig HACK_OPTIM
> + bool "Allows to override GCC optimization"
Allows the user to override GCC optimization
> + depends on DEBUG_KERNEL && EXPERIMENTAL
> + help
> + If you say Y here you will be able to override
> + how GCC optimize kernel code. This will create
optimizes
> + more debug friendly, but with not guarentee
friendly code, but will not guarantee
> + about same runi, like production, kernel.
the same running, like a production kernel.
> +
> + If you say Y here probably You will want say
you will want to say Y
> + for all suboptions
for all suboptions.
> +
> +if HACK_OPTIM
> +
> +config HACK_OPTIM_FORCE_O1_LEVEL
> + bool "Forces -O1 optimization level"
> + ---help---
> + This will change how GCC optimize code. Code
optimizes
> + may be slower and larger but will be more debug
> + "friendly".
> +
> + In some cases there is low chance that kernel
> + will run different then normal, reporting or not
will run differently than normal,
> + some bugs or errors. Refere to GCC manual for
Refer
> + more details.
> +
> + You SHOULD say N here.
> +
> +config HACK_OPTIM__fno_inline_functions_called_once
> + bool "Adds -fno-inline-functions-called-once parameter to gcc invoke line."
> + ---help---
> + This will change how GCC optimize code. Code
optimizes
> + may be slower and larger but will be more debug
> + "friendly".
> +
> + In some cases there is low chance that kernel
> + will run different then normal, reporting or not
will run differently than normal,
> + some bugs or errors. Refere to GCC manual for
Refer
> + more details.
> +
> + You SHOULD say N here.
> +
> +config HACK_OPTIM__fno_combine_stack_adjustments
> + bool "Adds -fno-combine-stack-adjustments parameter to gcc invoke line."
> + ---help---
> + This will change how GCC optimize code. Code
optimizes
> + may be slower and larger but will be more debug
> + "friendly".
> +
> + In some cases there is low chance that kernel
> + will run different then normal, reporting or not
will run differently than normal,
> + some bugs or errors. Refere to GCC manual for
Refer
> + more details.
> +
> + You SHOULD say N here.
> +
> +config HACK_OPTIM__fno_tree_dce
> + bool "Adds -fno-tree-dce parameter to gcc invoke line."
> + ---help---
> + This will change how GCC optimize code. Code
optimizes
> + may be slower and larger but will be more debug
> + "friendly".
> +
> + In some cases there is low chance that kernel
> + will run different then normal, reporting or not
will run differently than normal,
> + some bugs or errors. Refere to GCC manual for
Refer
> + more details.
> +
> + You SHOULD say N here.
> +
> +config HACK_OPTIM__fno_tree_dominator_opts
> + bool "Adds -fno-tree-dominator-opts parameter to gcc invoke line."
> + ---help---
> + This will change how GCC optimize code. Code
same ...
> + may be slower and larger but will be more debug
> + "friendly".
> +
> + In some cases there is low chance that kernel
> + will run different then normal, reporting or not
> + some bugs or errors. Refere to GCC manual for
> + more details.
> +
> + You SHOULD say N here.
> +
> +config HACK_OPTIM__fno_dse
> + bool "Adds -fno-dse parameter to gcc invoke line."
> + ---help---
> + This will change how GCC optimize code. Code
same ...
> + may be slower and larger but will be more debug
> + "friendly".
> +
> + In some cases there is low chance that kernel
> + will run different then normal, reporting or not
> + some bugs or errors. Refere to GCC manual for
> + more details.
> +
> + You SHOULD say N here.
> +
> +endif #HACK_OPTIM
> diff --git a/scripts/Makefile.optim.inc b/scripts/Makefile.optim.inc
> new file mode 100644
> index 0000000..e78cc92
> --- /dev/null
> +++ b/scripts/Makefile.optim.inc
> @@ -0,0 +1,23 @@
> +# This file was auto generated. It's utility configuration
> +# Distributed under GPL v2 License
> +
> +ifdef CONFIG_HACK_OPTIM__fno_inline_functions_called_once
> + KBUILD_CFLAGS += -fno-inline-functions-called-once
> +endif
> +
> +ifdef CONFIG_HACK_OPTIM__fno_combine_stack_adjustments
> + KBUILD_CFLAGS += -fno-combine-stack-adjustments
> +endif
> +
> +ifdef CONFIG_HACK_OPTIM__fno_tree_dce
> + KBUILD_CFLAGS += -fno-tree-dce
> +endif
> +
> +ifdef CONFIG_HACK_OPTIM__fno_tree_dominator_opts
> + KBUILD_CFLAGS += -fno-tree-dominator-opts
> +endif
> +
> +ifdef CONFIG_HACK_OPTIM__fno_dse
> + KBUILD_CFLAGS += -fno-dse
> +endif
> +
> diff --git a/scripts/debug/make_config_optim.sh b/scripts/debug/make_config_optim.sh
> new file mode 100644
> index 0000000..26865923
> --- /dev/null
> +++ b/scripts/debug/make_config_optim.sh
> @@ -0,0 +1,88 @@
> +#!/bin/sh
> +
> +## Utility script for generating optimization override options
> +## for kernel compilation.
> +##
> +## Distributed under GPL v2 license
> +## (c) Radosław Smogura, 2011
> +
> +# Prefix added for variable
> +CFG_PREFIX="HACK_OPTIM"
> +
> +KCFG="Kconfig.debug.optim"
> +MKFI="Makefile.optim.inc"
> +
> +OPTIMIZATIONS_PARAMS="-fno-inline-functions-called-once \
> + -fno-combine-stack-adjustments \
> + -fno-tree-dce \
> + -fno-tree-dominator-opts \
> + -fno-dse "
> +
> +echo "# This file was auto generated. It's utility configuration" > $KCFG
> +echo "# Distributed under GPL v2 License" >> $KCFG
> +echo >> $KCFG
> +echo "menuconfig ${CFG_PREFIX}" >> $KCFG
> +echo -e "\tbool \"Allows to override GCC optimization\"" >> $KCFG
> +echo -e "\tdepends on DEBUG_KERNEL && EXPERIMENTAL" >> $KCFG
> +echo -e "\thelp" >> $KCFG
> +echo -e "\t If you say Y here you will be able to override" >> $KCFG
> +echo -e "\t how GCC optimize kernel code. This will create" >> $KCFG
> +echo -e "\t more debug friendly, but with not guarentee" >> $KCFG
> +echo -e "\t about same runi, like production, kernel." >> $KCFG
> +echo >> $KCFG
> +echo -e "\t If you say Y here probably You will want say" >> $KCFG
> +echo -e "\t for all suboptions" >> $KCFG
> +echo >> $KCFG
> +echo "if ${CFG_PREFIX}" >> $KCFG
> +echo >> $KCFG
> +
> +echo "# This file was auto generated. It's utility configuration" > $MKFI
> +echo "# Distributed under GPL v2 License" >> $MKFI
> +echo >> $MKFI
> +
> +# Insert standard override optimization level
> +# This is exception, and this value will not be included
> +# in auto generated makefile. Support for this value
> +# is hard coded in main Makefile.
> +echo -e "config ${CFG_PREFIX}_FORCE_O1_LEVEL" >> $KCFG
> +echo -e "\tbool \"Forces -O1 optimization level\"" >> $KCFG
> +echo -e "\t---help---" >> $KCFG
> +echo -e "\t This will change how GCC optimize code. Code" >> $KCFG
> +echo -e "\t may be slower and larger but will be more debug" >> $KCFG
> +echo -e "\t \"friendly\"." >> $KCFG
> +echo >> $KCFG
> +echo -e "\t In some cases there is low chance that kernel" >> $KCFG
> +echo -e "\t will run different then normal, reporting or not" >> $KCFG
> +echo -e "\t some bugs or errors. Refere to GCC manual for" >> $KCFG
> +echo -e "\t more details." >> $KCFG
> +echo >> $KCFG
> +echo -e "\t You SHOULD say N here." >> $KCFG
> +echo >> $KCFG
> +
> +for o in $OPTIMIZATIONS_PARAMS ; do
> + cfg_o="${CFG_PREFIX}_${o//-/_}";
> + echo "Processing param ${o} config variable will be $cfg_o";
> +
> + # Generate kconfig entry
> + echo -e "config ${cfg_o}" >> $KCFG
> + echo -e "\tbool \"Adds $o parameter to gcc invoke line.\"" >> $KCFG
> + echo -e "\t---help---" >> $KCFG
> + echo -e "\t This will change how GCC optimize code. Code" >> $KCFG
> + echo -e "\t may be slower and larger but will be more debug" >> $KCFG
> + echo -e "\t \"friendly\"." >> $KCFG
> + echo >> $KCFG
> + echo -e "\t In some cases there is low chance that kernel" >> $KCFG
> + echo -e "\t will run different then normal, reporting or not" >> $KCFG
> + echo -e "\t some bugs or errors. Refere to GCC manual for" >> $KCFG
> + echo -e "\t more details." >> $KCFG
> + echo >> $KCFG
> + echo -e "\t You SHOULD say N here." >> $KCFG
> + echo >> $KCFG
> +
> + #Generate Make for include
> + echo "ifdef CONFIG_${cfg_o}" >> $MKFI
> + echo -e "\tKBUILD_CFLAGS += $o" >> $MKFI
> + echo "endif" >> $MKFI
> + echo >> $MKFI
> +done;
> +echo "endif #${CFG_PREFIX}" >> $KCFG
--
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
On Thu, 16 Feb 2012 07:44:54 -0800, Randy Dunlap wrote:
> On 02/16/2012 06:31 AM, Radosław Smogura wrote:
>> From: [email protected] <[email protected]>
>>
>> This patch gives ability for add some "-fno-..." options for GCC
>> and to force -O1 optimization. Supporting files, like Kconfig,
>> Makefile
>> are auto-generated due to large amount of available options.
>>
>> Patch helps to debug kernel.
>
> Note: I only see patches 1-10.
>
> I fix a few typos below (comments in the generated files only).
I sent the rest later, when I realized not all of them were sent. It looks like
my mail server stopped after 10 (maybe spam protection...?).
Those have a [WIP] prefix, my bad, but they are currently visible on marc.info
(http://marc.info/?l=linux-mm&r=1&b=201202&w=3).
Oh... and I saw that during rebase I merged two patches into patch 2 -
the refcounting change and moving first_page to the "LRU union" - so there is
no separate info about moving first_page.
Regards,
Radek
I am sending this dummy patch to describe a bit of the work; maybe someone
has additional ideas, concepts and tips. In any case I'm glad I mapped huge
EXT4 and the data was synced to disk.
Some concepts about compounds:
- first_page moved to the lru union to free a place for buffers
- refcounting changed - compound pages are "auto managed";
  page recovering is kept for backward
  compatibility with 2.6 kernels: actually those kernels allowed
  getting a tail page with count 0, but at a glance moving a few
  times around 0 could cause a dangling pointer bug
Compound view.
In contrast to huge pages and THP, file system
compound pages are treated quite loosely; the main difference is that there is
no implication huge page => huge pmd: a huge page may exist and have no
huge mappings at all.
Each page is managed almost like a standalone page, with its own count,
mapcount, dirty bit etc. It can't be added to any LRU nor list, because the
list_head is shared with the compound metadata.
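Purely as an illustration of that last point (a condensed sketch of the tail
walk these patches already use in tlb_remove_page_huge() and
page_add_file_rmap_huge(); it is not new code), a per-page operation is applied
to the head and then to each tail while __first_page still points at the head:

#include <linux/mm.h>

/* Sketch only: assumes the compound is held so it cannot be split under us,
 * as the real helpers in these patches require.
 */
static void for_each_compound_page(struct page *head,
				   void (*op)(struct page *page))
{
	struct page *page;

	op(head);
	op(head + 1);	/* head + 1 is visited explicitly, as in the patches */
	if (compound_order(head) > 1)
		for (page = head + 2; page->__first_page == head; page++)
			op(page);
}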
Read / write locking of compound.
Splitting may be dequeued; this is to prevent deadlocks. "Legacy" code
will probably start with a normal page locked, and then try to lock the
compound; for splitting purposes this may cause deadlocks (actually this
flag was not included in faulting or anywhere else yet, but it should be).
There is still no defragmentation daemon nor anything similar; this
behaviour is forced by MAP_HUGETLB.
Things not done yet:
* kswapd & co. not tested.
* mlock not fixed; the fix will cover get_user_pages & follow_user_pages.
* fork, page_mkclean, mlock not fixed.
* dropping caches = bug.
* migration not checked.
* shmfs - writeback for reclaim should split; simple to do, but ext4
  experiments should go first (syncing).
* no huge COW mapping allowed.
* code not yet cleaned of all the printk...
Signed-off-by: Radosław Smogura <[email protected]>
---
mm/filemap.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index f050209..7174fff 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1783,7 +1783,7 @@ int filemap_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
int ret = VM_FAULT_LOCKED;
error = vma->vm_ops->fault(vma, vmf);
- /* XXX Repeatable flags in __do fault etc. */
+ /* XXX Repeatable flags in __do fault etc. */
if (error & (VM_FAULT_ERROR | VM_FAULT_NOPAGE
| VM_FAULT_RETRY | VM_FAULT_NOHUGE)) {
return error;
--
1.7.3.4
Changes to the VM subsystem allowing zapping and freeing of huge pages,
plus additional functions for removing mappings.
Signed-off-by: Radosław Smogura <[email protected]>
---
include/asm-generic/tlb.h | 21 ++++++
include/linux/huge_mm.h | 13 ++++-
mm/huge_memory.c | 153 ++++++++++++++++++++++++++++++++++++++++++---
mm/memory.c | 39 +++++++-----
4 files changed, 202 insertions(+), 24 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f96a5b5..f7fc543 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -126,6 +126,27 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
tlb_flush_mmu(tlb);
}
+/** The compound page must be held frozen by the caller. */
+static inline void tlb_remove_page_huge(struct mmu_gather *tlb,
+ struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ tlb_remove_page(tlb, head);
+ tlb_remove_page(tlb, head + 1);
+ if (likely(compound_order(head) > 1)) {
+ for (page = head+2; page->__first_page == head; page++) {
+ tlb_remove_page(tlb, page);
+ /* This should not happen; it means we mapped a
+ * dangling page.
+ */
+ BUG_ON(!PageAnon(page) && !page->mapping);
+ }
+ }
+}
/**
* tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
*
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c2407e4..c72a849 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -88,12 +88,21 @@ extern int handle_pte_fault(struct mm_struct *mm,
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page(struct page *page);
extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
+extern void __split_huge_page_pmd_vma(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd);
+
#define split_huge_page_pmd(__mm, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
- if (unlikely(pmd_trans_huge(*____pmd))) \
+ if (unlikely(pmd_trans_huge(*____pmd))) \
__split_huge_page_pmd(__mm, ____pmd); \
} while (0)
+#define split_huge_page_pmd_vma(__vma, __addr, __pmd) \
+ do { \
+ pmd_t *____pmd = (__pmd); \
+ if (unlikely(pmd_trans_huge(*____pmd))) \
+ __split_huge_page_pmd_vma(__vma, __addr, ____pmd);\
+ } while (0)
#define wait_split_huge_page(__anon_vma, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
@@ -160,6 +169,8 @@ static inline int split_huge_page(struct page *page)
}
#define split_huge_page_pmd(__mm, __pmd) \
do { } while (0)
+#define split_huge_page_pmd_vma(__vma, __addr, __pmd) do { } while (0)
+
#define wait_split_huge_page(__anon_vma, __pmd) \
do { } while (0)
#define compound_trans_head(page) compound_head(page)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74d2e84..95c9ce7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -807,6 +807,9 @@ pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
/* FIFO */
pgtable = mm->pmd_huge_pte;
+ if (!pgtable)
+ return NULL;
+
if (list_empty(&pgtable->lru))
mm->pmd_huge_pte = NULL;
else {
@@ -1029,27 +1032,56 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
int ret = 0;
+ pmd_t pmd_val;
+ /* We are going to get the page, but if a split is in progress, the split
+ * may take page_table_lock; we could then wait in compound_get while the
+ * split waits for the page_table_lock we hold here. So... double-check
+ * locking.
+ */
+again:
spin_lock(&tlb->mm->page_table_lock);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
+ pmd_val = *pmd;
+ if (likely(pmd_trans_huge(pmd_val))) {
+ if (unlikely(pmd_trans_splitting(pmd_val))) {
spin_unlock(&tlb->mm->page_table_lock);
wait_split_huge_page(vma->anon_vma,
pmd);
} else {
struct page *page;
pgtable_t pgtable;
+
pgtable = get_pmd_huge_pte(tlb->mm);
page = pmd_page(*pmd);
+ spin_unlock(&tlb->mm->page_table_lock);
+ if (!compound_get(page))
+ return 0;
+ spin_lock(&tlb->mm->page_table_lock);
+ smp_rmb();
+ if (unlikely(!pmd_same(pmd_val, *pmd))) {
+ spin_unlock(&tlb->mm->page_table_lock);
+ compound_put(page);
+ goto again;
+ }
pmd_clear(pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
+ if (PageAnon(page))
+ page_remove_rmap(page);
+ else
+ page_remove_rmap_huge(page);
+
VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, PageAnon(page) ?
+ MM_ANONPAGES : MM_FILEPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
- pte_free(tlb->mm, pgtable);
+ if (PageAnon(page))
+ tlb_remove_page(tlb, page);
+ else
+ tlb_remove_page_huge(tlb, page);
+ if (pgtable)
+ pte_free(tlb->mm, pgtable);
+ compound_put(page);
}
} else
spin_unlock(&tlb->mm->page_table_lock);
@@ -2368,16 +2400,121 @@ static int khugepaged(void *none)
return 0;
}
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+/** Makes an in-place split of a huge pmd into a normal pmd; the new pmd is
+ * filled with ptes equivalent to the huge pmd.
+ *
+ * On success the new page table is populated and flushed.
+ * Only works for file pmds.
+ *
+ * This method copies logic from __pte_alloc.
+ */
+int __inplace_split_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
+{
+ unsigned long addr, end_addr;
+ pmd_t pmdv, pmd_fake;
+ pte_t pte, pte_pmd;
+ pte_t *ptep;
+ pgtable_t new;
+ struct page *page;
+
+ address &= HPAGE_PMD_MASK;
+
+ /* TODO Good place to change locking technique for pmds. */
+repeat:
+ addr = address & HPAGE_PMD_MASK;
+
+ smp_mb();
+ if (pmd_none(*pmd) || !pmd_trans_huge(*pmd))
+ return 0;
+
+ new = pte_alloc_one(mm, addr);
+
+ if (!new)
+ return -ENOMEM;
+ pmdv = *pmd;
+
+ pmd_fake = pmdv;
+ pte_pmd = pte_clrhuge(*((pte_t *) &pmd_fake));
+ pmd_fake = *((pmd_t *) &pte_pmd);
+
+ pmd_populate(mm, &pmd_fake, new);
+
+ page = pmd_page(pmdv);
+ end_addr = pmd_addr_end(addr, 0L);
+ for (; addr < end_addr; addr += PAGE_SIZE, page++) {
+ if (!pmd_present(pmdv))
+ continue;
+ /* Copy protection from pmd. */
+ pte = mk_pte(page, vma->vm_page_prot);
+
+ if (pmd_dirty(pmdv))
+ pte = pte_mkdirty(pte);
+ if (pmd_write(pmdv))
+ pte = pte_mkwrite(pte);
+ if (pmd_exec(pmdv))
+ pte = pte_mkexec(pte);
+ if (pmd_young(pmdv))
+ pte = pte_mkyoung(pte);
+
+ ptep = pte_offset_map(&pmd_fake, addr);
+ set_pte_at(mm, addr, ptep, pte);
+ pte_unmap(ptep);
+ }
+
+ /* Ensure everything is visible before populating pmd. */
+ smp_mb();
+
+ spin_lock(&mm->page_table_lock);
+ if (pmd_same(pmdv, *pmd)) {
+ set_pmd(pmd, pmd_fake);
+ mm->nr_ptes++;
+ new = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* Now we have new tlb, make it visible to all. */
+ flush_tlb_range(vma, address, address + HPAGE_SIZE);
+
+ if (new) {
+ pte_free(mm, new);
+ goto repeat;
+ }
+
+ return 0;
+}
+
+/** Splits huge page for vma. */
+void __split_huge_page_pmd_vma(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
{
struct page *page;
+ int anonPage;
+ /* XXX Inefficient locking for pmd. */
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (!pmd_trans_huge(*pmd)) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return;
+ }
+ page = pmd_page(*pmd);
+ anonPage = PageAnon(page);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ if (anonPage)
+ __split_huge_page_pmd(vma->vm_mm, pmd);
+ else
+ __inplace_split_pmd(vma->vm_mm, vma, address, pmd);
+}
+void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct page *page = pmd_page(*pmd);
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(&mm->page_table_lock);
return;
}
- page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
spin_unlock(&mm->page_table_lock);
diff --git a/mm/memory.c b/mm/memory.c
index 7427c9b..539d1f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,22 +572,28 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next? next->vm_start: ceiling);
- } else {
- /*
- * Optimization: gather nearby vmas into one call down
- */
- while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
- vma = next;
- next = vma->vm_next;
- unlink_anon_vmas(vma);
- unlink_file_vma(vma);
+ if (vma->vm_file) {
+ if (vma->vm_file->f_mapping->a_ops->defragpage)
+ goto free_normal;
}
- free_pgd_range(tlb, addr, vma->vm_end,
+ hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
+ }
+
+free_normal:
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+ && !is_vm_hugetlb_page(next)) {
+ vma = next;
+ next = vma->vm_next;
+ unlink_anon_vmas(vma);
+ unlink_file_vma(vma);
}
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+
vma = next;
}
}
@@ -1248,8 +1254,11 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next-addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
- split_huge_page_pmd(vma->vm_mm, pmd);
+ /* And now we go again in conflict with, THP...
+ * THP requires semaphore, we require compound
+ * frozen, why...?
+ */
+ split_huge_page_pmd_vma(vma, addr, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
/* fall through */
--
1.7.3.4
On Thu, 16 Feb 2012 06:31:28 -0800, Radosław Smogura <[email protected]> wrote:
> Supporting files, like Kconfig, Makefile are auto-generated due to large amount
> of available options.
So why not run the script as part of make rather than store the generated
files in the repository?
> diff --git a/scripts/debug/make_config_optim.sh b/scripts/debug/make_config_optim.sh
> new file mode 100644
> index 0000000..26865923
> --- /dev/null
> +++ b/scripts/debug/make_config_optim.sh
> @@ -0,0 +1,88 @@
> +#!/bin/sh
The below won't run on POSIX-compatible sh. Address my comments
below to fix that.
> +
> +## Utility script for generating optimization override options
> +## for kernel compilation.
> +##
> +## Distributed under GPL v2 license
> +## (c) Radosław Smogura, 2011
> +
> +# Prefix added for variable
> +CFG_PREFIX="HACK_OPTIM"
> +
> +KCFG="Kconfig.debug.optim"
> +MKFI="Makefile.optim.inc"
How about names that mean something?
KCONFIG=...
MAKEFILE=...
> +
> +OPTIMIZATIONS_PARAMS="-fno-inline-functions-called-once \
> + -fno-combine-stack-adjustments \
> + -fno-tree-dce \
> + -fno-tree-dominator-opts \
> + -fno-dse "
Slashes at end of lines are not necessary here.
> +
> +echo "# This file was auto generated. It's utility configuration" > $KCFG
> +echo "# Distributed under GPL v2 License" >> $KCFG
> +echo >> $KCFG
> +echo "menuconfig ${CFG_PREFIX}" >> $KCFG
> +echo -e "\tbool \"Allows to override GCC optimization\"" >> $KCFG
> +echo -e "\tdepends on DEBUG_KERNEL && EXPERIMENTAL" >> $KCFG
> +echo -e "\thelp" >> $KCFG
> +echo -e "\t If you say Y here you will be able to override" >> $KCFG
> +echo -e "\t how GCC optimize kernel code. This will create" >> $KCFG
> +echo -e "\t more debug friendly, but with not guarentee" >> $KCFG
> +echo -e "\t about same runi, like production, kernel." >> $KCFG
> +echo >> $KCFG
> +echo -e "\t If you say Y here probably You will want say" >> $KCFG
> +echo -e "\t for all suboptions" >> $KCFG
> +echo >> $KCFG
> +echo "if ${CFG_PREFIX}" >> $KCFG
> +echo >> $KCFG
Use:
cat > $KCFG <<EOF
...
EOF
through the file (of course, in next runs you'll need to use “>> $KCFG”).
More readable and also “-e” argument to echo is bash-specific.
Alternatively to using “> $KCFG” all the time, you can also do:
exec 3> Kconfig.debug.optim
exec 4> Makefile.optim.inc
at the beginning of the script and later use >&3 and >&4, which will save
you some open/close calls and make the strangely named $KCFG and $MKFI
variables no longer needed.
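For instance, a minimal sketch combining the two ideas (illustrative only;
it reuses CFG_PREFIX and the two generated file names from the patch):

#!/bin/sh
# Sketch: one dedicated fd per generated file, heredocs instead of echo -e.
CFG_PREFIX="HACK_OPTIM"

exec 3> Kconfig.debug.optim
exec 4> Makefile.optim.inc

cat >&3 <<EOF
# This file was auto generated.
# Distributed under GPL v2 License

menuconfig ${CFG_PREFIX}
	bool "Allows the user to override GCC optimization"
	depends on DEBUG_KERNEL && EXPERIMENTAL
EOF

cat >&4 <<EOF
# This file was auto generated.
# Distributed under GPL v2 License
EOF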
> +
> +echo "# This file was auto generated. It's utility configuration" > $MKFI
> +echo "# Distributed under GPL v2 License" >> $MKFI
> +echo >> $MKFI
> +
> +# Insert standard override optimization level
> +# This is exception, and this value will not be included
> +# in auto generated makefile. Support for this value
> +# is hard coded in main Makefile.
> +echo -e "config ${CFG_PREFIX}_FORCE_O1_LEVEL" >> $KCFG
> +echo -e "\tbool \"Forces -O1 optimization level\"" >> $KCFG
> +echo -e "\t---help---" >> $KCFG
> +echo -e "\t This will change how GCC optimize code. Code" >> $KCFG
> +echo -e "\t may be slower and larger but will be more debug" >> $KCFG
> +echo -e "\t \"friendly\"." >> $KCFG
> +echo >> $KCFG
> +echo -e "\t In some cases there is low chance that kernel" >> $KCFG
> +echo -e "\t will run different then normal, reporting or not" >> $KCFG
> +echo -e "\t some bugs or errors. Refere to GCC manual for" >> $KCFG
> +echo -e "\t more details." >> $KCFG
> +echo >> $KCFG
> +echo -e "\t You SHOULD say N here." >> $KCFG
> +echo >> $KCFG
> +
> +for o in $OPTIMIZATIONS_PARAMS ; do
> + cfg_o="${CFG_PREFIX}_${o//-/_}";
cfg_o=${CFG_PREFIX}_$(echo "$o" | tr '[:lower:]-' '[:upper:]_')
> + echo "Processing param ${o} config variable will be $cfg_o";
> +
> + # Generate kconfig entry
> + echo -e "config ${cfg_o}" >> $KCFG
> + echo -e "\tbool \"Adds $o parameter to gcc invoke line.\"" >> $KCFG
> + echo -e "\t---help---" >> $KCFG
> + echo -e "\t This will change how GCC optimize code. Code" >> $KCFG
> + echo -e "\t may be slower and larger but will be more debug" >> $KCFG
> + echo -e "\t \"friendly\"." >> $KCFG
> + echo >> $KCFG
> + echo -e "\t In some cases there is low chance that kernel" >> $KCFG
> + echo -e "\t will run different then normal, reporting or not" >> $KCFG
> + echo -e "\t some bugs or errors. Refere to GCC manual for" >> $KCFG
> + echo -e "\t more details." >> $KCFG
> + echo >> $KCFG
> + echo -e "\t You SHOULD say N here." >> $KCFG
> + echo >> $KCFG
> +
> + #Generate Make for include
> + echo "ifdef CONFIG_${cfg_o}" >> $MKFI
> + echo -e "\tKBUILD_CFLAGS += $o" >> $MKFI
> + echo "endif" >> $MKFI
> + echo >> $MKFI
> +done;
> +echo "endif #${CFG_PREFIX}" >> $KCFG
--
Best regards, _ _
.o. | Liege of Serenely Enlightened Majesty of o' \,=./ `o
..o | Computer Science, Michał “mina86” Nazarewicz (o o)
ooo +----<email/xmpp: [email protected]>--------------ooO--(_)--Ooo--
On Thu, 16 Feb 2012 11:09:18 -0800, Michal Nazarewicz wrote:
> On Thu, 16 Feb 2012 06:31:28 -0800, Radosław Smogura
> <[email protected]> wrote:
>> Supporting files, like Kconfig, Makefile are auto-generated due to
>> large amount
>> of available options.
>
> So why not run the script as part of make rather than store generated
> files in
> repository?
The idea to run this script through make is quite good, and should work,
because the new menu will be generated before "config" starts.
"Bashisms" are indeed unneeded; I will try to replace them with sed.
Macros for operating on a pmd in a similar way as for a pte.
Signed-off-by: Radosław Smogura <[email protected]>
---
arch/x86/include/asm/pgtable.h | 21 +++++++++++++++++++++
1 files changed, 21 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 49afb3f..38fd008 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -265,6 +265,11 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_RW);
}
+static inline int pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY;
+}
+
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DIRTY);
@@ -285,6 +290,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_RW);
}
+static inline pmd_t pmd_writeprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_PRESENT);
@@ -731,6 +741,17 @@ static inline int pmd_write(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_RW;
}
+#define __HAVE_ARCH_PMD_EXEC
+static inline int pmd_exec(pmd_t pmd)
+{
+ return !(pmd_flags(pmd) & _PAGE_NX);
+}
+
+static inline pmd_t pmd_mkexec(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_NX);
+}
+
#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
--
1.7.3.4
This adds support for page splitting. Page splitting should be called
only in special situations (when a contiguous region of a compound page is
about to stop representing the same contiguous region of the mapping, e.g. some
tail pages are going to be removed from the page cache).
We reuse the zap vma path for the split; it's not quite nice, but it is a fast
path and should be corrected.
SHM support for this will be added later.
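Purely as an illustrative sketch (not part of the patch), based on the doc
comments below: a caller on the invalidation side is expected to have the
compound frozen and the page locked, and then act on the three return codes
of split_huge_page_file():

/* Sketch only: 'head' frozen, 'page' locked (lock_page), names as in this
 * patch.
 */
static void split_before_invalidate(struct page *head, struct page *page)
{
	switch (split_huge_page_file(head, page)) {
	case 0:	/* every tail could be locked, split done in place */
		break;
	case 1:	/* not all tails could be locked, split newly dequeued */
		break;
	case 2:	/* a split had already been dequeued earlier */
		break;
	}
}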
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/huge_mm.h | 21 ++++++
include/linux/mm.h | 20 +++++
mm/filemap.c | 14 ++++-
mm/huge_memory.c | 178 +++++++++++++++++++++++++++++++++++++---------
mm/memory.c | 54 ++++++++++-----
mm/truncate.c | 18 +++++-
6 files changed, 251 insertions(+), 54 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c72a849..8e6bfc7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -87,6 +87,23 @@ extern int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page(struct page *page);
+
+/** Splits a huge file-backed page.
+ * @param head the head of the compound page
+ * @param page the page that is going to be invalidated.
+ * @return 0 - split in place, 1 - split newly dequeued, 2 - a split was already dequeued
+ */
+extern int split_huge_page_file(struct page *head, struct page *page);
+
+/** Tries to acquire all possible locks on a compound page. This includes
+ * the normal page locks on the head and on all tails. The function takes
+ * {@code page} as its single parameter; the head must be frozen and
+ * {@code page} must already hold its normal ({@code lock_page}) lock.
+ *
+ * @param page locked page contained in the compound page, may be head or tail
+ */
+extern int compound_try_lock_all(struct page *page);
+
extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
extern void __split_huge_page_pmd_vma(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd);
@@ -167,6 +184,10 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
+static inline int split_huge_page_file(struct page *head, struct page *page)
+{
+ return 0;
+}
#define split_huge_page_pmd(__mm, __pmd) \
do { } while (0)
#define split_huge_page_pmd_vma(__vma, __addr, __pmd) do { } while (0)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 236a6be..4c67555 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -279,6 +279,19 @@ struct inode;
extern int put_compound_head(struct page *head);
extern int put_compound_tail(struct page *page);
+/** Tries to acquire the compound lock.
+ * @return non-zero on success or when {@code CONFIG_TRANSPARENT_HUGEPAGE}
+ * is not enabled, {@code 0} otherwise
+ */
+static inline int compound_trylock(struct page *head)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ return (likely(!test_and_set_bit_lock(PG_compound_lock, &head->flags)));
+#else
+ return 1;
+#endif
+}
+
static inline void compound_lock(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1058,6 +1071,11 @@ struct zap_details {
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
+
+ /* Instead of unmapping the areas, just split them down to pte level.
+ * Used for splitting pages.
+ */
+ int just_split;
};
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -1108,6 +1126,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
+void split_mapping_range(struct address_space *mapping, loff_t const holebegin,
+ loff_t const holelen);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/filemap.c b/mm/filemap.c
index b662757..8363cd9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,7 +124,19 @@ void __delete_from_page_cache(struct page *page)
cleancache_put_page(page);
else
cleancache_flush_page(mapping, page);
-
+#ifdef CONFIG_DEBUG_VM
+ /* This is a really strong assumption, but it may be useful for finding
+ * problems when a page is truncated. We actually allow parts of a huge
+ * page to remain valid in the page cache, but then the page should be
+ * marked, and to mark the page the compound needs to be frozen.
+ * The VM_BUG_ON below will not only catch the bug, it will also show a
+ * nice stack trace of what is wrong.
+ */
+ if (PageCompound(page)) {
+ struct page *head = compound_head(page);
+ VM_BUG_ON(PageCompound(page) && !PageSplitDeque(head));
+ }
+#endif
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95c9ce7..87fb0b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1256,11 +1256,17 @@ static int __split_huge_page_splitting(struct page *page,
return ret;
}
-static void __split_huge_page_refcount(struct page *page)
+static void __split_huge_page_refcount(struct page *page,
+ struct page *keep_locked)
{
int i;
int tail_counter;
struct zone *zone = page_zone(page);
+ int anon_mode = PageAnon(page);
+ const int pages = (1 << compound_order(page));
+
+ VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(compound_order(page) < 2);
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -1270,7 +1276,7 @@ static void __split_huge_page_refcount(struct page *page)
tail_counter = compound_elements(page);
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ for (i = pages - 1; i >= 1; i--) {
struct page *page_tail = page + i;
/* tail_page->_mapcount cannot change */
@@ -1278,8 +1284,10 @@ static void __split_huge_page_refcount(struct page *page)
/*
* tail_page->_count represents actuall number of tail pages
+ * file backed pages have own map count.
*/
- atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+ if (anon_mode)
+ atomic_add(page_mapcount(page) + 1, &page_tail->_count);
/* after clearing PageTail the gup refcount can be released */
smp_mb();
@@ -1290,17 +1298,23 @@ static void __split_huge_page_refcount(struct page *page)
* by the memory-failure.
* retain lock, and compound lock
*/
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP
- | __PG_HWPOISON
- | PG_locked
- | PG_compound_lock;
-
- page_tail->flags |= (page->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate)));
- page_tail->flags |= (1L << PG_dirty);
+ if (anon_mode) {
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP
+ | __PG_HWPOISON
+ | PG_locked
+ | PG_compound_lock;
+
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate)));
+ page_tail->flags |= (1L << PG_dirty);
+ } else {
+ /* Retain all flags excepting tail, head :D */
+ int clearFlags = ~((1L << PG_tail) | (1L << PG_head));
+ page_tail->flags = (page_tail->flags & clearFlags);
+ }
/* clear PageTail before overwriting first_page */
smp_wmb();
@@ -1319,26 +1333,31 @@ static void __split_huge_page_refcount(struct page *page)
* status is achieved setting a reserved bit in the
* pmd, not by clearing the present bit.
*/
- page_tail->_mapcount = page->_mapcount;
+ if (anon_mode) {
+ page_tail->_mapcount = page->_mapcount;
- BUG_ON(page_tail->mapping);
- page_tail->mapping = page->mapping;
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;
- page_tail->index = page->index + i;
-
- BUG_ON(!PageAnon(page_tail));
- BUG_ON(!PageUptodate(page_tail));
- BUG_ON(!PageDirty(page_tail));
- BUG_ON(!PageSwapBacked(page_tail));
+ page_tail->index = page->index + i;
+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+ }
+ page_tail->__first_page = NULL;
lru_add_page_tail(zone, page, page_tail);
}
BUG_ON(atomic_read(&page->_count) <= 0);
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+ if (anon_mode) {
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+ }
ClearPageCompound(page);
+ TestClearPageSplitDeque(page);
compound_unlock(page);
/* Remove additional reference used in compound. */
if (tail_counter)
@@ -1348,17 +1367,25 @@ static void __split_huge_page_refcount(struct page *page)
for (i = 1; i < HPAGE_PMD_NR; i++) {
struct page *page_tail = page + i;
- BUG_ON(page_count(page_tail) <= 0);
- /*
- * Tail pages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- put_page(page_tail);
+ if (anon_mode) {
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ } else {
+ if (page_tail != keep_locked)
+ unlock_page(page_tail);
+ }
}
+ if (!anon_mode && page != keep_locked)
+ unlock_page(page);
+
/*
* Only the head page (now become a regular page) is required
* to be pinned by the caller.
@@ -1473,7 +1500,7 @@ static void __split_huge_page(struct page *page,
mapcount, page_mapcount(page));
BUG_ON(mapcount != page_mapcount(page));
- __split_huge_page_refcount(page);
+ __split_huge_page_refcount(page, NULL);
mapcount2 = 0;
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
@@ -1490,6 +1517,87 @@ static void __split_huge_page(struct page *page,
BUG_ON(mapcount != mapcount2);
}
+int compound_try_lock_all(struct page *page)
+{
+ struct page *head;
+ struct page *p;
+ int processed;
+ int toProcess;
+
+ VM_BUG_ON(!PageLocked(page));
+
+ /* Requirement: the compound must be held, so no split can occur. */
+ head = compound_head(page);
+ VM_BUG_ON(compound_order(head) < 2);
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) != 0);
+
+ toProcess = 1 << compound_order(head);
+
+ /* The first two passes go explicitly, the rest use __first_page to speed up.
+ */
+ if (head != page) {
+ if (!trylock_page(head))
+ return 0;
+ }
+
+ if ((head + 1) != page) {
+ if (!trylock_page(head + 1)) {
+ unlock_page(head);
+ return 0;
+ }
+ }
+
+ processed = 2;
+ /* Lock ordering: page lock, then compound lock */
+ for (p = head + 2; p->__first_page == head; p++, processed++) {
+ if (p != page) {
+ if (!trylock_page(p))
+ break;
+ }
+ }
+ if (processed == toProcess)
+ return 1;
+
+ /** Rollback - reverse order */
+ do {
+ p--;
+ if (p != page)
+ unlock_page(p);
+ if (p == head)
+ return 0;
+ } while (1);
+}
+/** Splits a huge file-backed page.
+ * @param head the head of the compound page
+ * @param page the page that is going to be invalidated.
+ * @return 0 - split in place, 1 - split newly dequeued, 2 - a split was already dequeued
+ */
+int split_huge_page_file(struct page *head, struct page *page)
+{
+ VM_BUG_ON(compound_order(head) < 2);
+ VM_BUG_ON(atomic_read(&compound_head(head)[2]._compound_usage));
+ VM_BUG_ON(PageAnon(head));
+
+ if (PageSplitDeque(head))
+ return 2;
+
+ /* Split all vma's. */
+ split_mapping_range(page_mapping(head),
+ (loff_t)page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE * (1 << compound_order(head)));
+
+ if (compound_try_lock_all(page)) {
+ /* Do in place split. */
+ __split_huge_page_refcount(head, page);
+ return 0;
+ } else {
+ /* We can't lock all tail pages; mark the page as split-dequeued. */
+ if (TestSetPageSplitDeque(head))
+ return 2;
+ else
+ return 1;
+ }
+}
int split_huge_page(struct page *page)
{
struct anon_vma *anon_vma;
diff --git a/mm/memory.c b/mm/memory.c
index 539d1f4..2b43661 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1253,12 +1253,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
- if (next-addr != HPAGE_PMD_SIZE) {
+ if (unlikely(details && details->just_split) ||
+ next - addr != HPAGE_PMD_SIZE) {
/* And now we go again in conflict with, THP...
* THP requires semaphore, we require compound
* frozen, why...?
*/
split_huge_page_pmd_vma(vma, addr, pmd);
+ if (unlikely(details && details->just_split))
+ continue;
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
/* fall through */
@@ -2826,22 +2829,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
}
}
-/**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
- * @mapping: the address space containing mmaps to be unmapped.
- * @holebegin: byte in first page to unmap, relative to the start of
- * the underlying file. This will be rounded down to a PAGE_SIZE
- * boundary. Note that this is different from truncate_pagecache(), which
- * must keep the partial page. In contrast, we must get rid of
- * partial pages.
- * @holelen: size of prospective hole in bytes. This will be rounded
- * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
- * end of the file.
- * @even_cows: 1 when truncating a file, unmap even private COWed pages;
- * but 0 when invalidating pagecache, don't throw away private data.
- */
-void unmap_mapping_range(struct address_space *mapping,
- loff_t const holebegin, loff_t const holelen, int even_cows)
+static void _unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows,
+ int just_split)
{
struct zap_details details;
pgoff_t hba = holebegin >> PAGE_SHIFT;
@@ -2859,6 +2849,8 @@ void unmap_mapping_range(struct address_space *mapping,
details.nonlinear_vma = NULL;
details.first_index = hba;
details.last_index = hba + hlen - 1;
+ details.just_split = just_split;
+
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
@@ -2870,8 +2862,36 @@ void unmap_mapping_range(struct address_space *mapping,
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
mutex_unlock(&mapping->i_mmap_mutex);
}
+/**
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * @mapping: the address space containing mmaps to be unmapped.
+ * @holebegin: byte in first page to unmap, relative to the start of
+ * the underlying file. This will be rounded down to a PAGE_SIZE
+ * boundary. Note that this is different from truncate_pagecache(), which
+ * must keep the partial page. In contrast, we must get rid of
+ * partial pages.
+ * @holelen: size of prospective hole in bytes. This will be rounded
+ * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
+ * end of the file.
+ * @even_cows: 1 when truncating a file, unmap even private COWed pages;
+ * but 0 when invalidating pagecache, don't throw away private data.
+ */
+void unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows)
+{
+ _unmap_mapping_range(mapping, holebegin, holelen, even_cows, false);
+}
EXPORT_SYMBOL(unmap_mapping_range);
+void split_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ _unmap_mapping_range(mapping, holebegin, holelen, false, true);
+#endif
+}
+EXPORT_SYMBOL(split_mapping_range);
+
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
diff --git a/mm/truncate.c b/mm/truncate.c
index 632b15e..6112a76 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -140,12 +140,28 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
int truncate_inode_page(struct address_space *mapping, struct page *page)
{
+ struct page *head = NULL;
+ int result;
+
+ if (unlikely(PageCompound(page))) {
+ head = compound_head(page);
+ if (compound_freeze(head)) {
+ if (!split_huge_page_file(head, page))
+ head = NULL;
+ } else {
+ head = NULL;
+ }
+ }
+
if (page_mapped(page)) {
unmap_mapping_range(mapping,
(loff_t)page->index << PAGE_CACHE_SHIFT,
PAGE_CACHE_SIZE, 0);
}
- return truncate_complete_page(mapping, page);
+ result = truncate_complete_page(mapping, page);
+ if (head)
+ compound_unfreeze(head);
+ return result;
}
/*
--
1.7.3.4
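For readers following the huge_memory.c hunk above, the locking scheme of
compound_try_lock_all() boils down to the sketch below. This is a condensed
illustration, not the applied code: it assumes the struct pages of the
compound are contiguous and skips the patch-private __first_page walk used
to bound the loop.

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Trylock every page of the compound except the one the caller already
 * holds; on failure, drop the locks taken so far in reverse order.
 */
static int try_lock_whole_compound(struct page *locked)
{
	struct page *head = compound_head(locked);
	int nr = 1 << compound_order(head);
	int i;

	for (i = 0; i < nr; i++) {
		struct page *p = head + i;

		if (p == locked)
			continue;
		if (!trylock_page(p))
			goto rollback;
	}
	return 1;

rollback:
	while (--i >= 0) {
		struct page *p = head + i;

		if (p != locked)
			unlock_page(p);
	}
	return 0;
}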
Fixed smaps so it does not split the page, and made it print information
about shared/private huge dirty/clean pages. The change relies only on the
dirty flag from the pmd - this may not be enough, but additionally checking
PageDirty, as is done for ptes, would be too much, because the head of a
huge page may also be mapped by a single pte, not only as a huge pmd.
In pagemap, splitting was removed and a huge pmd is now reported as one
page with the huge page shift.
Signed-off-by: Radosław Smogura <[email protected]>
---
fs/proc/task_mmu.c | 97 ++++++++++++++++++++++++++++++++++++----------------
1 files changed, 67 insertions(+), 30 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7dcd2a2..111e64c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -333,8 +333,12 @@ struct mem_size_stats {
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
+ unsigned long shared_huge_clean;
+ unsigned long shared_huge_dirty;
unsigned long private_clean;
unsigned long private_dirty;
+ unsigned long private_huge_clean;
+ unsigned long private_huge_dirty;
unsigned long referenced;
unsigned long anonymous;
unsigned long anonymous_thp;
@@ -342,9 +346,8 @@ struct mem_size_stats {
u64 pss;
};
-
static void smaps_pte_entry(pte_t ptent, unsigned long addr,
- unsigned long ptent_size, struct mm_walk *walk)
+ unsigned long ptent_size, struct mm_walk *walk, int huge_file)
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
@@ -368,20 +371,33 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
mss->resident += ptent_size;
/* Accumulate the size in pages that have been accessed. */
- if (pte_young(ptent) || PageReferenced(page))
+ if (pte_young(ptent) || (!huge_file && PageReferenced(page)))
mss->referenced += ptent_size;
mapcount = page_mapcount(page);
+ /* For a huge file mapping, account dirtiness only from the pte/pmd, as
+ * the page may be dirty while the pmd is not (a huge page may be mapped
+ * by ptes, not only by a huge pmd). */
if (mapcount >= 2) {
- if (pte_dirty(ptent) || PageDirty(page))
+ if (pte_dirty(ptent) || (!huge_file && PageDirty(page))) {
mss->shared_dirty += ptent_size;
- else
+ if (huge_file)
+ mss->shared_huge_dirty += ptent_size;
+ } else {
mss->shared_clean += ptent_size;
+ if (huge_file)
+ mss->shared_huge_clean += ptent_size;
+ }
mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
} else {
- if (pte_dirty(ptent) || PageDirty(page))
+ if (pte_dirty(ptent) || (!huge_file && PageDirty(page))) {
mss->private_dirty += ptent_size;
- else
+ if (huge_file)
+ mss->private_huge_dirty += ptent_size;
+ } else {
mss->private_clean += ptent_size;
+ if (huge_file)
+ mss->private_huge_clean += ptent_size;
+ }
mss->pss += (ptent_size << PSS_SHIFT);
}
}
@@ -401,9 +417,10 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
wait_split_huge_page(vma->anon_vma, pmd);
} else {
smaps_pte_entry(*(pte_t *)pmd, addr,
- HPAGE_PMD_SIZE, walk);
+ HPAGE_PMD_SIZE, walk,
+ vma->vm_ops != NULL);
spin_unlock(&walk->mm->page_table_lock);
- mss->anonymous_thp += HPAGE_PMD_SIZE;
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
} else {
@@ -416,7 +433,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
- smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
+ smaps_pte_entry(*pte, addr, PAGE_SIZE, walk, 0);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
@@ -443,20 +460,24 @@ static int show_smap(struct seq_file *m, void *v)
show_map_vma(m, vma);
seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n"
- "Anonymous: %8lu kB\n"
- "AnonHugePages: %8lu kB\n"
- "Swap: %8lu kB\n"
- "KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n"
- "Locked: %8lu kB\n",
+ "Size: %8lu kB\n"
+ "Rss: %8lu kB\n"
+ "Pss: %8lu kB\n"
+ "Shared_Clean: %8lu kB\n"
+ "Shared_Dirty: %8lu kB\n"
+ "Private_Clean: %8lu kB\n"
+ "Private_Dirty: %8lu kB\n"
+ "Shared_Huge_Clean: %8lu kB\n"
+ "Shared_Huge_Dirty: %8lu kB\n"
+ "Private_Huge_Clean: %8lu kB\n"
+ "Private_Huge_Dirty: %8lu kB\n"
+ "Referenced: %8lu kB\n"
+ "Anonymous: %8lu kB\n"
+ "AnonHugePages: %8lu kB\n"
+ "Swap: %8lu kB\n"
+ "KernelPageSize: %8lu kB\n"
+ "MMUPageSize: %8lu kB\n"
+ "Locked: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
mss.resident >> 10,
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -464,6 +485,10 @@ static int show_smap(struct seq_file *m, void *v)
mss.shared_dirty >> 10,
mss.private_clean >> 10,
mss.private_dirty >> 10,
+ mss.shared_huge_clean >> 10,
+ mss.shared_huge_dirty >> 10,
+ mss.private_huge_clean >> 10,
+ mss.private_huge_dirty >> 10,
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
@@ -661,6 +686,15 @@ static u64 pte_to_pagemap_entry(pte_t pte)
return pme;
}
+static u64 pmd_to_pagemap_entry(pmd_t pmd)
+{
+ u64 pme = 0;
+ if (pmd_present(pmd))
+ pme = PM_PFRAME(pmd_pfn(pmd))
+ | PM_PSHIFT(HPAGE_SHIFT) | PM_PRESENT;
+ return pme | PM_PSHIFT(HPAGE_SHIFT);
+}
+
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -669,8 +703,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;
- split_huge_page_pmd(walk->mm, pmd);
-
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
for (; addr != end; addr += PAGE_SIZE) {
@@ -685,10 +717,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
* and that it isn't a huge page vma */
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
- pte = pte_offset_map(pmd, addr);
- pfn = pte_to_pagemap_entry(*pte);
- /* unmap before userspace copy */
- pte_unmap(pte);
+ pmd_t pmd_val = *pmd;
+ if (pmd_trans_huge(pmd_val)) {
+ pfn = pmd_to_pagemap_entry(pmd_val);
+ } else {
+ pte = pte_offset_map(pmd, addr);
+ pfn = pte_to_pagemap_entry(*pte);
+ /* unmap before userspace copy */
+ pte_unmap(pte);
+ }
}
err = add_to_pagemap(addr, pfn, pm);
if (err)
--
1.7.3.4
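The dirty-accounting rule described in the message above (for huge file
mappings, trust only the pte/pmd dirty bit) can be summarised as the small
helper below. This is an illustrative sketch, not code from the patch;
smaps_pte_entry() open-codes the same test inline.

#include <linux/mm.h>
#include <asm/pgtable.h>

/*
 * For a huge file mapping only the pte/pmd dirty bit counts, because the
 * head page may also be mapped by small ptes and PageDirty() on it could
 * misattribute dirtiness to the huge mapping.
 */
static bool smaps_entry_dirty(pte_t ptent, struct page *page, int huge_file)
{
	return pte_dirty(ptent) || (!huge_file && PageDirty(page));
}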
This is rather an experiment to uncover all the gaps in adding huge page
cache support for shm, not an attempt to provide any real huge page support
for the EXT4 file system. It will show whether some of the concepts were
good or bad.
In any case, the target is that some segments of glibc may be mapped as
huge pages, but only if they are aligned to huge page boundaries.
Signed-off-by: Radosław Smogura <[email protected]>
---
fs/ext4/Kconfig | 9 ++++
fs/ext4/file.c | 3 +
fs/ext4/inode.c | 15 +++++++
include/linux/defrag-pagecache.h | 4 ++
include/linux/mm.h | 4 ++
mm/defrag-pagecache.c | 19 +++++++++
mm/filemap.c | 82 ++++++++++++++++++++++++++++++++++++++
7 files changed, 136 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9ed1bb1..1a33bb0 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -83,3 +83,12 @@ config EXT4_DEBUG
If you select Y here, then you will be able to turn on debugging
with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+
+config EXT4_HUGEPAGECACHE
+ bool "EXT4 Huge Page Cache Support [Danegerous]"
+ depends on EXT4_FS
+ depends on HUGEPAGECACHE
+ help
+ This is rather an experiment to uncover all the gaps in adding huge page
+ cache support for shm, not real huge page support for the EXT4 file
+ system. It only tests whether some of the concepts are sound.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cb70f18..57698df 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -143,6 +143,9 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .fault_huge = filemap_fault_huge,
+#endif
.page_mkwrite = ext4_page_mkwrite,
};
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82f..8bbda5a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -45,6 +45,9 @@
#include <trace/events/ext4.h>
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
#define MPAGE_DA_EXTENT_TAIL 0x01
static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -3036,6 +3039,9 @@ static const struct address_space_operations ext4_ordered_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};
static const struct address_space_operations ext4_writeback_aops = {
@@ -3051,6 +3057,9 @@ static const struct address_space_operations ext4_writeback_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};
static const struct address_space_operations ext4_journalled_aops = {
@@ -3066,6 +3075,9 @@ static const struct address_space_operations ext4_journalled_aops = {
.direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};
static const struct address_space_operations ext4_da_aops = {
@@ -3082,6 +3094,9 @@ static const struct address_space_operations ext4_da_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_EXT4_HUGEPAGECACHE
+ .defragpage = defrag_generic_file,
+#endif
};
void ext4_set_aops(struct inode *inode)
diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
index 4ca3468..fb305c8 100644
--- a/include/linux/defrag-pagecache.h
+++ b/include/linux/defrag-pagecache.h
@@ -42,5 +42,9 @@ extern int defrag_generic_shm(struct file *file, struct address_space *mapping,
loff_t pos,
struct page **pagep,
struct defrag_pagecache_ctl *ctl);
+extern int defrag_generic_file(struct file *file, struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ struct defrag_pagecache_ctl *ctl);
#endif /* DEFRAG_PAGECACHE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4c67555..24c2c6c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1620,6 +1620,10 @@ extern void truncate_inode_pages_range(struct address_space *,
/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
+#ifdef CONFIG_HUGEPAGECACHE
+extern int filemap_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf);
+#endif
+
/* mm/page-writeback.c */
int write_one_page(struct page *page, int wait);
void task_dirty_inc(struct task_struct *tsk);
diff --git a/mm/defrag-pagecache.c b/mm/defrag-pagecache.c
index 5a14fe8..6a87814 100644
--- a/mm/defrag-pagecache.c
+++ b/mm/defrag-pagecache.c
@@ -104,6 +104,16 @@ struct page *shmem_defrag_get_page(const struct defrag_pagecache_ctl *ctl,
mapping_gfp_mask(inode->i_mapping));
}
+/** Callback for getting a page for a regular file.
+ * Uses read_mapping_page() to read the page
+ * from the page cache.
+ */
+struct page *file_defrag_get_page(const struct defrag_pagecache_ctl *ctl,
+ struct inode *inode, pgoff_t pageIndex)
+{
+ return read_mapping_page(inode->i_mapping, pageIndex, NULL);
+}
+
static void defrag_generic_mig_result(struct page *oldPage,
struct page *newPage, struct migration_ctl *ctl, int result)
{
@@ -258,6 +268,15 @@ int defrag_generic_shm(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(defrag_generic_shm);
+int defrag_generic_file(struct file *file, struct address_space *mapping,
+ loff_t pos,
+ struct page **pagep,
+ struct defrag_pagecache_ctl *ctl)
+{
+ return defrageOneHugePage(file, pos, pagep, ctl, file_defrag_get_page);
+}
+EXPORT_SYMBOL(defrag_generic_file);
+
int defrag_generic_pagecache(struct file *file,
struct address_space *mapping,
loff_t pos,
diff --git a/mm/filemap.c b/mm/filemap.c
index 8363cd9..f050209 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -43,6 +43,9 @@
#include <asm/mman.h>
+#ifdef CONFIG_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -1771,6 +1774,85 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
+#ifdef CONFIG_HUGEPAGECACHE
+/** DO NOT USE: this function is still experimental. */
+int filemap_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ int error;
+ int ret = VM_FAULT_LOCKED;
+
+ error = vma->vm_ops->fault(vma, vmf);
+ /* XXX Repeatable flags in __do fault etc. */
+ if (error & (VM_FAULT_ERROR | VM_FAULT_NOPAGE
+ | VM_FAULT_RETRY | VM_FAULT_NOHUGE)) {
+ return error;
+ }
+
+ /* Development-only code to force defragmentation, as there is no
+ * external interface (or daemon) to trigger defragmentation yet.
+ */
+ if ((vma->vm_flags & VM_HUGEPAGE) && !PageCompound(vmf->page)) {
+ /* Force defrag - mainly development code */
+ int defragResult;
+ const loff_t hugeChunkSize = 1 << (PMD_SHIFT - PAGE_SHIFT);
+
+ const loff_t vmaSizeToMap = (vma->vm_start
+ + ((vmf->pgoff + vma->vm_pgoff + hugeChunkSize)
+ << PAGE_SHIFT) <= vma->vm_end) ?
+ hugeChunkSize : 0;
+
+ const loff_t inodeSizeToMap =
+ (vmf->pgoff + vma->vm_pgoff + hugeChunkSize <
+ inode->i_size) ? hugeChunkSize : 0;
+
+ const struct defrag_pagecache_ctl defragControl = {
+ .fillPages = 1,
+ .requireFillPages = 1,
+ .force = 1
+ };
+
+ if (ret & VM_FAULT_LOCKED) {
+ unlock_page(vmf->page);
+ }
+ put_page(vmf->page);
+
+ defragResult = defragPageCache(vma->vm_file,
+ vmf->pgoff,
+ min(vmaSizeToMap, min(inodeSizeToMap, hugeChunkSize)),
+ &defragControl);
+ printk(KERN_INFO "Page defragmented with result %d\n",
+ defragResult);
+
+ /* Retake page. */
+ error = vma->vm_ops->fault(vma, vmf);
+ if (error & (VM_FAULT_ERROR | VM_FAULT_NOPAGE
+ | VM_FAULT_RETRY | VM_FAULT_NOHUGE)) {
+ return error;
+ }
+ }
+
+ /* After the standard fault, the page has been obtained. */
+ if (!compound_get(vmf->page))
+ goto no_hugepage;
+
+ get_page_tails_for_fmap(vmf->page);
+
+ if (ret & VM_FAULT_MAJOR) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ }
+ return ret;
+no_hugepage:
+ if (ret & VM_FAULT_LOCKED)
+ unlock_page(vmf->page);
+ page_cache_release(vmf->page);
+ vmf->page = NULL;
+ return VM_FAULT_NOHUGE;
+}
+EXPORT_SYMBOL(filemap_fault_huge);
+#endif
+
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
};
--
1.7.3.4
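How a filesystem opts in follows directly from the ext4 hunks above: the
huge fault handler and the defrag callback are wired up only under the
config option. The fragment below restates that wiring in one place; it is
illustrative only, and the example_* name is a placeholder.

#include <linux/fs.h>
#include <linux/mm.h>
#ifdef CONFIG_EXT4_HUGEPAGECACHE
#include <linux/defrag-pagecache.h>
#endif

/* Placeholder ops table mirroring ext4_file_vm_ops in the patch above. */
static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,
#ifdef CONFIG_EXT4_HUGEPAGECACHE
	/* Falls back to the normal fault path via VM_FAULT_NOHUGE. */
	.fault_huge	= filemap_fault_huge,
#endif
};

The address_space_operations additionally gain .defragpage =
defrag_generic_file under the same #ifdef, as in the four ext4 aops tables
above.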
This is basic support for shmfs, allowing bootstrapping of huge pages
in the user address space.
This patch is just a first step; it breaks the kernel because other
page cache requirements are still missing, but the groundwork is
done :D. Yupi!
Signed-off-by: Radosław Smogura <[email protected]>
---
include/linux/fs.h | 4 ++--
include/linux/mm.h | 4 ++--
mm/shmem.c | 30 ++++++++++++++----------------
3 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7288166..7afc38b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -595,7 +595,7 @@ struct address_space_operations {
/** Same as \a set_page_dirty but for huge page */
int (*set_page_dirty_huge)(struct page *page);
-
+
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
@@ -627,7 +627,7 @@ struct address_space_operations {
*/
int (*split_page) (struct file *file, struct address_space *mapping,
loff_t pos, struct page *hueg_page);
-
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 27a10c8..236a6be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -210,7 +210,7 @@ struct vm_operations_struct {
* If function fails, then caller may try again with fault.
*/
int (*fault_huge)(struct vm_area_struct *vma, struct vm_fault *vmf);
-
+
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -218,7 +218,7 @@ struct vm_operations_struct {
/** Same as \a page_mkwrite, but for huge page. */
int (*page_mkwrite_huge)(struct vm_area_struct *vma,
struct vm_fault *vmf);
-
+
/* called by access_process_vm when get_user_pages() fails, typically
* for use by special VMAs that can switch between memory and hardware
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index a834488..97e76b9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -67,6 +67,10 @@ static struct vfsmount *shm_mnt;
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#ifdef CONFIG_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
+
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
@@ -1119,24 +1123,12 @@ static int shmem_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
}
}
- /* XXX Page & compound lock ordering please... */
-
/* After standard fault page is getted. */
- if (PageCompound(vmf->page)) {
- compound_lock(vmf->page);
- if (!PageHead(vmf->page)) {
- compound_unlock(vmf->page);
- goto no_hugepage;
- }
- }else {
+ if (!compound_get(vmf->page))
goto no_hugepage;
- }
-
- if (!(ret & VM_FAULT_LOCKED))
- lock_page(vmf->page);
-
- ret |= VM_FAULT_LOCKED;
-
+
+ get_page_tails_for_fmap(vmf->page);
+
if (ret & VM_FAULT_MAJOR) {
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
@@ -2381,6 +2373,9 @@ static const struct address_space_operations shmem_aops = {
#endif
.migratepage = migrate_page,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_HUGEPAGECACHE
+ .defragpage = defrag_generic_shm,
+#endif
};
static const struct file_operations shmem_file_operations = {
@@ -2458,6 +2453,9 @@ static const struct super_operations shmem_ops = {
static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
+#ifdef CONFIG_SHMEM_HUGEPAGECACHE
+ .fault_huge = shmem_fault_huge,
+#endif
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
--
1.7.3.4
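Both shmem_fault_huge() above and filemap_fault_huge() from the earlier
filemap.c patch end with the same sequence once the regular ->fault has
produced a page. A condensed sketch of that common tail is shown below;
compound_get(), get_page_tails_for_fmap() and VM_FAULT_NOHUGE are
introduced elsewhere in this series, and the helper name is only
illustrative.

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Common tail of the huge fault handlers: pin the compound as a whole and
 * take references on its tail pages, or give the page back and report
 * VM_FAULT_NOHUGE so the caller retries with a regular small fault.
 */
static int finish_huge_fault(struct vm_fault *vmf, int ret)
{
	if (!compound_get(vmf->page)) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf->page);
		page_cache_release(vmf->page);
		vmf->page = NULL;
		return VM_FAULT_NOHUGE;
	}

	get_page_tails_for_fmap(vmf->page);
	return ret;
}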
Writeback will be added in later patches, after the experimental support
for huge pages in EXT4.
Signed-off-by: Radosław Smogura <[email protected]>
---
mm/shmem.c | 39 ++++++++++++++++++++++++++++++++++++++-
1 files changed, 38 insertions(+), 1 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 97e76b9..db377bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -454,6 +454,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
+ struct page *head = NULL;
index = indices[i];
if (index > end)
@@ -464,12 +465,32 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
index, page);
continue;
}
-
if (!trylock_page(page))
continue;
+ if (PageCompound(page)) {
+ head = compound_head(page);
+ switch (compound_try_freeze(head, false)) {
+ case -1:
+ head = NULL;
+ break;
+ case 1:
+ unlock_page(page);
+ continue;
+ case 0:
+ if (!split_huge_page_file(head, page))
+ head = NULL;
+ break;
+ }
+ }
+ /* truncate_inode_page() may try to freeze, so unfreeze first. */
if (page->mapping == mapping) {
VM_BUG_ON(PageWriteback(page));
+ if (head != NULL)
+ compound_unfreeze(head);
truncate_inode_page(mapping, page);
+ } else {
+ if (head != NULL)
+ compound_unfreeze(head);
}
unlock_page(page);
}
@@ -511,6 +532,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
+ struct page *head = NULL;
index = indices[i];
if (index > end)
@@ -523,9 +545,24 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
}
lock_page(page);
+ if (PageCompound(page)) {
+ head = compound_head(page);
+ if (compound_freeze(head)) {
+ if (!split_huge_page_file(head, page))
+ head = NULL;
+ } else {
+ head = NULL;
+ }
+ }
+ /* truncate_inode_page() may try to freeze, so unfreeze first. */
if (page->mapping == mapping) {
VM_BUG_ON(PageWriteback(page));
+ if (head != NULL)
+ compound_unfreeze(head);
truncate_inode_page(mapping, page);
+ } else {
+ if (head != NULL)
+ compound_unfreeze(head);
}
unlock_page(page);
}
--
1.7.3.4
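The two shmem_truncate_range() passes above, and truncate_inode_page() in
the earlier mm/truncate.c hunk, all follow the same freeze / try-to-split /
unfreeze pattern around a compound page. The condensed sketch below shows
that pattern for a page that is already locked; compound_freeze(),
compound_unfreeze() and split_huge_page_file() come from this series, and
the helper name is only illustrative.

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Freeze the compound, try an in-place split, then truncate. The head
 * pointer is kept only when the freeze succeeded and the split was
 * deferred (split_huge_page_file() returned non-zero), so the compound
 * gets unfrozen again afterwards.
 */
static void truncate_one_locked_page(struct address_space *mapping,
				     struct page *page)
{
	struct page *head = NULL;

	if (PageCompound(page)) {
		head = compound_head(page);
		if (!compound_freeze(head) ||
		    !split_huge_page_file(head, page))
			head = NULL;
	}

	/* truncate_inode_page() may freeze again, so unfreeze first. */
	if (head)
		compound_unfreeze(head);

	if (page->mapping == mapping)
		truncate_inode_page(mapping, page);
}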
On Thu, 16 Feb 2012 12:26:00 -0800, Radosław Smogura <[email protected]> wrote:
> On Thu, 16 Feb 2012 11:09:18 -0800, Michal Nazarewicz wrote:
>> On Thu, 16 Feb 2012 06:31:28 -0800, Radosław Smogura
>> <[email protected]> wrote:
>>> Supporting files, like Kconfig, Makefile are auto-generated due to
>>> large amount
>>> of available options.
>>
>> So why not run the script as part of make rather then store generated
>> files in
>> repository?
> Idea to run this script through make is quite good, and should work,
> because new mane will be generated before "config" starts.
>
> "Bashizms" are indeed unneeded, I will try to replace this with sed.
Uh? Why sed?
--
Best regards, _ _
.o. | Liege of Serenely Enlightened Majesty of o' \,=./ `o
..o | Computer Science, Michał “mina86” Nazarewicz (o o)
ooo +----<email/xmpp: [email protected]>--------------ooO--(_)--Ooo--
On Thu, 16 Feb 2012 13:59:29 -0800, Michal Nazarewicz wrote:
> On Thu, 16 Feb 2012 12:26:00 -0800, Radosław Smogura
> <[email protected]> wrote:
>
>> On Thu, 16 Feb 2012 11:09:18 -0800, Michal Nazarewicz wrote:
>>> On Thu, 16 Feb 2012 06:31:28 -0800, Radosław Smogura
>>> <[email protected]> wrote:
>>>> Supporting files, like Kconfig, Makefile are auto-generated due to
>>>> large amount
>>>> of available options.
>>>
>>> So why not run the script as part of make rather then store
>>> generated
>>> files in
>>> repository?
>> Idea to run this script through make is quite good, and should work,
>> because new mane will be generated before "config" starts.
>>
>> "Bashizms" are indeed unneeded, I will try to replace this with sed.
>
> Uh? Why sed?
There are some substitutions, so I think it will be better to use sed.
On Thu, 16 Feb 2012 14:40:17 -0800, Radosław Smogura <[email protected]> wrote:
> On Thu, 16 Feb 2012 13:59:29 -0800, Michal Nazarewicz wrote:
>> On Thu, 16 Feb 2012 12:26:00 -0800, Radosław Smogura
>> <[email protected]> wrote:
>>
>>> On Thu, 16 Feb 2012 11:09:18 -0800, Michal Nazarewicz wrote:
>>>> On Thu, 16 Feb 2012 06:31:28 -0800, Radosław Smogura
>>>> <[email protected]> wrote:
>>>>> Supporting files, like Kconfig, Makefile are auto-generated due to
>>>>> large amount
>>>>> of available options.
>>>>
>>>> So why not run the script as part of make rather then store
>>>> generated
>>>> files in
>>>> repository?
>>> Idea to run this script through make is quite good, and should work,
>>> because new mane will be generated before "config" starts.
>>>
>>> "Bashizms" are indeed unneeded, I will try to replace this with sed.
>>
>> Uh? Why sed?
> There are some substitutions, so I think it will be better to use sed.
Do you mean this:
> cfg_o="${CFG_PREFIX}_${o//-/_}";
It can be achieved with “tr” like I've shown:
> cfg_o=$CFG_PREFIX_$(echo "$o" | tr '[:lower:]-' '[:upper:]_')
--
Best regards, _ _
.o. | Liege of Serenely Enlightened Majesty of o' \,=./ `o
..o | Computer Science, Michał “mina86” Nazarewicz (o o)
ooo +----<email/xmpp: [email protected]>--------------ooO--(_)--Ooo--
A new menu under Kernel hacking allows forcing "-O1" optimization
and gives the ability to discard additional optimizations (implicitly
enabled by -O1). The options are added as "-fno-..." to the GCC invocation
line. This may produce additional warnings, but makes the compiled code
more debug friendly.
The patch is integrated with the main Makefile. It generates the additional
Kconfig and Makefile include from a script, so the sources are kept clean.
Adding another deoptimization option requires appending just one line to
the script.
Some options are specific to particular GCC versions.
---
.gitignore | 5 +
Makefile | 28 +++++++-
lib/Kconfig.debug | 2 +
scripts/debug/make_config_optim.sh | 142 ++++++++++++++++++++++++++++++++++++
4 files changed, 175 insertions(+), 2 deletions(-)
create mode 100644 scripts/debug/make_config_optim.sh
diff --git a/.gitignore b/.gitignore
index 57af07c..1ad2a92 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,3 +84,8 @@ GTAGS
*.orig
*~
\#*#
+
+# Deoptimization generated files
+scripts/Makefile.optim.inc
+lib/Kconfig.debug.optim
+
diff --git a/Makefile b/Makefile
index 7c44b67..d1c4080 100644
--- a/Makefile
+++ b/Makefile
@@ -131,7 +131,7 @@ sub-make: FORCE
KBUILD_SRC=$(CURDIR) \
KBUILD_EXTMOD="$(KBUILD_EXTMOD)" -f $(CURDIR)/Makefile \
$(filter-out _all sub-make,$(MAKECMDGOALS))
-
+
# Leave processing to above invocation of make
skip-makefile := 1
endif # ifneq ($(KBUILD_OUTPUT),)
@@ -432,6 +432,19 @@ asm-generic:
$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.asm-generic \
obj=arch/$(SRCARCH)/include/generated/asm
+# Support for auto-generating the hacker's deoptimization files
+PHONY += deoptimize-config
+DEOPTIMIZE_FILES := $(srctree)/lib/Kconfig.debug.optim \
+ $(srctree)/scripts/Makefile.optim.inc
+
+$(DEOPTIMIZE_FILES): $(srctree)/scripts/debug/make_config_optim.sh
+ $(Q)$(srctree)/scripts/debug/make_config_optim.sh --kconfig \
+ > $(srctree)/lib/Kconfig.debug.optim
+ $(Q)$(srctree)/scripts/debug/make_config_optim.sh --makefile \
+ > scripts/Makefile.optim.inc
+
+deoptimize-config: $(DEOPTIMIZE_FILES)
+
# To make sure we do not include .config for any of the *config targets
# catch them early, and hand them over to scripts/kconfig/Makefile
# It is allowed to specify more targets when calling make, including
@@ -484,7 +497,7 @@ ifeq ($(config-targets),1)
include $(srctree)/arch/$(SRCARCH)/Makefile
export KBUILD_DEFCONFIG KBUILD_KCONFIG
-config: scripts_basic outputmakefile FORCE
+config: deoptimize-config scripts_basic outputmakefile FORCE
$(Q)mkdir -p include/linux include/config
$(Q)$(MAKE) $(build)=scripts/kconfig $@
@@ -558,12 +571,23 @@ endif # $(dot-config)
# Defaults to vmlinux, but the arch makefile usually adds further targets
all: vmlinux
+ifdef CONFIG_HACK_OPTIM_FORCE_O1_LEVEL
+KBUILD_CFLAGS += -O1
+else
+
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
KBUILD_CFLAGS += -Os
else
KBUILD_CFLAGS += -O2
endif
+endif
+
+# Include makefile for optimization override
+ifdef CONFIG_HACK_OPTIM
+include $(srctree)/scripts/Makefile.optim.inc
+endif
+
include $(srctree)/arch/$(SRCARCH)/Makefile
ifneq ($(CONFIG_FRAME_WARN),0)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8745ac7..928265e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1274,5 +1274,7 @@ source "lib/Kconfig.kgdb"
source "lib/Kconfig.kmemcheck"
+source "lib/Kconfig.debug.optim"
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
diff --git a/scripts/debug/make_config_optim.sh b/scripts/debug/make_config_optim.sh
new file mode 100644
index 0000000..35bb321
--- /dev/null
+++ b/scripts/debug/make_config_optim.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+
+############################################################################
+## Utility script for generating deoptimization overrides for kernel
+## compilation.
+##
+## By default the kernel is built with -O2 or -Os, which makes it harder
+## to debug due to the reorganization of execution paths and the
+## stripping of code and variables.
+##
+## This script generates the Kconfig and Makefile includes that force -O1
+## and add many deoptimization parameters (-fno-*) to GCC.
+##
+## Happy hacking.
+##
+## Distributed under GPL v2 license
+## (c) Radosław Smogura, 2011,2012
+
+# Prefix added for variable
+CFG_PREFIX="HACK_OPTIM"
+
+function printHelpAndExit() {
+ printf "KConfig and Makefile generator for kenrel deoptimization :)\n"
+ printf "Use one of options:\n"
+ printf "%s\n" " --kconfig - to generate Kconfig.debug.optim"
+ printf "%s\n" " --makefile - to generate Makefile.optim.inc"
+ printf "Files are generated to standard output\n"
+ exit 1;
+}
+
+if [ "$1" != "--kconfig" ] && [ "$1" != "--makefile" ]; then
+ printHelpAndExit
+fi
+
+OPTIMIZATIONS_PARAMS="-fno-inline-functions-called-once
+ -fno-combine-stack-adjustments
+ -fno-tree-dce
+ -fno-tree-dominator-opts
+ -fno-dse
+ -fno-dce
+ -fno-auto-inc-dec
+ -fno-inline-small-functions
+ -fno-if-conversion
+ -fno-if-conversion2
+ -fno-tree-fre
+ -fno-tree-dse
+ -fno-tree-sra
+"
+
+function printStandardHelp() {
+ printf "\t This changes how GCC optimizes code. Code\n"
+ printf "\t may be slower and larger but will be more debug\n"
+ printf "\t \"friendly\".\n"
+ printf "\n"
+ printf "\t In some cases there is a low chance that the kernel\n"
+ printf "\t will run differently than normal, reporting or not\n"
+ printf "\t reporting some bugs or errors.\n"
+ printf "\t Refer to GCC manual for more details.\n"
+ printf "\n"
+ printf "\t You SHOULD say N here.\n"
+}
+
+function printFileHeader() {
+ printf "################################################################\n"
+ printf "## THIS FILE WAS AUTO GENERATED.\n"
+ printf "## YOU MAY MODIFY IT, BUT YOUR MODIFICATIONS MAY BE LOST\n"
+ printf "## GENERATED ON $(date)\n"
+ printf "## BY $0\n"
+ printf "## Distributed under GPL v2 License\n"
+ printf "##\n"
+ printf "## Happy hacking.\n"
+ printf "################################################################\n"
+}
+
+function printKconfigHeader() {
+ printFileHeader;
+ printf "\n"
+ printf "menuconfig ${CFG_PREFIX}\n"
+ printf "\tbool \"Allows overriding GCC optimizations\"\n"
+ printf "\tdepends on DEBUG_KERNEL && EXPERIMENTAL\n"
+ printf "\thelp\n"
+ printf "\t If you say Y here you will be able to override\n"
+ printf "\t how GCC optimizes kernel code. This creates\n"
+ printf "\t more debug-friendly code, but does not guarantee\n"
+ printf "\t the same running code like a production kernel.\n"
+ printf "\n"
+ printf "\t If you say Y here probably you will want to say\n"
+ printf "\t Y for all suboptions\n"
+ printf "\n"
+ printf "if ${CFG_PREFIX}\n"
+
+ # Insert the standard optimization level override.
+ # This is an exception: this value is not included in the
+ # auto-generated makefile; support for it is hard-coded in the
+ # main Makefile.
+ printf "config ${CFG_PREFIX}_FORCE_O1_LEVEL\n"
+ printf "\tbool \"Forces -O1 optimization level\"\n"
+ printf "\t---help---\n"
+ printStandardHelp;
+ printf "\n"
+}
+
+function printMakeOptimStart() {
+ printFileHeader;
+ printf "\n"
+}
+
+if [ "$1" == "--kconfig" ]; then
+ printKconfigHeader;
+else
+ printMakeOptimStart;
+fi
+
+# Print each option to KConfig and Makefile
+for o in $OPTIMIZATIONS_PARAMS ; do
+ # printf is not portable across shells when the text starts with "-":
+ # in bash, printf "-fno-..." is treated as an option and prints an
+ # error (e.g. printf "-f-no" | sed ... fails), while in posh (a Bourne
+ # sh clone; the original sh is hard to get) it works fine, so the
+ # explicit "%s" format is used here.
+ cfg_o="${CFG_PREFIX}_$(printf "%s" "${o}" |sed -r -e 's/-/_/g;' )";
+
+ if [ "$1" == "--kconfig" ]; then
+ # Generate kconfig entry
+ printf "config ${cfg_o}\n";
+ printf "\tbool \"Adds $o parameter to gcc invoke line.\"\n";
+ printf "\t---help---\n";
+ printStandardHelp;
+ printf "\n";
+ else
+ # Generate the Makefile include
+ printf "ifdef CONFIG_${cfg_o}\n";
+ printf "\tKBUILD_CFLAGS += $o\n";
+ printf "endif\n";
+ printf "\n";
+ fi
+done;
+
+# Close KConfig
+if [ "$1" == "--kconfig" ]; then
+ echo "endif # if ${CFG_PREFIX}";
+fi
--
1.7.3.4