Here is V4 of the cleanup and optimisation of the page allocator and it
should be ready for general testing. The main difference from V3 is that the
controversial patches have been dropped and I'll revisit them later. Tests
are currently running so that I have exact figures of how things stand on the
test machines I used, but I think this can be considered a merge candidate,
possibly for 2.6.30, depending on how reviews and wider testing go.
Changes since V3
o Drop the more controversial patches for now and focus on the "obvious win"
material
o Add reviewed-by notes
o Fix changelog entry to say __rmqueue_fallback instead of __rmqueue
o Add unlikely() for the clearMlocked check
o Change where PGFREE is accounted in free_hot_cold_page() to have symmetry
with __free_pages_ok()
Changes since V2
o Remove branches by treating watermark flags as array indices
o Remove branch by assuming __GFP_HIGH == ALLOC_HIGH
o Do not check for compound on every page free
o Remove branch by always ensuring the migratetype is known on free
o Simplify buffered_rmqueue further
o Reintroduce improved version of batched bulk free of pcp pages
o Use allocation flags as an index to zone watermarks
o Work out __GFP_COLD only once
o Reduce the number of times zone stats are updated
o Do not dump reserve pages back into the allocator. Instead treat them
as MOVABLE so that MIGRATE_RESERVE gets used on the max-order-overlapped
boundaries without causing trouble
o Allow pages up to PAGE_ALLOC_COSTLY_ORDER to use the per-cpu allocator.
order-1 allocations in particular are frequent enough to justify this
o Rearrange inlining such that the hot-path is inlined but not in a way
that increases the text size of the page allocator
o Make the check for needing additional zonelist filtering due to NUMA
or cpusets as light as possible
o Do not destroy compound pages going to the PCP lists
o Delay the merging of buddies until a high-order allocation needs them
or anti-fragmentation is being forced to fallback
Changes since V1
o Remove the ifdef CONFIG_CPUSETS from inside get_page_from_freelist()
o Use non-lock bit operations for clearing the mlock flag
o Factor out alloc_flags calculation so it is only done once (Peter)
o Make gfp.h a bit prettier and clear-cut (Peter)
o Instead of deleting a debugging check, replace page_count() in the
free path with a version that does not check for compound pages (Nick)
o Drop the alteration for hot/cold page freeing until we know if it
helps or not
__alloc_pages_internal is the core page allocator function but
essentially it is an alias of __alloc_pages_nodemask. Naming a publicly
available and exported function "internal" is also a bit ugly. This
patch renames __alloc_pages_internal() to __alloc_pages_nodemask() and
deletes the old nodemask function.
Warning - This patch renames an exported symbol. No kernel driver is
affected, but external drivers calling __alloc_pages_internal() should
change the call to __alloc_pages_nodemask() without any alteration of
parameters.
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
include/linux/gfp.h | 12 ++----------
mm/page_alloc.c | 4 ++--
2 files changed, 4 insertions(+), 12 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index dd20cd7..dcf0ab8 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -168,24 +168,16 @@ static inline void arch_alloc_page(struct page *page, int order) { }
#endif
struct page *
-__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask);
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
- return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+ return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
-static inline struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
-{
- return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
-}
-
-
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c44ed4..0671b3f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1464,7 +1464,7 @@ try_next_zone:
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
-__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
@@ -1670,7 +1670,7 @@ nopage:
got_pg:
return page;
}
-EXPORT_SYMBOL(__alloc_pages_internal);
+EXPORT_SYMBOL(__alloc_pages_nodemask);
/*
* Common helper functions.
--
1.5.6.5
Callers of alloc_pages_node() can optionally specify -1 as a node to mean
"allocate from the current node". However, a number of the callers in fast
paths know for a fact their node is valid. To avoid a comparison and branch,
this patch adds alloc_pages_exact_node() that only checks the nid with
VM_BUG_ON(). Callers that know their node is valid are then converted.
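For reference, the difference between the existing helper and the new one is
roughly the following (condensed from the gfp.h hunk below):

    /* Existing helper: tolerates nid == -1 by falling back to the local node */
    static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
    {
        /* Unknown node is current node */
        if (nid < 0)
            nid = numa_node_id();

        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
    }

    /* New helper: the caller guarantees a valid nid, so the branch disappears */
    static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
    {
        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);

        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
    }

With CONFIG_DEBUG_VM disabled, the VM_BUG_ON() compiles away, so the new
helper costs nothing in production builds.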
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
arch/ia64/hp/common/sba_iommu.c | 2 +-
arch/ia64/kernel/mca.c | 3 +--
arch/ia64/kernel/uncached.c | 3 ++-
arch/ia64/sn/pci/pci_dma.c | 3 ++-
arch/powerpc/platforms/cell/ras.c | 2 +-
arch/x86/kvm/vmx.c | 2 +-
drivers/misc/sgi-gru/grufile.c | 2 +-
drivers/misc/sgi-xp/xpc_uv.c | 2 +-
include/linux/gfp.h | 9 +++++++++
include/linux/mm.h | 1 -
kernel/profile.c | 8 ++++----
mm/filemap.c | 2 +-
mm/hugetlb.c | 4 ++--
mm/mempolicy.c | 2 +-
mm/migrate.c | 2 +-
mm/slab.c | 4 ++--
mm/slob.c | 4 ++--
17 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 6d5e6c5..66a3257 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1116,7 +1116,7 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp
#ifdef CONFIG_NUMA
{
struct page *page;
- page = alloc_pages_node(ioc->node == MAX_NUMNODES ?
+ page = alloc_pages_exact_node(ioc->node == MAX_NUMNODES ?
numa_node_id() : ioc->node, flags,
get_order(size));
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index bab1de2..2e614bd 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1829,8 +1829,7 @@ ia64_mca_cpu_init(void *cpu_data)
data = mca_bootmem();
first_time = 0;
} else
- data = page_address(alloc_pages_node(numa_node_id(),
- GFP_KERNEL, get_order(sz)));
+ data = __get_free_pages(GFP_KERNEL, get_order(sz));
if (!data)
panic("Could not allocate MCA memory for cpu %d\n",
cpu);
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 8eff8c1..6ba72ab 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
/* attempt to allocate a granule's worth of cached memory pages */
- page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ page = alloc_pages_exact_node(nid,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
IA64_GRANULE_SHIFT-PAGE_SHIFT);
if (!page) {
mutex_unlock(&uc_pool->add_chunk_mutex);
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index 863f501..2aa52de 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -91,7 +91,8 @@ void *sn_dma_alloc_coherent(struct device *dev, size_t size,
*/
node = pcibus_to_node(pdev->bus);
if (likely(node >=0)) {
- struct page *p = alloc_pages_node(node, flags, get_order(size));
+ struct page *p = alloc_pages_exact_node(node,
+ flags, get_order(size));
if (likely(p))
cpuaddr = page_address(p);
diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c
index 5f961c4..16ba671 100644
--- a/arch/powerpc/platforms/cell/ras.c
+++ b/arch/powerpc/platforms/cell/ras.c
@@ -122,7 +122,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order)
area->nid = nid;
area->order = order;
- area->pages = alloc_pages_node(area->nid, GFP_KERNEL, area->order);
+ area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL, area->order);
if (!area->pages)
goto out_free_area;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7611af5..cca119a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1244,7 +1244,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
struct page *pages;
struct vmcs *vmcs;
- pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index 6509838..52d4160 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -309,7 +309,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
pnode = uv_node_to_pnode(nid);
if (gru_base[bid])
continue;
- page = alloc_pages_node(nid, GFP_KERNEL, order);
+ page = alloc_pages_exact_node(nid, GFP_KERNEL, order);
if (!page)
goto fail;
gru_base[bid] = page_address(page);
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index 29c0502..0563350 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -184,7 +184,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
mq->mmr_blade = uv_cpu_to_blade_id(cpu);
nid = cpu_to_node(cpu);
- page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
pg_order);
if (page == NULL) {
dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d "
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8736047..59eb093 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -4,6 +4,7 @@
#include <linux/mmzone.h>
#include <linux/stddef.h>
#include <linux/linkage.h>
+#include <linux/mmdebug.h>
struct vm_area_struct;
@@ -188,6 +189,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
+static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
+ unsigned int order)
+{
+ VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+
+ return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
+}
+
#ifdef CONFIG_NUMA
extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 065cdf8..565e7b2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -7,7 +7,6 @@
#include <linux/gfp.h>
#include <linux/list.h>
-#include <linux/mmdebug.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/prio_tree.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e04..62e08db 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -371,7 +371,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -379,7 +379,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -570,14 +570,14 @@ static int create_hash_tables(void)
int node = cpu_to_node(cpu);
struct page *page;
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
0);
if (!page)
diff --git a/mm/filemap.c b/mm/filemap.c
index 23acefe..2523d95 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -519,7 +519,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
{
if (cpuset_do_page_mem_spread()) {
int n = cpuset_mem_spread_node();
- return alloc_pages_node(n, gfp, 0);
+ return alloc_pages_exact_node(n, gfp, 0);
}
return alloc_pages(gfp, 0);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 107da3d..1e99997 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
if (h->order >= MAX_ORDER)
return NULL;
- page = alloc_pages_node(nid,
+ page = alloc_pages_exact_node(nid,
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
@@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
* Use a helper variable to find the next node and then
* copy it back to hugetlb_next_nid afterwards:
* otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+ * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
* But we don't need to use a spin_lock here: it really
* doesn't matter if occasionally a racer chooses the
* same nid as we do. Move nid forward in the mask even
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6f..341fbca 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -767,7 +767,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
diff --git a/mm/migrate.c b/mm/migrate.c
index a9eff3f..6bda9c2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
*result = &pm->status;
- return alloc_pages_node(pm->node,
+ return alloc_pages_exact_node(pm->node,
GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
diff --git a/mm/slab.c b/mm/slab.c
index 4d00855..e7f1ded 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1680,7 +1680,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
- page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+ page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder);
if (!page)
return NULL;
@@ -3210,7 +3210,7 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, -1);
+ obj = kmem_getpages(cache, local_flags, numa_node_id());
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
diff --git a/mm/slob.c b/mm/slob.c
index 52bc8a2..d646a4c 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -46,7 +46,7 @@
* NUMA support in SLOB is fairly simplistic, pushing most of the real
* logic down to the page allocator, and simply doing the node accounting
* on the upper levels. In the event that a node id is explicitly
- * provided, alloc_pages_node() with the specified node id is used
+ * provided, alloc_pages_exact_node() with the specified node id is used
* instead. The common case (or when the node id isn't explicitly provided)
* will default to the current node, as per numa_node_id().
*
@@ -236,7 +236,7 @@ static void *slob_new_page(gfp_t gfp, int order, int node)
#ifdef CONFIG_NUMA
if (node != -1)
- page = alloc_pages_node(node, gfp, order);
+ page = alloc_pages_exact_node(node, gfp, order);
else
#endif
page = alloc_pages(gfp, order);
--
1.5.6.5
No user of the allocator API should be passing in an order >= MAX_ORDER
but we check for it on each and every allocation. Delete this check and
make it a VM_BUG_ON check further down the call path.
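A consequence is that any caller which opportunistically probes for very
large contiguous areas now has to bound the order itself. A hypothetical
caller-side sketch (not part of this patch; the gfp flags are illustrative
only):

    /* Hypothetical example: the caller, not the allocator, rejects huge orders */
    if (order < MAX_ORDER)
        page = alloc_pages_node(nid, GFP_KERNEL | __GFP_NOWARN, order);
    else
        page = NULL;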
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
include/linux/gfp.h | 6 ------
mm/page_alloc.c | 2 ++
2 files changed, 2 insertions(+), 6 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index dcf0ab8..8736047 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -181,9 +181,6 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
- if (unlikely(order >= MAX_ORDER))
- return NULL;
-
/* Unknown node is current node */
if (nid < 0)
nid = numa_node_id();
@@ -197,9 +194,6 @@ extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
- if (unlikely(order >= MAX_ORDER))
- return NULL;
-
return alloc_pages_current(gfp_mask, order);
}
extern struct page *alloc_page_vma(gfp_t gfp_mask,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0671b3f..dd87dad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1407,6 +1407,8 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
classzone_idx = zone_idx(preferred_zone);
+ VM_BUG_ON(order >= MAX_ORDER);
+
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
--
1.5.6.5
It is possible with __GFP_THISNODE that no zones are suitable. This
patch makes sure the check is only made once.
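The effect is that the empty-zonelist check sits above the restart label
rather than inside the retry loop, so retries do not repeat it; roughly
(condensed from the hunk below):

    /* the list of zones suitable for gfp_mask - checked exactly once */
    z = zonelist->_zonerefs;
    if (unlikely(!z->zone)) {
        /*
         * Happens if we have an empty zonelist as a result of
         * GFP_THISNODE being used on a memoryless node
         */
        return NULL;
    }

    restart:
    /* allocation attempts loop back to here without re-checking the zonelist */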
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
mm/page_alloc.c | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd87dad..8024abc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1486,9 +1486,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
-restart:
- z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
-
+ /* the list of zones suitable for gfp_mask */
+ z = zonelist->_zonerefs;
if (unlikely(!z->zone)) {
/*
* Happens if we have an empty zonelist as a result of
@@ -1497,6 +1496,7 @@ restart:
return NULL;
}
+restart:
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
if (page)
--
1.5.6.5
Zonelists are normally filtered based on nodemasks for memory policies.
They can additionally be filtered on cpusets, if they exist, as well as
noting when zones are full. These simple checks are expensive enough to
be noticed in profiles. This patch checks in advance whether zonelist
filtering will ever be needed. If not, then the bulk of the checks are
skipped.
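The shape of the change is roughly the following, where the per-zone checks
are only made when they can matter (condensed from the hunks below; the
comments stand in for the unchanged zlc/cpuset/watermark code):

    int zonelist_filter = 0;

    /* Determine in advance if the zonelist needs filtering */
    if ((alloc_flags & ALLOC_CPUSET) && unlikely(number_of_cpusets > 1))
        zonelist_filter = 1;

    for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {
        if (zonelist_filter) {
            /* zonelist_cache and cpuset_zone_allowed_softwall()
             * checks only happen here */
        }
        /* watermark checks and buffered_rmqueue() proceed as before */
    }

With !CONFIG_CPUSETS, number_of_cpusets is defined to 0 and ALLOC_CPUSET to
0x00, so the compiler can discard the filtering branches entirely.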
Signed-off-by: Mel Gorman <[email protected]>
---
include/linux/cpuset.h | 2 ++
mm/page_alloc.c | 37 ++++++++++++++++++++++++++-----------
2 files changed, 28 insertions(+), 11 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 90c6074..6051082 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -83,6 +83,8 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
#else /* !CONFIG_CPUSETS */
+#define number_of_cpusets (0)
+
static inline int cpuset_init_early(void) { return 0; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d815c8f..fe71147 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1139,7 +1139,11 @@ failed:
#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#ifdef CONFIG_CPUSETS
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#else
+#define ALLOC_CPUSET 0x00
+#endif /* CONFIG_CPUSETS */
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -1403,6 +1407,7 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
+ int zonelist_filter = 0;
(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
&preferred_zone);
@@ -1413,6 +1418,10 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
VM_BUG_ON(order >= MAX_ORDER);
+ /* Determine in advance if the zonelist needs filtering */
+ if ((alloc_flags & ALLOC_CPUSET) && unlikely(number_of_cpusets > 1))
+ zonelist_filter = 1;
+
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
@@ -1420,12 +1429,16 @@ zonelist_scan:
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
- if (NUMA_BUILD && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
- continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
+
+ /* Ignore the additional zonelist filter checks if possible */
+ if (zonelist_filter) {
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+ if ((alloc_flags & ALLOC_CPUSET) &&
+ !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ goto try_next_zone;
+ }
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
@@ -1447,13 +1460,15 @@ zonelist_scan:
if (page)
break;
this_zone_full:
- if (NUMA_BUILD)
+ if (NUMA_BUILD && zonelist_filter)
zlc_mark_zone_full(zonelist, z);
try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup) {
- /* we do zlc_setup after the first zone is tried */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
+ if (NUMA_BUILD && zonelist_filter) {
+ if (!did_zlc_setup) {
+ /* do zlc_setup after the first zone is tried */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ }
did_zlc_setup = 1;
}
}
--
1.5.6.5
The core of the page allocator is one giant function which allocates memory
on the stack and makes calculations that may not be needed for every
allocation. This patch breaks up the allocator path into fast and slow
paths for clarity. Note the slow paths are still inlined, but the entry is
marked unlikely. If they were not inlined, text size would actually increase
as there is only one call site.
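The resulting entry point ends up with the following shape (condensed from
the final hunk of this patch; error injection and the empty-zonelist check
are omitted here):

    struct page *
    __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
    {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct page *page;

        might_sleep_if(gfp_mask & __GFP_WAIT);

        /* Fast path: one pass over the zonelist at the low watermark */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);

        /* kswapd wakeup, reclaim, OOM and retry logic all live here */
        if (unlikely(!page))
            page = __alloc_pages_slowpath(gfp_mask, order,
                    zonelist, high_zoneidx, nodemask);

        return page;
    }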
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 348 ++++++++++++++++++++++++++++++++++---------------------
1 files changed, 218 insertions(+), 130 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8024abc..7ba7705 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1462,45 +1462,171 @@ try_next_zone:
return page;
}
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
+static inline int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+ unsigned long pages_reclaimed)
{
- const gfp_t wait = gfp_mask & __GFP_WAIT;
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zoneref *z;
- struct zone *zone;
- struct page *page;
- struct reclaim_state reclaim_state;
- struct task_struct *p = current;
- int do_retry;
- int alloc_flags;
- unsigned long did_some_progress;
- unsigned long pages_reclaimed = 0;
+ /* Do not loop if specifically requested */
+ if (gfp_mask & __GFP_NORETRY)
+ return 0;
- might_sleep_if(wait);
+ /*
+ * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+ * means __GFP_NOFAIL, but that may not be true in other
+ * implementations.
+ */
+ if (order <= PAGE_ALLOC_COSTLY_ORDER)
+ return 1;
- if (should_fail_alloc_page(gfp_mask, order))
- return NULL;
+ /*
+ * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+ * specified, then we retry until we no longer reclaim any pages
+ * (above), or we've reclaimed an order of pages at least as
+ * large as the allocation's order. In both cases, if the
+ * allocation still fails, we stop retrying.
+ */
+ if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+ return 1;
- /* the list of zones suitable for gfp_mask */
- z = zonelist->_zonerefs;
- if (unlikely(!z->zone)) {
- /*
- * Happens if we have an empty zonelist as a result of
- * GFP_THISNODE being used on a memoryless node
- */
+ /*
+ * Don't let big-order allocations loop unless the caller
+ * explicitly requests that.
+ */
+ if (gfp_mask & __GFP_NOFAIL)
+ return 1;
+
+ return 0;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask)
+{
+ struct page *page;
+
+ /* Acquire the OOM killer lock for the zones in zonelist */
+ if (!try_set_zone_oom(zonelist, gfp_mask)) {
+ schedule_timeout_uninterruptible(1);
return NULL;
}
-restart:
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ /*
+ * Go through the zonelist yet one more time, keep very high watermark
+ * here, this is only to catch a parallel oom killing, we must fail if
+ * we're still under heavy pressure.
+ */
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+ order, zonelist, high_zoneidx,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET);
if (page)
- goto got_pg;
+ goto out;
+
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto out;
+
+ /* Exhausted what can be done so it's blamo time */
+ out_of_memory(zonelist, gfp_mask, order);
+
+out:
+ clear_zonelist_oom(zonelist, gfp_mask);
+ return page;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+ struct page *page = NULL;
+ struct reclaim_state reclaim_state;
+ struct task_struct *p = current;
+
+ cond_resched();
+
+ /* We now go into synchronous reclaim */
+ cpuset_memory_pressure_bump();
+
+ /*
+ * The task's cpuset might have expanded its set of allowable nodes
+ */
+ cpuset_update_task_memory_state();
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+
+ p->reclaim_state = NULL;
+ p->flags &= ~PF_MEMALLOC;
+
+ cond_resched();
+
+ if (order != 0)
+ drain_all_pages();
+
+ if (likely(*did_some_progress))
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, alloc_flags);
+ return page;
+}
+
+static inline int
+is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
+{
+ if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+ && !in_interrupt())
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ return 1;
+ return 0;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask)
+{
+ struct page *page;
+
+ do {
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+
+ if (!page && gfp_mask & __GFP_NOFAIL)
+ congestion_wait(WRITE, HZ/50);
+ } while (!page && (gfp_mask & __GFP_NOFAIL));
+
+ return page;
+}
+
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+ enum zone_type high_zoneidx)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order);
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask)
+{
+ const gfp_t wait = gfp_mask & __GFP_WAIT;
+ struct page *page = NULL;
+ int alloc_flags;
+ unsigned long pages_reclaimed = 0;
+ unsigned long did_some_progress;
+ struct task_struct *p = current;
/*
* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1513,8 +1639,7 @@ restart:
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order);
+ wake_all_kswapd(order, zonelist, high_zoneidx);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -1534,6 +1659,7 @@ restart:
if (wait)
alloc_flags |= ALLOC_CPUSET;
+restart:
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
* coming from realtime tasks go deeper into reserves.
@@ -1547,118 +1673,47 @@ restart:
if (page)
goto got_pg;
- /* This allocation should allow future memory freeing. */
-
-rebalance:
- if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
- && !in_interrupt()) {
- if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
- /* go through the zonelist yet again, ignoring mins */
- page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
- if (page)
- goto got_pg;
- if (gfp_mask & __GFP_NOFAIL) {
- congestion_wait(WRITE, HZ/50);
- goto nofail_alloc;
- }
- }
- goto nopage;
- }
+ /* Allocate without watermarks if the context allows */
+ if (is_allocation_high_priority(p, gfp_mask))
+ page = __alloc_pages_high_priority(gfp_mask, order,
+ zonelist, high_zoneidx, nodemask);
+ if (page)
+ goto got_pg;
/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;
- cond_resched();
+ /* Try direct reclaim and then allocating */
+ page = __alloc_pages_direct_reclaim(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, &did_some_progress);
+ if (page)
+ goto got_pg;
- /* We now go into synchronous reclaim */
- cpuset_memory_pressure_bump();
/*
- * The task's cpuset might have expanded its set of allowable nodes
+ * If we failed to make any progress reclaiming, then we are
+ * running out of options and have to consider going OOM
*/
- cpuset_update_task_memory_state();
- p->flags |= PF_MEMALLOC;
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
-
- did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
-
- p->reclaim_state = NULL;
- p->flags &= ~PF_MEMALLOC;
-
- cond_resched();
-
- if (order != 0)
- drain_all_pages();
+ if (!did_some_progress) {
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ page = __alloc_pages_may_oom(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask);
+ if (page)
+ goto got_pg;
- if (likely(did_some_progress)) {
- page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx, alloc_flags);
- if (page)
- goto got_pg;
- } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- if (!try_set_zone_oom(zonelist, gfp_mask)) {
- schedule_timeout_uninterruptible(1);
goto restart;
}
-
- /*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
- */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
- order, zonelist, high_zoneidx,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET);
- if (page) {
- clear_zonelist_oom(zonelist, gfp_mask);
- goto got_pg;
- }
-
- /* The OOM killer will not help higher order allocs so fail */
- if (order > PAGE_ALLOC_COSTLY_ORDER) {
- clear_zonelist_oom(zonelist, gfp_mask);
- goto nopage;
- }
-
- out_of_memory(zonelist, gfp_mask, order);
- clear_zonelist_oom(zonelist, gfp_mask);
- goto restart;
}
- /*
- * Don't let big-order allocations loop unless the caller explicitly
- * requests that. Wait for some write requests to complete then retry.
- *
- * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
- * means __GFP_NOFAIL, but that may not be true in other
- * implementations.
- *
- * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
- * specified, then we retry until we no longer reclaim any pages
- * (above), or we've reclaimed an order of pages at least as
- * large as the allocation's order. In both cases, if the
- * allocation still fails, we stop retrying.
- */
+ /* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
- do_retry = 0;
- if (!(gfp_mask & __GFP_NORETRY)) {
- if (order <= PAGE_ALLOC_COSTLY_ORDER) {
- do_retry = 1;
- } else {
- if (gfp_mask & __GFP_REPEAT &&
- pages_reclaimed < (1 << order))
- do_retry = 1;
- }
- if (gfp_mask & __GFP_NOFAIL)
- do_retry = 1;
- }
- if (do_retry) {
+ if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+ /* Wait for some write requests to complete then retry */
congestion_wait(WRITE, HZ/50);
- goto rebalance;
+ goto restart;
}
nopage:
@@ -1671,6 +1726,39 @@ nopage:
}
got_pg:
return page;
+
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
+{
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct page *page;
+
+ might_sleep_if(gfp_mask & __GFP_WAIT);
+
+ if (should_fail_alloc_page(gfp_mask, order))
+ return NULL;
+
+ /*
+ * Check the zones suitable for the gfp_mask contain at least one
+ * valid zone. It's possible to have an empty zonelist as a result
+ * of GFP_THISNODE and a memoryless node
+ */
+ if (unlikely(!zonelist->_zonerefs->zone))
+ return NULL;
+
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ if (unlikely(!page))
+ page = __alloc_pages_slowpath(gfp_mask, order,
+ zonelist, high_zoneidx, nodemask);
+
+ return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
--
1.5.6.5
On low-memory systems, anti-fragmentation gets disabled as there is nothing
it can do and it would just incur overhead shuffling pages between lists
constantly. Currently the check is made in the free page fast path for every
page. This patch moves it to a slow path. On machines with low memory,
there will be a small amount of additional overhead as pages get shuffled
between lists but it should quickly settle.
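After the move, the check happens once when a pageblock's type is recorded
rather than on every lookup in the free path; condensed from the hunks below:

    /* Slow path: stamp the block unmovable when mobility grouping is off... */
    static void set_pageblock_migratetype(struct page *page, int migratetype)
    {
        if (unlikely(page_group_by_mobility_disabled))
            migratetype = MIGRATE_UNMOVABLE;

        set_pageblock_flags_group(page, (unsigned long)migratetype,
                        PB_migrate, PB_migrate_end);
    }

    /* ...so the per-page read in the free fast path no longer needs a branch */
    static inline int get_pageblock_migratetype(struct page *page)
    {
        return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
    }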
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
include/linux/mmzone.h | 3 ---
mm/page_alloc.c | 4 ++++
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1aca6ce..ca000b8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -50,9 +50,6 @@ extern int page_group_by_mobility_disabled;
static inline int get_pageblock_migratetype(struct page *page)
{
- if (unlikely(page_group_by_mobility_disabled))
- return MIGRATE_UNMOVABLE;
-
return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ba7705..d815c8f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -171,6 +171,10 @@ int page_group_by_mobility_disabled __read_mostly;
static void set_pageblock_migratetype(struct page *page, int migratetype)
{
+
+ if (unlikely(page_group_by_mobility_disabled))
+ migratetype = MIGRATE_UNMOVABLE;
+
set_pageblock_flags_group(page, (unsigned long)migratetype,
PB_migrate, PB_migrate_end);
}
--
1.5.6.5
get_page_from_freelist() can be called multiple times for an allocation.
Part of this calculates the preferred_zone which is the first usable
zone in the zonelist. This patch calculates preferred_zone once.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 53 ++++++++++++++++++++++++++++++++---------------------
1 files changed, 32 insertions(+), 21 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe71147..78e1d8e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1398,24 +1398,19 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
+ struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
+ struct zone *preferred_zone)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
- struct zone *zone, *preferred_zone;
+ struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
int zonelist_filter = 0;
- (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
- &preferred_zone);
- if (!preferred_zone)
- return NULL;
-
classzone_idx = zone_idx(preferred_zone);
-
VM_BUG_ON(order >= MAX_ORDER);
/* Determine in advance if the zonelist needs filtering */
@@ -1520,7 +1515,7 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask)
+ nodemask_t *nodemask, struct zone *preferred_zone)
{
struct page *page;
@@ -1537,7 +1532,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*/
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET,
+ preferred_zone);
if (page)
goto out;
@@ -1557,7 +1553,8 @@ out:
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ unsigned long *did_some_progress)
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
@@ -1588,7 +1585,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
if (likely(*did_some_progress))
page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx, alloc_flags);
+ zonelist, high_zoneidx,
+ alloc_flags, preferred_zone);
return page;
}
@@ -1609,13 +1607,14 @@ is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask)
+ nodemask_t *nodemask, struct zone *preferred_zone)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+ zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
+ preferred_zone);
if (!page && gfp_mask & __GFP_NOFAIL)
congestion_wait(WRITE, HZ/50);
@@ -1638,7 +1637,7 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask)
+ nodemask_t *nodemask, struct zone *preferred_zone)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
@@ -1688,14 +1687,15 @@ restart:
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
- high_zoneidx, alloc_flags);
+ high_zoneidx, alloc_flags,
+ preferred_zone);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
if (is_allocation_high_priority(p, gfp_mask))
page = __alloc_pages_high_priority(gfp_mask, order,
- zonelist, high_zoneidx, nodemask);
+ zonelist, high_zoneidx, nodemask, preferred_zone);
if (page)
goto got_pg;
@@ -1707,7 +1707,8 @@ restart:
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
- alloc_flags, &did_some_progress);
+ alloc_flags, preferred_zone,
+ &did_some_progress);
if (page)
goto got_pg;
@@ -1719,7 +1720,7 @@ restart:
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
- nodemask);
+ nodemask, preferred_zone);
if (page)
goto got_pg;
@@ -1756,6 +1757,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zone *preferred_zone;
struct page *page;
might_sleep_if(gfp_mask & __GFP_WAIT);
@@ -1771,11 +1773,20 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ /* The preferred zone is used for statistics later */
+ (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+ &preferred_zone);
+ if (!preferred_zone)
+ return NULL;
+
+ /* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
+ preferred_zone);
if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask);
+ zonelist, high_zoneidx, nodemask,
+ preferred_zone);
return page;
}
--
1.5.6.5
Allocations that specify __GFP_HIGH get the ALLOC_HIGH flag. If these
flags are equal to each other, we can eliminate a branch.
[[email protected]: Suggested the hack]
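Since ALLOC_HIGH is defined as 0x20 and __GFP_HIGH carries the same bit
value, the flag can be transferred with a mask and an OR instead of a
conditional; the VM_BUG_ON() documents (and, under CONFIG_DEBUG_VM,
enforces) that assumption:

    /* Relies on __GFP_HIGH == ALLOC_HIGH; checked under CONFIG_DEBUG_VM */
    VM_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
    alloc_flags |= (gfp_mask & __GFP_HIGH);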
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad26052..1e8b4b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1640,8 +1640,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
- if (gfp_mask & __GFP_HIGH)
- alloc_flags |= ALLOC_HIGH;
+ VM_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+ alloc_flags |= (gfp_mask & __GFP_HIGH);
if (!wait) {
alloc_flags |= ALLOC_HARDER;
--
1.5.6.5
Factor out the mapping between GFP flags and alloc_flags. Once factored
out, it only needs to be calculated once per allocation, but some care must be taken.
[[email protected] says]
As the test:
- if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
- && !in_interrupt()) {
- if (!(gfp_mask & __GFP_NOMEMALLOC)) {
has been replaced with a slightly weaker one:
+ if (alloc_flags & ALLOC_NO_WATERMARKS) {
we need to ensure we don't recurse when PF_MEMALLOC is set.
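In other words, gfp_to_alloc_flags() only grants ALLOC_NO_WATERMARKS when
__GFP_NOMEMALLOC is clear, and the PF_MEMALLOC recursion case is caught
explicitly in the slow path before direct reclaim; condensed from the hunks
below:

    alloc_flags = gfp_to_alloc_flags(gfp_mask);

    /* Allocate without watermarks if the context allows */
    if (alloc_flags & ALLOC_NO_WATERMARKS) {
        page = __alloc_pages_high_priority(gfp_mask, order,
                    zonelist, high_zoneidx, nodemask,
                    preferred_zone, migratetype);
        if (page)
            goto got_pg;
    }

    /* Atomic allocations - we can't balance anything */
    if (!wait)
        goto nopage;

    /* Avoid recursion of direct reclaim */
    if (p->flags & PF_MEMALLOC)
        goto nopage;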
From: Peter Zijlstra <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Acked-by: Pekka Enberg <[email protected]>
---
mm/page_alloc.c | 88 +++++++++++++++++++++++++++++++-----------------------
1 files changed, 50 insertions(+), 38 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8771de3..0558eb4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1593,16 +1593,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return page;
}
-static inline int
-is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
-{
- if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
- && !in_interrupt())
- if (!(gfp_mask & __GFP_NOMEMALLOC))
- return 1;
- return 0;
-}
-
/*
* This is called in the allocator slow-path if the allocation request is of
* sufficient urgency to ignore watermarks and take other desperate measures
@@ -1638,6 +1628,42 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
wakeup_kswapd(zone, order);
}
+static inline int
+gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+ struct task_struct *p = current;
+ int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+ const gfp_t wait = gfp_mask & __GFP_WAIT;
+
+ /*
+ * The caller may dip into page reserves a bit more if the caller
+ * cannot run direct reclaim, or if the caller has realtime scheduling
+ * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+ * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ */
+ if (gfp_mask & __GFP_HIGH)
+ alloc_flags |= ALLOC_HIGH;
+
+ if (!wait) {
+ alloc_flags |= ALLOC_HARDER;
+ /*
+ * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ alloc_flags &= ~ALLOC_CPUSET;
+ } else if (unlikely(rt_task(p)) && !in_interrupt())
+ alloc_flags |= ALLOC_HARDER;
+
+ if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+ if (!in_interrupt() &&
+ ((p->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ }
+
+ return alloc_flags;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -1668,48 +1694,34 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
- *
- * The caller may dip into page reserves a bit more if the caller
- * cannot run direct reclaim, or if the caller has realtime scheduling
- * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags = ALLOC_WMARK_MIN;
- if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
- alloc_flags |= ALLOC_HARDER;
- if (gfp_mask & __GFP_HIGH)
- alloc_flags |= ALLOC_HIGH;
- if (wait)
- alloc_flags |= ALLOC_CPUSET;
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);
restart:
- /*
- * Go through the zonelist again. Let __GFP_HIGH and allocations
- * coming from realtime tasks go deeper into reserves.
- *
- * This is the last chance, in general, before the goto nopage.
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
+ /* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
- high_zoneidx, alloc_flags,
- preferred_zone,
- migratetype);
+ high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, migratetype);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
- if (is_allocation_high_priority(p, gfp_mask))
+ if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = __alloc_pages_high_priority(gfp_mask, order,
- zonelist, high_zoneidx, nodemask, preferred_zone,
- migratetype);
- if (page)
- goto got_pg;
+ zonelist, high_zoneidx, nodemask,
+ preferred_zone, migratetype);
+ if (page)
+ goto got_pg;
+ }
/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;
+ /* Avoid recursion of direct reclaim */
+ if (p->flags & PF_MEMALLOC)
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
--
1.5.6.5
GFP mask is converted into a migratetype when deciding which pagelist to
take a page from. However, this conversion happens multiple times per
allocation, at least once per zone traversed. Calculate it once.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 43 ++++++++++++++++++++++++++-----------------
1 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 78e1d8e..8771de3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1067,13 +1067,13 @@ void split_page(struct page *page, unsigned int order)
* or two.
*/
static struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags)
+ struct zone *zone, int order, gfp_t gfp_flags,
+ int migratetype)
{
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
- int migratetype = allocflags_to_migratetype(gfp_flags);
again:
cpu = get_cpu();
@@ -1399,7 +1399,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone)
+ struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
@@ -1451,7 +1451,8 @@ zonelist_scan:
}
}
- page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
+ page = buffered_rmqueue(preferred_zone, zone, order,
+ gfp_mask, migratetype);
if (page)
break;
this_zone_full:
@@ -1515,7 +1516,8 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone)
+ nodemask_t *nodemask, struct zone *preferred_zone,
+ int migratetype)
{
struct page *page;
@@ -1533,7 +1535,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone);
+ preferred_zone, migratetype);
if (page)
goto out;
@@ -1554,7 +1556,7 @@ static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress)
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
@@ -1586,7 +1588,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
if (likely(*did_some_progress))
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
- alloc_flags, preferred_zone);
+ alloc_flags, preferred_zone,
+ migratetype);
return page;
}
@@ -1607,14 +1610,15 @@ is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone)
+ nodemask_t *nodemask, struct zone *preferred_zone,
+ int migratetype)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone);
+ preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
congestion_wait(WRITE, HZ/50);
@@ -1637,7 +1641,8 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone)
+ nodemask_t *nodemask, struct zone *preferred_zone,
+ int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
@@ -1688,14 +1693,16 @@ restart:
*/
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags,
- preferred_zone);
+ preferred_zone,
+ migratetype);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
if (is_allocation_high_priority(p, gfp_mask))
page = __alloc_pages_high_priority(gfp_mask, order,
- zonelist, high_zoneidx, nodemask, preferred_zone);
+ zonelist, high_zoneidx, nodemask, preferred_zone,
+ migratetype);
if (page)
goto got_pg;
@@ -1708,7 +1715,7 @@ restart:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- &did_some_progress);
+ migratetype, &did_some_progress);
if (page)
goto got_pg;
@@ -1720,7 +1727,8 @@ restart:
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
- nodemask, preferred_zone);
+ nodemask, preferred_zone,
+ migratetype);
if (page)
goto got_pg;
@@ -1759,6 +1767,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
struct page *page;
+ int migratetype = allocflags_to_migratetype(gfp_mask);
might_sleep_if(gfp_mask & __GFP_WAIT);
@@ -1782,11 +1791,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
- preferred_zone);
+ preferred_zone, migratetype);
if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone);
+ preferred_zone, migratetype);
return page;
}
--
1.5.6.5
The GFP mask is checked to see whether __GFP_COLD has been specified when
deciding which end of the PCP lists to use. However, this check happens
multiple times per allocation, at least once per zone traversed. Calculate
it once.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 35 ++++++++++++++++++-----------------
1 files changed, 18 insertions(+), 17 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0558eb4..ad26052 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1068,11 +1068,10 @@ void split_page(struct page *page, unsigned int order)
*/
static struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
+ int migratetype, int cold)
{
unsigned long flags;
struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
again:
@@ -1399,7 +1398,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
+ struct zone *preferred_zone, int migratetype, int cold)
{
struct zoneref *z;
struct page *page = NULL;
@@ -1452,7 +1451,7 @@ zonelist_scan:
}
page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
+ gfp_mask, migratetype, cold);
if (page)
break;
this_zone_full:
@@ -1517,7 +1516,7 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int migratetype, int cold)
{
struct page *page;
@@ -1535,7 +1534,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone, migratetype);
+ preferred_zone, migratetype, cold);
if (page)
goto out;
@@ -1556,7 +1555,7 @@ static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, int cold, unsigned long *did_some_progress)
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
@@ -1589,7 +1588,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags, preferred_zone,
- migratetype);
+ migratetype, cold);
return page;
}
@@ -1601,14 +1600,14 @@ static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int migratetype, int cold)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, migratetype, cold);
if (!page && gfp_mask & __GFP_NOFAIL)
congestion_wait(WRITE, HZ/50);
@@ -1668,7 +1667,7 @@ static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int migratetype, int cold)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
@@ -1701,7 +1700,7 @@ restart:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, migratetype, cold);
if (page)
goto got_pg;
@@ -1709,7 +1708,7 @@ restart:
if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, migratetype, cold);
if (page)
goto got_pg;
}
@@ -1727,7 +1726,8 @@ restart:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ migratetype, cold,
+ &did_some_progress);
if (page)
goto got_pg;
@@ -1740,7 +1740,7 @@ restart:
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
- migratetype);
+ migratetype, cold);
if (page)
goto got_pg;
@@ -1780,6 +1780,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zone *preferred_zone;
struct page *page;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ int cold = gfp_mask & __GFP_COLD;
might_sleep_if(gfp_mask & __GFP_WAIT);
@@ -1803,11 +1804,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
- preferred_zone, migratetype);
+ preferred_zone, migratetype, cold);
if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, migratetype, cold);
return page;
}
--
1.5.6.5
__rmqueue_fallback() is in the slow path but has only one call site. It
actually reduces text if it's inlined.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9f7631e..0ba9e4f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -774,8 +774,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
}
/* Remove an element from the buddy allocator from the fallback list */
-static struct page *__rmqueue_fallback(struct zone *zone, int order,
- int start_migratetype)
+static inline struct page *
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area * area;
int current_order;
--
1.5.6.5
Inline __rmqueue_smallest by altering flow very slightly so that there
is only one call site. This allows the function to be inlined without
additional text bloat.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 23 ++++++++++++++++++-----
1 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1e8b4b6..a3ca80d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -664,7 +664,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
-static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+static inline
+struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
@@ -834,24 +835,36 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
}
}
- /* Use MIGRATE_RESERVE rather than fail an allocation */
- return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
+ return NULL;
}
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order,
+static inline
+struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
struct page *page;
+retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page))
+ if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
page = __rmqueue_fallback(zone, order, migratetype);
+ /*
+ * Use MIGRATE_RESERVE rather than fail an allocation. goto
+ * is used because __rmqueue_smallest is an inline function
+ * and we want just one call site
+ */
+ if (!page) {
+ migratetype = MIGRATE_RESERVE;
+ goto retry_reserve;
+ }
+ }
+
return page;
}
--
1.5.6.5
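For readers following the inlining trick in the patch above, here is a minimal userspace C sketch of the single-call-site retry pattern; the pool names, counts and helpers are invented for illustration and this is not the kernel code itself.
#include <stdio.h>

enum pool_type { POOL_MOVABLE, POOL_RECLAIMABLE, POOL_RESERVE, NR_POOLS };

/* only the reserve pool has anything left in this example */
static int pool_count[NR_POOLS] = { 0, 0, 4 };

/* inlined helper: a single call site keeps the text size down */
static inline int take_from_pool(enum pool_type type)
{
	if (pool_count[type] > 0) {
		pool_count[type]--;
		return 1;		/* success */
	}
	return 0;			/* nothing available */
}

static int allocate(enum pool_type type)
{
retry_reserve:
	if (take_from_pool(type))
		return 1;

	/* fall back to the reserve rather than fail; the goto keeps the
	 * inline helper expanded at exactly one place */
	if (type != POOL_RESERVE) {
		type = POOL_RESERVE;
		goto retry_reserve;
	}
	return 0;
}

int main(void)
{
	printf("allocated: %d\n", allocate(POOL_MOVABLE));
	printf("reserve left: %d\n", pool_count[POOL_RESERVE]);
	return 0;
}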
buffered_rmqueue() is in the fast path, so inline it. Because it has only
one call site, inlining should reduce text size rather than increase it.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3ca80d..9f7631e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1079,7 +1079,8 @@ void split_page(struct page *page, unsigned int order)
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static struct page *buffered_rmqueue(struct zone *preferred_zone,
+static inline
+struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype, int cold)
{
--
1.5.6.5
__rmqueue is inlined in the fast path but it has two call sites, the
low-order and high-order paths. However, a slight modification to the
high-order path reduces __rmqueue to a single call site. This reduces text
at the cost of a slight increase in complexity in the high-order allocation path.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 11 +++++++----
1 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0ba9e4f..795cfc5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1123,11 +1123,14 @@ again:
list_del(&page->lru);
pcp->count--;
} else {
- spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order, migratetype);
- spin_unlock(&zone->lock);
- if (!page)
+ LIST_HEAD(list);
+ local_irq_save(flags);
+
+ /* Calling __rmqueue would bloat text, hence this */
+ if (!rmqueue_bulk(zone, order, 1, &list, migratetype))
goto failed;
+ page = list_entry(list.next, struct page, lru);
+ list_del(&page->lru);
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
--
1.5.6.5
There is a zonelist cache which is used to track zones that are not in
the allowed cpuset or were recently found to be full. This is to reduce cache
footprint on large machines. On smaller machines, it just incurs cost
for no gain. This patch only uses the zonelist cache when there is more
than one online NUMA node.
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
mm/page_alloc.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c4eb295..01cd489 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1442,6 +1442,8 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
/* Determine in advance if the zonelist needs filtering */
if ((alloc_flags & ALLOC_CPUSET) && unlikely(number_of_cpusets > 1))
zonelist_filter = 1;
+ if (num_online_nodes() > 1)
+ zonelist_filter = 1;
zonelist_scan:
/*
@@ -1486,8 +1488,12 @@ this_zone_full:
zlc_mark_zone_full(zonelist, z);
try_next_zone:
if (NUMA_BUILD && zonelist_filter) {
- if (!did_zlc_setup) {
- /* do zlc_setup after the first zone is tried */
+ if (!did_zlc_setup && num_online_nodes() > 1) {
+ /*
+ * do zlc_setup after the first zone is tried
+ * but only if there are multiple nodes to make
+ * it worthwhile
+ */
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
}
--
1.5.6.5
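A small standalone C sketch of the "decide filtering once up front" idea used by the patch above; the data structures and counts are invented, and the point is only that the per-iteration filtering cost vanishes on single-node, single-cpuset configurations.
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 4

static int nr_online_nodes = 1;
static int nr_cpusets = 1;
static bool zone_full[NR_ZONES];

static bool zone_usable(int z, bool filter)
{
	/* the filtering check is only paid for when it can matter */
	if (filter && zone_full[z])
		return false;
	return true;
}

static int scan_zones(void)
{
	/* determine in advance whether the zonelist needs filtering */
	bool filter = (nr_cpusets > 1) || (nr_online_nodes > 1);
	int usable = 0;

	for (int z = 0; z < NR_ZONES; z++)
		if (zone_usable(z, filter))
			usable++;
	return usable;
}

int main(void)
{
	zone_full[2] = true;	/* ignored: no filtering with one node */
	printf("usable zones: %d\n", scan_zones());
	return 0;
}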
free_page_mlock() tests and clears PG_mlocked using locked versions of the
bit operations. If the bit is set, it disables interrupts to update counters,
and this happens on every page free even though interrupts are disabled
again very shortly afterwards anyway. This is wasteful.
This patch splits what free_page_mlock() does. The bit check is still
made. However, the update of counters is delayed until the interrupts are
disabled and the non-lock version for clearing the bit is used. One potential
weirdness with this split is that the counters do not get updated if the
bad_page() check is triggered but a system showing bad pages is getting
screwed already.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/internal.h | 11 +++--------
mm/page_alloc.c | 8 +++++++-
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 478223b..7f775a1 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -155,14 +155,9 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
*/
static inline void free_page_mlock(struct page *page)
{
- if (unlikely(TestClearPageMlocked(page))) {
- unsigned long flags;
-
- local_irq_save(flags);
- __dec_zone_page_state(page, NR_MLOCK);
- __count_vm_event(UNEVICTABLE_MLOCKFREED);
- local_irq_restore(flags);
- }
+ __ClearPageMlocked(page);
+ __dec_zone_page_state(page, NR_MLOCK);
+ __count_vm_event(UNEVICTABLE_MLOCKFREED);
}
#else /* CONFIG_UNEVICTABLE_LRU */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 349c64d..c4eb295 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -498,7 +498,6 @@ static inline void __free_one_page(struct page *page,
static inline int free_pages_check(struct page *page)
{
- free_page_mlock(page);
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(page_count(page) != 0) |
@@ -555,6 +554,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
unsigned long flags;
int i;
int bad = 0;
+ int clearMlocked = PageMlocked(page);
for (i = 0 ; i < (1 << order) ; ++i)
bad += free_pages_check(page + i);
@@ -570,6 +570,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
kernel_map_pages(page, 1 << order, 0);
local_irq_save(flags);
+ if (unlikely(clearMlocked))
+ free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
@@ -1020,6 +1022,7 @@ static void free_hot_cold_page(struct page *page, int cold)
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
+ int clearMlocked = PageMlocked(page);
if (PageAnon(page))
page->mapping = NULL;
@@ -1035,7 +1038,10 @@ static void free_hot_cold_page(struct page *page, int cold)
pcp = &zone_pcp(zone, get_cpu())->pcp;
local_irq_save(flags);
+ if (unlikely(clearMlocked))
+ free_page_mlock(page);
__count_vm_event(PGFREE);
+
if (cold)
list_add_tail(&page->lru, &pcp->list);
else
--
1.5.6.5
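The same split can be sketched in plain C with a pthread mutex standing in for local_irq_save(): the cheap flag test happens before the critical section, and the expensive accounting is folded into the critical section that has to be entered anyway. All names here are invented; this illustrates the pattern, not the kernel code.
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long nr_special;		/* protected by lock */
static long nr_freed;		/* protected by lock */

struct object {
	bool special;		/* plays the role of PG_mlocked */
};

static void free_object(struct object *obj)
{
	/* cheap, lock-free test and non-atomic clear up front */
	bool was_special = obj->special;
	obj->special = false;

	pthread_mutex_lock(&lock);	/* the one critical section */
	if (was_special)
		nr_special--;		/* accounting deferred to here */
	nr_freed++;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct object o = { .special = true };

	nr_special = 1;
	free_object(&o);
	printf("special=%ld freed=%ld\n", nr_special, nr_freed);
	return 0;
}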
get_pageblock_migratetype() is potentially called twice for every page
free: once when the page is freed to the pcp lists and once when it is
freed back to the buddy allocator. When freeing from the pcp lists, the
pageblock type at the time of the free is already known, so use it rather
than rechecking.
In low memory situations under memory pressure, this might skew
anti-fragmentation slightly but the interference is minimal and
decisions that are fragmenting memory are being made anyway.
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
mm/page_alloc.c | 16 ++++++++++------
1 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 795cfc5..349c64d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -455,16 +455,18 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
*/
static inline void __free_one_page(struct page *page,
- struct zone *zone, unsigned int order)
+ struct zone *zone, unsigned int order,
+ int migratetype)
{
unsigned long page_idx;
int order_size = 1 << order;
- int migratetype = get_pageblock_migratetype(page);
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
return;
+ VM_BUG_ON(migratetype == -1);
+
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
VM_BUG_ON(page_idx & (order_size - 1));
@@ -533,17 +535,18 @@ static void free_pages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* have to delete it as __free_one_page list manipulates */
list_del(&page->lru);
- __free_one_page(page, zone, order);
+ __free_one_page(page, zone, order, page_private(page));
}
spin_unlock(&zone->lock);
}
-static void free_one_page(struct zone *zone, struct page *page, int order)
+static void free_one_page(struct zone *zone, struct page *page, int order,
+ int migratetype)
{
spin_lock(&zone->lock);
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
- __free_one_page(page, zone, order);
+ __free_one_page(page, zone, order, migratetype);
spin_unlock(&zone->lock);
}
@@ -568,7 +571,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, order);
+ free_one_page(page_zone(page), page, order,
+ get_pageblock_migratetype(page));
local_irq_restore(flags);
}
--
1.5.6.5
num_online_nodes() is called by the page allocator to decide whether the
zonelist needs to be filtered based on cpusets or the zonelist cache.
This is actually a heavy function and touches a number of cache lines.
This patch stores the number of online nodes at boot time and updates it
when nodes are onlined or offlined.
Signed-off-by: Mel Gorman <[email protected]>
---
include/linux/nodemask.h | 16 ++++++++++++++--
mm/page_alloc.c | 6 ++++--
2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 848025c..4749e30 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -449,13 +449,25 @@ static inline int num_node_state(enum node_states state)
node; \
})
+/* Recorded value for num_online_nodes() */
+extern int static_num_online_nodes;
+
#define num_online_nodes() num_node_state(N_ONLINE)
#define num_possible_nodes() num_node_state(N_POSSIBLE)
#define node_online(node) node_state((node), N_ONLINE)
#define node_possible(node) node_state((node), N_POSSIBLE)
-#define node_set_online(node) node_set_state((node), N_ONLINE)
-#define node_set_offline(node) node_clear_state((node), N_ONLINE)
+static inline void node_set_online(int nid)
+{
+ node_set_state(nid, N_ONLINE);
+ static_num_online_nodes = num_node_state(N_ONLINE);
+}
+
+static inline void node_set_offline(int nid)
+{
+ node_clear_state(nid, N_ONLINE);
+ static_num_online_nodes = num_node_state(N_ONLINE);
+}
#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 01cd489..799e2bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -70,6 +70,7 @@ EXPORT_SYMBOL(node_states);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;
+int static_num_online_nodes __read_mostly;
int percpu_pagelist_fraction;
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -1442,7 +1443,7 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
/* Determine in advance if the zonelist needs filtering */
if ((alloc_flags & ALLOC_CPUSET) && unlikely(number_of_cpusets > 1))
zonelist_filter = 1;
- if (num_online_nodes() > 1)
+ if (static_num_online_nodes > 1)
zonelist_filter = 1;
zonelist_scan:
@@ -1488,7 +1489,7 @@ this_zone_full:
zlc_mark_zone_full(zonelist, z);
try_next_zone:
if (NUMA_BUILD && zonelist_filter) {
- if (!did_zlc_setup && num_online_nodes() > 1) {
+ if (!did_zlc_setup && static_num_online_nodes > 1) {
/*
* do zlc_setup after the first zone is tried
* but only if there are multiple nodes to make
@@ -2645,6 +2646,7 @@ void build_all_zonelists(void)
else
page_group_by_mobility_disabled = 0;
+ static_num_online_nodes = num_node_state(N_ONLINE);
printk("Built %i zonelists in %s order, mobility grouping %s. "
"Total pages: %ld\n",
num_online_nodes(),
--
1.5.6.5
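The caching pattern above can be shown with a few lines of standalone C: the expensive computation (a popcount over a node mask here) runs only when the state changes, and hot-path readers just load a plain integer. The mask and helpers are stand-ins, not the kernel's nodemask API; __builtin_popcountll is a gcc builtin.
#include <stdio.h>

static unsigned long long online_mask;	/* one bit per node */
static int cached_nr_online;		/* plays the role of static_num_online_nodes */

static int count_online(void)
{
	/* the "heavy" computation, done only on state changes */
	return __builtin_popcountll(online_mask);
}

static void node_set_online(int nid)
{
	online_mask |= 1ULL << nid;
	cached_nr_online = count_online();	/* refresh the cache */
}

static void node_set_offline(int nid)
{
	online_mask &= ~(1ULL << nid);
	cached_nr_online = count_online();
}

int main(void)
{
	node_set_online(0);
	node_set_online(3);
	node_set_offline(0);

	/* fast-path read: no recount, just a load */
	printf("online nodes: %d\n", cached_nr_online);
	return 0;
}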
When pages are being freed to the buddy allocator, the zone
NR_FREE_PAGES counter must be updated. In the case of bulk per-cpu page
freeing, it is updated once per page. This retouches cache lines more
than necessary. Update the counter once per per-cpu bulk free instead.
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
mm/page_alloc.c | 12 ++++++------
1 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 21affd4..98ce091 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -460,7 +460,6 @@ static inline void __free_one_page(struct page *page,
int migratetype)
{
unsigned long page_idx;
- int order_size = 1 << order;
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
@@ -470,10 +469,9 @@ static inline void __free_one_page(struct page *page,
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
- VM_BUG_ON(page_idx & (order_size - 1));
+ VM_BUG_ON(page_idx & ((1 << order) - 1));
VM_BUG_ON(bad_range(zone, page));
- __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
while (order < MAX_ORDER-1) {
unsigned long combined_idx;
struct page *buddy;
@@ -528,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
spin_lock(&zone->lock);
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
+
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count);
while (count--) {
struct page *page;
@@ -546,6 +546,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
spin_lock(&zone->lock);
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
+
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
spin_unlock(&zone->lock);
}
@@ -690,7 +692,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
- __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
expand(zone, page, order, current_order, area, migratetype);
return page;
}
@@ -830,8 +831,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
- __mod_zone_page_state(zone, NR_FREE_PAGES,
- -(1UL << order));
if (current_order == pageblock_order)
set_pageblock_migratetype(page,
@@ -905,6 +904,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
set_page_private(page, migratetype);
list = &page->lru;
}
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return i;
}
--
1.5.6.5
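The batching idea above reduces to a few lines of plain C: the shared counter is adjusted once per bulk operation instead of once per item inside the loop. The counter here is an ordinary variable; in the kernel it is a per-zone vmstat counter whose cache line is what the patch avoids retouching.
#include <stdio.h>

#define BATCH 16

static long nr_free;	/* shared counter; updated under the zone lock in the kernel */

static void free_one(int item)
{
	/* return the item to a free structure; the counter is not touched here */
	(void)item;
}

static void free_bulk(const int *items, int count)
{
	/* a single counter update covers the whole batch */
	nr_free += count;

	for (int i = 0; i < count; i++)
		free_one(items[i]);
}

int main(void)
{
	int items[BATCH] = { 0 };

	free_bulk(items, BATCH);
	printf("nr_free = %ld\n", nr_free);
	return 0;
}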
ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determine whether
pages_min, pages_low or pages_high is used as the zone watermark when
allocating pages. Two branches in the allocator hotpath determine which
watermark to use. This patch places the three watermarks in a union with
an array so that the watermark bits of the allocation flags can be used
directly as an array index, reducing the branches taken.
Signed-off-by: Mel Gorman <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
include/linux/mmzone.h | 8 +++++++-
mm/page_alloc.c | 18 ++++++++----------
2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ca000b8..c20c662 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -275,7 +275,13 @@ struct zone_reclaim_stat {
struct zone {
/* Fields commonly accessed by the page allocator */
- unsigned long pages_min, pages_low, pages_high;
+ union {
+ struct {
+ unsigned long pages_min, pages_low, pages_high;
+ };
+ unsigned long pages_mark[3];
+ };
+
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18465cd..21affd4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1160,10 +1160,13 @@ failed:
return NULL;
}
-#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
-#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
-#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
-#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
+/* The WMARK bits are used as an index zone->pages_mark */
+#define ALLOC_WMARK_MIN 0x00 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW 0x01 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH 0x02 /* use pages_high watermark */
+#define ALLOC_NO_WATERMARKS 0x08 /* don't check watermarks at all */
+#define ALLOC_WMARK_MASK 0x07 /* Mask to get the watermark bits */
+
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#ifdef CONFIG_CPUSETS
@@ -1466,12 +1469,7 @@ zonelist_scan:
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
- if (alloc_flags & ALLOC_WMARK_MIN)
- mark = zone->pages_min;
- else if (alloc_flags & ALLOC_WMARK_LOW)
- mark = zone->pages_low;
- else
- mark = zone->pages_high;
+ mark = zone->pages_mark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags)) {
if (!zone_reclaim_mode ||
--
1.5.6.5
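A standalone C sketch of the union-as-array trick used in the patch above (struct and macro names invented, values arbitrary): the named watermark fields and the array alias the same storage, so the watermark bits of the allocation flags index the array directly and the if/else chain disappears.
#include <stdio.h>

#define WMARK_MIN	0x00
#define WMARK_LOW	0x01
#define WMARK_HIGH	0x02
#define WMARK_MASK	0x07

struct zone_marks {
	union {
		struct {
			unsigned long pages_min, pages_low, pages_high;
		};
		unsigned long pages_mark[3];
	};
};

int main(void)
{
	struct zone_marks z = {
		.pages_min = 10, .pages_low = 20, .pages_high = 30
	};
	int alloc_flags = WMARK_LOW;	/* as chosen by the caller */

	/* branchless selection of the watermark */
	unsigned long mark = z.pages_mark[alloc_flags & WMARK_MASK];

	printf("selected watermark: %lu\n", mark);	/* prints 20 */
	return 0;
}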
A number of sanity checks are made on each page allocation and free,
including that the page count is zero. page_count() checks for
compound pages and, if the page is compound, checks the count of the head
page. However, in these paths we do not care whether the page is compound
or not, as the count of each tail page should also be zero.
This patch makes two changes to the use of page_count() in the free path. It
converts one check of page_count() to a VM_BUG_ON() as the count should
have been unconditionally checked earlier in the free path. It also avoids
checking for compound pages.
[[email protected]: Wrote changelog]
Signed-off-by: Nick Piggin <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
---
mm/page_alloc.c | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 799e2bf..18465cd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -425,7 +425,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
if (PageBuddy(buddy) && page_order(buddy) == order) {
- BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON(page_count(buddy) != 0);
return 1;
}
return 0;
@@ -501,7 +501,7 @@ static inline int free_pages_check(struct page *page)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
- (page_count(page) != 0) |
+ (atomic_read(&page->_count) != 0) |
(page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
bad_page(page);
return 1;
@@ -646,7 +646,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
- (page_count(page) != 0) |
+ (atomic_read(&page->_count) != 0) |
(page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
bad_page(page);
return 1;
--
1.5.6.5
Every page allocation uses gfp_zone() to calculate the highest zone
allowed by a combination of GFP flags. This is a large number of branches
to have in a fast path. This patch replaces the branches with a lookup
table that is calculated at boot-time and stored in the read-mostly section
so it can be shared. This requires __GFP_MOVABLE to be redefined, but it is
debatable whether it should be considered a zone modifier or not.
Signed-off-by: Mel Gorman <[email protected]>
---
include/linux/gfp.h | 28 +++++++++++-----------------
init/main.c | 1 +
mm/page_alloc.c | 36 +++++++++++++++++++++++++++++++++++-
3 files changed, 47 insertions(+), 18 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 59eb093..581f8a9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -16,6 +16,10 @@ struct vm_area_struct;
* Do not put any conditional on these. If necessary modify the definitions
* without the underscores and use the consistently. The definitions here may
* be used in bit comparisons.
+ *
+ * Note that __GFP_MOVABLE uses the next available bit but it is not
+ * a zone modifier. It uses the fourth bit so that the calculation of
+ * gfp_zone() can use a table rather than a series of comparisons
*/
#define __GFP_DMA ((__force gfp_t)0x01u)
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
@@ -50,7 +54,7 @@ struct vm_area_struct;
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
-#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */
+#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */
#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -77,6 +81,9 @@ struct vm_area_struct;
#define GFP_THISNODE ((__force gfp_t)0)
#endif
+/* This is a mask of all modifiers affecting gfp_zonemask() */
+#define GFP_ZONEMASK (__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32 | __GFP_MOVABLE)
+
/* This mask makes up all the page movable related flags */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
@@ -112,24 +119,11 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags)
((gfp_flags & __GFP_RECLAIMABLE) != 0);
}
+extern int gfp_zone_table[GFP_ZONEMASK];
+void init_gfp_zone_table(void);
static inline enum zone_type gfp_zone(gfp_t flags)
{
-#ifdef CONFIG_ZONE_DMA
- if (flags & __GFP_DMA)
- return ZONE_DMA;
-#endif
-#ifdef CONFIG_ZONE_DMA32
- if (flags & __GFP_DMA32)
- return ZONE_DMA32;
-#endif
- if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
- (__GFP_HIGHMEM | __GFP_MOVABLE))
- return ZONE_MOVABLE;
-#ifdef CONFIG_HIGHMEM
- if (flags & __GFP_HIGHMEM)
- return ZONE_HIGHMEM;
-#endif
- return ZONE_NORMAL;
+ return gfp_zone_table[flags & GFP_ZONEMASK];
}
/*
diff --git a/init/main.c b/init/main.c
index 8442094..08a5663 100644
--- a/init/main.c
+++ b/init/main.c
@@ -573,6 +573,7 @@ asmlinkage void __init start_kernel(void)
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
+ init_gfp_zone_table();
build_all_zonelists();
page_alloc_init();
printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98ce091..f71091a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -70,6 +70,7 @@ EXPORT_SYMBOL(node_states);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;
+int gfp_zone_table[GFP_ZONEMASK] __read_mostly;
int static_num_online_nodes __read_mostly;
int percpu_pagelist_fraction;
@@ -4569,7 +4570,7 @@ static void setup_per_zone_inactive_ratio(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
-static int __init init_per_zone_pages_min(void)
+static int init_per_zone_pages_min(void)
{
unsigned long lowmem_kbytes;
@@ -4587,6 +4588,39 @@ static int __init init_per_zone_pages_min(void)
}
module_init(init_per_zone_pages_min)
+static inline int __init gfp_flags_to_zone(gfp_t flags)
+{
+#ifdef CONFIG_ZONE_DMA
+ if (flags & __GFP_DMA)
+ return ZONE_DMA;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ if (flags & __GFP_DMA32)
+ return ZONE_DMA32;
+#endif
+ if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
+ (__GFP_HIGHMEM | __GFP_MOVABLE))
+ return ZONE_MOVABLE;
+#ifdef CONFIG_HIGHMEM
+ if (flags & __GFP_HIGHMEM)
+ return ZONE_HIGHMEM;
+#endif
+ return ZONE_NORMAL;
+}
+
+/*
+ * For each possible combination of zone modifier flags, we calculate
+ * what zone it should be using. This consumes a cache line in most
+ * cases but avoids a number of branches in the allocator fast path
+ */
+void __init init_gfp_zone_table(void)
+{
+ gfp_t gfp_flags;
+
+ for (gfp_flags = 0; gfp_flags < GFP_ZONEMASK; gfp_flags++)
+ gfp_zone_table[gfp_flags] = gfp_flags_to_zone(gfp_flags);
+}
+
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
--
1.5.6.5
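The branch-to-table transformation in the patch above can be demonstrated in a standalone C program (flag values mirror the patch, zone names are illustrative): the decision tree runs once at start-up to fill the table, and the lookup itself becomes a single indexed load.
#include <stdio.h>

#define GFP_DMA		0x01u
#define GFP_HIGHMEM	0x02u
#define GFP_DMA32	0x04u
#define GFP_MOVABLE	0x08u
#define GFP_ZONEMASK	(GFP_DMA | GFP_HIGHMEM | GFP_DMA32 | GFP_MOVABLE)

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE };

static enum zone_type zone_table[GFP_ZONEMASK + 1];

/* the old decision tree, now run only once at start-up */
static enum zone_type flags_to_zone(unsigned int flags)
{
	if (flags & GFP_DMA)
		return ZONE_DMA;
	if (flags & GFP_DMA32)
		return ZONE_DMA32;
	if ((flags & (GFP_HIGHMEM | GFP_MOVABLE)) == (GFP_HIGHMEM | GFP_MOVABLE))
		return ZONE_MOVABLE;
	if (flags & GFP_HIGHMEM)
		return ZONE_HIGHMEM;
	return ZONE_NORMAL;
}

static void init_zone_table(void)
{
	for (unsigned int f = 0; f <= GFP_ZONEMASK; f++)
		zone_table[f] = flags_to_zone(f);
}

/* fast path: one load, no branches */
static inline enum zone_type gfp_zone(unsigned int flags)
{
	return zone_table[flags & GFP_ZONEMASK];
}

int main(void)
{
	init_zone_table();
	printf("HIGHMEM|MOVABLE -> zone %d\n", gfp_zone(GFP_HIGHMEM | GFP_MOVABLE));
	return 0;
}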
Re-sort the GFP flags after __GFP_MOVABLE got redefined so that how the
bits are used is a bit clearer.
From: Peter Zijlstra <[email protected]>
Signed-off-by: Mel Gorman <[email protected]>
---
include/linux/gfp.h | 9 +++++----
1 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 581f8a9..8f7d176 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -25,6 +25,8 @@ struct vm_area_struct;
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
#define __GFP_DMA32 ((__force gfp_t)0x04u)
+#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */
+
/*
* Action modifiers - doesn't change the zoning
*
@@ -50,11 +52,10 @@ struct vm_area_struct;
#define __GFP_NORETRY ((__force gfp_t)0x1000u)/* See above */
#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */
#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */
-#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
-#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
-#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
+#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
+#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_THISNODE ((__force gfp_t)0x40000u) /* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
-#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */
#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
--
1.5.6.5
Local interrupts are disabled when freeing pages to the PCP list. Part
of that free checks the migratetype of the pageblock the page is in,
but it does so with interrupts disabled. This patch performs the
pageblock type lookup with interrupts enabled. The impact is that a
page may be freed to the wrong list when a pageblock changes type at
the same time, but as that block is then already considered mixed from
an anti-fragmentation perspective, it is not of vital importance.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f71091a..ca7bc04 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1039,6 +1039,7 @@ static void free_hot_cold_page(struct page *page, int cold)
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp;
+ set_page_private(page, get_pageblock_migratetype(page));
local_irq_save(flags);
if (unlikely(clearMlocked))
free_page_mlock(page);
@@ -1048,7 +1049,6 @@ static void free_hot_cold_page(struct page *page, int cold)
list_add_tail(&page->lru, &pcp->list);
else
list_add(&page->lru, &pcp->list);
- set_page_private(page, get_pageblock_migratetype(page));
pcp->count++;
if (pcp->count >= pcp->high) {
free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
--
1.5.6.5
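The shape of the change, sketched in plain C with a pthread mutex standing in for disabled interrupts (all names invented): the classification lookup moves out of the critical section and its possibly slightly stale result is stashed with the object.
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;
static int pcp_count;

struct item {
	int priv;		/* plays the role of page_private */
};

static int classify(const struct item *it)
{
	/* stands in for get_pageblock_migratetype(); not free to call */
	(void)it;
	return 1;
}

static void free_to_pcp(struct item *it)
{
	/* lookup done before the "interrupts off" region */
	it->priv = classify(it);

	pthread_mutex_lock(&pcp_lock);	/* critical section stays short */
	pcp_count++;
	pthread_mutex_unlock(&pcp_lock);
}

int main(void)
{
	struct item it = { 0 };

	free_to_pcp(&it);
	printf("pcp_count=%d type=%d\n", pcp_count, it.priv);
	return 0;
}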
Reviewed-by: Christoph Lameter <[email protected]>
Reviewed-by: Christoph Lameter <[email protected]>
On Mon, 16 Mar 2009, Mel Gorman wrote:
> +int gfp_zone_table[GFP_ZONEMASK] __read_mostly;
The gfp_zone_table is compile time determinable. There is no need to
calculate it.
const int gfp_zone_table[GFP_ZONEMASK] = {
ZONE_NORMAL, /* 00 No flags set */
ZONE_DMA, /* 01 Only GFP_DMA set */
ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
ZONE_DMA, /* 03 GFP_HIGHMEM and GFP_DMA set */
ZONE_DMA32, /* 04 Only GFP_DMA32 set */
ZONE_DMA, /* 05 GFP_DMA and GFP_DMA32 set */
ZONE_DMA32, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
ZONE_DMA, /* 07 GFP_DMA, GFP_DMA32 and GFP_HIGHMEM set */
ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
ZONE_DMA, /* 09 MOVABLE + DMA */
ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
ZONE_DMA, /* 0B MOVABLE + DMA + HIGHMEM */
ZONE_DMA32, /* 0C MOVABLE + DMA32 */
ZONE_DMA, /* 0D MOVABLE + DMA + DMA32 */
ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA */
};
Hmmmm... Guess one would need to add some #ifdeffery here to setup
ZONE_NORMAL in cases there is no DMA, DMA32 and HIGHMEM.
Looks also like a good cleanup of the page allocator.
Reviewed-by: Christoph Lameter <[email protected]>
On Mon, Mar 16, 2009 at 03:12:50PM -0400, Christoph Lameter wrote:
> On Mon, 16 Mar 2009, Mel Gorman wrote:
>
> > +int gfp_zone_table[GFP_ZONEMASK] __read_mostly;
>
> The gfp_zone_table is compile time determinable. There is no need to
> calculate it.
>
The cost of calculating it is negligible and the code is then freed later
in boot. Does having a const table make any difference?
> const int gfp_zone_table[GFP_ZONEMASK] = {
> ZONE_NORMAL, /* 00 No flags set */
> ZONE_DMA, /* 01 Only GFP_DMA set */
> ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
> ZONE_DMA, /* 03 GFP_HIGHMEM and GFP_DMA set */
> ZONE_DMA32, /* 04 Only GFP_DMA32 set */
> ZONE_DMA, /* 05 GFP_DMA and GFP_DMA32 set */
> ZONE_DMA32, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
> ZONE_DMA, /* 07 GFP_DMA, GFP_DMA32 and GFP_DMA32 set */
> ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
> ZONE_DMA, /* 09 MOVABLE + DMA */
> ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> ZONE_DMA, /* 0B MOVABLE + DMA + HIGHMEM */
> ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> ZONE_DMA, /* 0D MOVABLE + DMA + DMA32 */
> ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
> ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA
> };
>
> Hmmmm... Guess one would need to add some #ifdeffery here to setup
> ZONE_NORMAL in cases there is no DMA, DMA32 and HIGHMEM.
>
Indeed, as I said, this is somewhat error prone which is why the patch
calculates the table at run-time instead of compile-time trickery.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Wed, 18 Mar 2009, Mel Gorman wrote:
> On Mon, Mar 16, 2009 at 03:12:50PM -0400, Christoph Lameter wrote:
> > On Mon, 16 Mar 2009, Mel Gorman wrote:
> >
> > > +int gfp_zone_table[GFP_ZONEMASK] __read_mostly;
> >
> > The gfp_zone_table is compile time determinable. There is no need to
> > calculate it.
> >
>
> The cost of calculating it is negligible and the code is then freed later
> in boot. Does having a const table make any difference?
Should it not enable the compiler to determine the value at
compile time and therefore make things like gfp_zone(constant) a
constant?
> > const int gfp_zone_table[GFP_ZONEMASK] = {
> > ZONE_NORMAL, /* 00 No flags set */
> > ZONE_DMA, /* 01 Only GFP_DMA set */
> > ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
> > ZONE_DMA, /* 03 GFP_HIGHMEM and GFP_DMA set */
> > ZONE_DMA32, /* 04 Only GFP_DMA32 set */
> > ZONE_DMA, /* 05 GFP_DMA and GFP_DMA32 set */
> > ZONE_DMA32, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
> > ZONE_DMA, /* 07 GFP_DMA, GFP_DMA32 and GFP_DMA32 set */
> > ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
> > ZONE_DMA, /* 09 MOVABLE + DMA */
> > ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> > ZONE_DMA, /* 0B MOVABLE + DMA + HIGHMEM */
> > ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> > ZONE_DMA, /* 0D MOVABLE + DMA + DMA32 */
> > ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
> > ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA
> > };
> >
> > Hmmmm... Guess one would need to add some #ifdeffery here to setup
> > ZONE_NORMAL in cases there is no DMA, DMA32 and HIGHMEM.
> >
>
> Indeed, as I said, this is somewhat error prone which is why the patch
> calculates the table at run-time instead of compile-time trickery.
One would need to define some macros to make it simpler I guess
Write something like
#ifdef CONFIG_ZONE_DMA
#define TZONE_DMA ZONE_DMA
#else
#define TZONE_DMA ZONE_NORMAL
#endif
for each configurable item. Then just add the T to the above table.
On Wed, Mar 18, 2009 at 10:15:26AM -0400, Christoph Lameter wrote:
> On Wed, 18 Mar 2009, Mel Gorman wrote:
>
> > On Mon, Mar 16, 2009 at 03:12:50PM -0400, Christoph Lameter wrote:
> > > On Mon, 16 Mar 2009, Mel Gorman wrote:
> > >
> > > > +int gfp_zone_table[GFP_ZONEMASK] __read_mostly;
> > >
> > > The gfp_zone_table is compile time determinable. There is no need to
> > > calculate it.
> > >
> >
> > The cost of calculating it is negligible and the code is then freed later
> > in boot. Does having a const table make any difference?
>
> Should it not enable the compiler to determine the value at
> compile time and therefore make things like gfp_zone(constant) a
> constant?
>
Yeah, you're right. I didn't think it would, but a test program showed
that code accessing const tables like this is calculated at compile
time.
> > > const int gfp_zone_table[GFP_ZONEMASK] = {
> > > ZONE_NORMAL, /* 00 No flags set */
> > > ZONE_DMA, /* 01 Only GFP_DMA set */
> > > ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
> > > ZONE_DMA, /* 03 GFP_HIGHMEM and GFP_DMA set */
> > > ZONE_DMA32, /* 04 Only GFP_DMA32 set */
> > > ZONE_DMA, /* 05 GFP_DMA and GFP_DMA32 set */
> > > ZONE_DMA32, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
> > > ZONE_DMA, /* 07 GFP_DMA, GFP_DMA32 and GFP_DMA32 set */
> > > ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
> > > ZONE_DMA, /* 09 MOVABLE + DMA */
> > > ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> > > ZONE_DMA, /* 0B MOVABLE + DMA + HIGHMEM */
> > > ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> > > ZONE_DMA, /* 0D MOVABLE + DMA + DMA32 */
> > > ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
> > > ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA
> > > };
> > >
> > > Hmmmm... Guess one would need to add some #ifdeffery here to setup
> > > ZONE_NORMAL in cases there is no DMA, DMA32 and HIGHMEM.
> > >
> >
> > Indeed, as I said, this is somewhat error prone which is why the patch
> > calculates the table at run-time instead of compile-time trickery.
>
> One would need to define some macros to make it simpler I guess
>
> Write something like
>
> #ifdef CONFIG_ZONE_DMA
> #define TZONE_DMA ZONE_DMA
> #else
> #define TZONE_DMA ZONE_NORMAL
> #endif
>
> for each configurable item. Then just add the T to the above table.
>
If you don't mind, I'd like to postpone writing such a patch until a second
or third pass at improving the allocator. I don't think I'll have the time
in the short-term to put together a const-initialised-table patch that will
definitely be correct.
Alternatively, I can drop this patch entirely from the set.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
> time.
>
> > > > const int gfp_zone_table[GFP_ZONEMASK] = {
> > > > ZONE_NORMAL, /* 00 No flags set */
> > > > ZONE_DMA, /* 01 Only GFP_DMA set */
> > > > ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
> > > > ZONE_DMA, /* 03 GFP_HIGHMEM and GFP_DMA set */
> > > > ZONE_DMA32, /* 04 Only GFP_DMA32 set */
> > > > ZONE_DMA, /* 05 GFP_DMA and GFP_DMA32 set */
> > > > ZONE_DMA32, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
> > > > ZONE_DMA, /* 07 GFP_DMA, GFP_DMA32 and GFP_DMA32 set */
> > > > ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
> > > > ZONE_DMA, /* 09 MOVABLE + DMA */
> > > > ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> > > > ZONE_DMA, /* 0B MOVABLE + DMA + HIGHMEM */
> > > > ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> > > > ZONE_DMA, /* 0D MOVABLE + DMA + DMA32 */
> > > > ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
> > > > ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA
> > > > };
> > > >
> > > > Hmmmm... Guess one would need to add some #ifdeffery here to setup
> > > > ZONE_NORMAL in cases there is no DMA, DMA32 and HIGHMEM.
> > > >
> > >
> > > Indeed, as I said, this is somewhat error prone which is why the patch
> > > calculates the table at run-time instead of compile-time trickery.
> >
> > One would need to define some macros to make it simpler I guess
> >
> > Write something like
> >
> > #ifdef CONFIG_ZONE_DMA
> > #define TZONE_DMA ZONE_DMA
> > #else
> > #define TZONE_DMA ZONE_NORMAL
> > #endif
> >
> > for each configurable item. Then just add the T to the above table.
> >
>
> If you don't mind, I'd like to postpone writing such a patch until a second
> or third pass at improving the allocator. I don't think I'll have the time
> in the short-term to put together a const-initialised-table patch that will
> definitily be correct.
>
> Alternatively, I can drop this patch entirely from the set.
>
>
Let me give it a shot:
Note that there is a slight buggyness in the current implementation of
gfp_zone. If you set both GFP_DMA32 and GFP_HIGHMEM and the arch does not
support GFP_DMA32 then gfp_zone returns GFP_HIGHMEM which may result in
memory being allocated that cannot be used for I/O.
This version here returns GFP_NORMAL which is more correct.
#ifdef CONFIG_ZONE_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif
#ifdef CONFIG_ZONE_DMA
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
#endif
#ifdef CONFIG_ZONE_DMA32
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 OPT_ZONE_DMA
#endif
const int gfp_zone_table[GFP_ZONEMASK] = {
ZONE_NORMAL, /* 00 No flags set */
OPT_ZONE_DMA, /* 01 GFP_DMA */
OPT_ZONE_HIGHMEM, /* 02 GFP_HIGHMEM */
OPT_ZONE_DMA, /* 03 GFP_HIGHMEM GFP_DMA */
OPT_ZONE_DMA32, /* 04 GFP_DMA32 */
OPT_ZONE_DMA, /* 05 GFP_DMA32 GFP_DMA */
OPT_ZONE_DMA32, /* 06 GFP_DMA32 GFP_HIGHMEM */
OPT_ZONE_DMA, /* 07 GFP_DMA32 GFP_HIGHMEM GFP_DMA */
ZONE_NORMAL, /* 08 ZONE_MOVABLE */
OPT_ZONE_DMA, /* 09 MOVABLE + DMA */
ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
OPT_ZONE_DMA, /* 0B MOVABLE + HIGHMEM + DMA */
OPT_ZONE_DMA32, /* 0C MOVABLE + DMA32 */
OPT_ZONE_DMA, /* 0D MOVABLE + DMA32 + DMA */
OPT_ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
OPT_ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA */
};
On Wed, Mar 18, 2009 at 01:21:30PM -0400, Christoph Lameter wrote:
>
>
> > time.
> >
> > > > > const int gfp_zone_table[GFP_ZONEMASK] = {
> > > > > ZONE_NORMAL, /* 00 No flags set */
> > > > > ZONE_DMA, /* 01 Only GFP_DMA set */
> > > > > ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
> > > > > ZONE_DMA, /* 03 GFP_HIGHMEM and GFP_DMA set */
> > > > > ZONE_DMA32, /* 04 Only GFP_DMA32 set */
> > > > > ZONE_DMA, /* 05 GFP_DMA and GFP_DMA32 set */
> > > > > ZONE_DMA32, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
> > > > > ZONE_DMA, /* 07 GFP_DMA, GFP_DMA32 and GFP_DMA32 set */
> > > > > ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
> > > > > ZONE_DMA, /* 09 MOVABLE + DMA */
> > > > > ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> > > > > ZONE_DMA, /* 0B MOVABLE + DMA + HIGHMEM */
> > > > > ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> > > > > ZONE_DMA, /* 0D MOVABLE + DMA + DMA32 */
> > > > > ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
> > > > > ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA
> > > > > };
> > > > >
> > > > > Hmmmm... Guess one would need to add some #ifdeffery here to setup
> > > > > ZONE_NORMAL in cases there is no DMA, DMA32 and HIGHMEM.
> > > > >
> > > >
> > > > Indeed, as I said, this is somewhat error prone which is why the patch
> > > > calculates the table at run-time instead of compile-time trickery.
> > >
> > > One would need to define some macros to make it simpler I guess
> > >
> > > Write something like
> > >
> > > #ifdef CONFIG_ZONE_DMA
> > > #define TZONE_DMA ZONE_DMA
> > > #else
> > > #define TZONE_DMA ZONE_NORMAL
> > > #endif
> > >
> > > for each configurable item. Then just add the T to the above table.
> > >
> >
> > If you don't mind, I'd like to postpone writing such a patch until a second
> > or third pass at improving the allocator. I don't think I'll have the time
> > in the short-term to put together a const-initialised-table patch that will
> > definitily be correct.
> >
> > Alternatively, I can drop this patch entirely from the set.
> >
> >
>
> Let me give it a shot:
>
> Note that there is a slight buggyness in the current implementation of
> gfp_zone. If you set both GFP_DMA32 and GFP_HIGHMEM and the arch does not
> support GFP_DMA32 then gfp_zone returns GFP_HIGHMEM which may result in
> memory being allocated that cannot be used for I/O.
>
> This version here returns GFP_NORMAL which is more correct.
>
>
> #ifdef CONFIG_ZONE_HIGHMEM
> #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
> #else
> #define OPT_ZONE_HIGHMEM ZONE_NORMAL
> #endif
>
> #ifdef CONFIG_ZONE_DMA
> #define OPT_ZONE_DMA ZONE_DMA
> #else
> #define OPT_ZONE_DMA ZONE_NORMAL
> #endif
>
> #ifdef CONFIG_ZONE_DMA32
> #define OPT_ZONE_DMA32 ZONE_DMA32
> #else
> #define OPT_ZONE_DMA32 OPT_ZONE_DMA
> #endif
>
>
> const int gfp_zone_table[GFP_ZONEMASK] = {
> ZONE_NORMAL, /* 00 No flags set */
> OPT_ZONE_DMA, /* 01 GFP_DMA */
> OPT_ZONE_HIGHMEM, /* 02 GFP_HIGHMEM */
> OPT_ZONE_DMA, /* 03 GFP_HIGHMEM GFP_DMA */
> OPT_ZONE_DMA32, /* 04 GFP_DMA32 */
> OPT_ZONE_DMA, /* 05 GFP_DMA32 GFP_DMA */
> OPT_ZONE_DMA32, /* 06 GFP_DMA32 GFP_HIGHMEM */
> OPT_ZONE_DMA, /* 07 GFP_DMA32 GFP_HIGHMEM GFP_DMA */
> ZONE_NORMAL, /* 08 ZONE_MOVABLE */
> OPT_ZONE_DMA, /* 09 MOVABLE + DMA */
> ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> OPT_ZONE_DMA, /* 0B MOVABLE + HIGHMEM + DMA */
> OPT_ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> OPT_ZONE_DMA, /* 0D MOVABLE + DMA32 + DMA */
> OPT_ZONE_DMA32, /* 0E MOVABLE + DMA32 + HIGHMEM */
> OPT_ZONE_DMA /* 0F MOVABLE + DMA32 + HIGHMEM + DMA */
> };
>
Thanks. At a quick glance, it looks OK but I haven't tested it. As the intention
was to get one pass of patches that are not controversial and are "obvious",
I have dropped my version of the gfp_zone patch and the subsequent flag
cleanup and will revisit it after the first lot of patches has been dealt
with. I'm testing again with the remaining patches.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Wed, 18 Mar 2009, Mel Gorman wrote:
> Thanks.At a quick glance, it looks ok but I haven't tested it. As the intention
> was to get one pass of patches that are not controversial and are "obvious",
> I have dropped my version of the gfp_zone patch and the subsequent flag
> cleanup and will revisit it after the first lot of patches has been dealt
> with. I'm testing again with the remaining patches.
This fixes buggy behavior of gfp_zone so it would deserve a higher
priority.
On Wed, Mar 18, 2009 at 03:07:48PM -0400, Christoph Lameter wrote:
> On Wed, 18 Mar 2009, Mel Gorman wrote:
>
> > Thanks.At a quick glance, it looks ok but I haven't tested it. As the intention
> > was to get one pass of patches that are not controversial and are "obvious",
> > I have dropped my version of the gfp_zone patch and the subsequent flag
> > cleanup and will revisit it after the first lot of patches has been dealt
> > with. I'm testing again with the remaining patches.
>
> This fixes buggy behavior of gfp_zone so it would deserve a higher
> priority.
>
It is buggy behaviour in response to a flag combination that makes no sense,
which arguably makes the caller buggy. Now that I think about it a bit more,
you can't define a const table in a header. If it's declared extern, then
the compiler doesn't know what the constant value is so it can't generate
better code. At best, you end up with equivalent code to what my patch did
in the first place except __GFP_DMA32|__GFP_HIGHMEM will return ZONE_NORMAL.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Wed, 18 Mar 2009 19:46:04 +0000
Mel Gorman <[email protected]> wrote:
> On Wed, Mar 18, 2009 at 03:07:48PM -0400, Christoph Lameter wrote:
> > On Wed, 18 Mar 2009, Mel Gorman wrote:
> >
> > > Thanks.At a quick glance, it looks ok but I haven't tested it. As the intention
> > > was to get one pass of patches that are not controversial and are "obvious",
> > > I have dropped my version of the gfp_zone patch and the subsequent flag
> > > cleanup and will revisit it after the first lot of patches has been dealt
> > > with. I'm testing again with the remaining patches.
> >
> > This fixes buggy behavior of gfp_zone so it would deserve a higher
> > priority.
> >
>
> It is buggy behaviour in response to a flag combination that makes no sense
> which arguably is a buggy caller. Now that I get to think about it a bit more,
> you can't define a const table in a header. If it's declared extern, then
> the compiler doesn't know what the constant value is so it can't generate
> better code. At best, you end up with equivalent code to what my patch did
> in the first place except __GFP_DMA32|__GFP_HIGHMEM will return ZONE_NORMAL.
>
I wonder why you have to make the bad caller work in such an insane way?
Is this bad?
==
const int gfp_zone_table[GFP_ZONEMASK] = {
ZONE_NORMAL, /* 00 No flags set */
ZONE_DMA, /* 01 Only GFP_DMA set */
ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
BAD_ZONE, /* 03 GFP_HIGHMEM and GFP_DMA set */
ZONE_DMA32, /* 04 Only GFP_DMA32 set */
BAD_ZONE, /* 05 GFP_DMA and GFP_DMA32 set */
BAD_ZONE, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
BAD_ZONE, /* 07 GFP_DMA, GFP_DMA32 and GFP_DMA32 set */
ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
ZONE_DMA, /* 09 MOVABLE + DMA */
ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
BAD_ZONE, /* 0B MOVABLE + DMA + HIGHMEM */
ZONE_DMA32, /* 0C MOVABLE + DMA32 */
BAD_ZONE, /* 0D MOVABLE + DMA + DMA32 */
BAD_ZONE, /* 0E MOVABLE + DMA32 + HIGHMEM */
BAD_ZONE /* 0F MOVABLE + DMA32 + HIGHMEM + DMA */
};
==
Thanks,
-Kame
> --
> Mel Gorman
> Part-time Phd Student Linux Technology Center
> University of Limerick IBM Dublin Software Lab
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>
On Thu, 19 Mar 2009, KAMEZAWA Hiroyuki wrote:
> I wonder why you have to make the bad caller work insane way ?
> Is this bad ?
> ==
> const int gfp_zone_table[GFP_ZONEMASK] = {
> ZONE_NORMAL, /* 00 No flags set */
> ZONE_DMA, /* 01 Only GFP_DMA set */
> ZONE_HIGHMEM, /* 02 Only GFP_HIGHMEM set */
> BAD_ZONE, /* 03 GFP_HIGHMEM and GFP_DMA set */
> ZONE_DMA32, /* 04 Only GFP_DMA32 set */
> BAD_ZONE, /* 05 GFP_DMA and GFP_DMA32 set */
> BAD_ZONE, /* 06 GFP_DMA32 and GFP_HIGHMEM set */
> BAD_ZONE, /* 07 GFP_DMA, GFP_DMA32 and GFP_DMA32 set */
> ZONE_MOVABLE, /* 08 Only ZONE_MOVABLE set */
> ZONE_DMA, /* 09 MOVABLE + DMA */
> ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> BAD_ZONE, /* 0B MOVABLE + DMA + HIGHMEM */
> ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> BAD_ZONE, /* 0D MOVABLE + DMA + DMA32 */
> BAD_ZONE, /* 0E MOVABLE + DMA32 + HIGHMEM */
> BAD_ZONE /* 0F MOVABLE + DMA32 + HIGHMEM + DMA
> };
> ==
It would work if we could check for BAD_ZONE with a VM_BUG_ON or a
BUILD_BUG_ON. If I get some time I will look into this.
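The BAD_ZONE idea can be tried out in a few lines of standalone C (flag values and zone names invented): invalid flag combinations map to a sentinel table entry and an assert catches any caller that passes such a combination. The build-time variant using __builtin_constant_p discussed in this thread is not reproduced here.
#include <assert.h>
#include <stdio.h>

#define FLAG_DMA	0x1u
#define FLAG_HIGHMEM	0x2u
#define ZONEMASK	0x3u

enum zone { ZONE_NORMAL, ZONE_DMA, ZONE_HIGHMEM, BAD_ZONE };

/* DMA and HIGHMEM together make no sense, so that slot is a sentinel */
static const enum zone zone_table[ZONEMASK + 1] = {
	ZONE_NORMAL,	/* 00 no flags */
	ZONE_DMA,	/* 01 DMA */
	ZONE_HIGHMEM,	/* 02 HIGHMEM */
	BAD_ZONE,	/* 03 DMA|HIGHMEM */
};

static inline enum zone lookup_zone(unsigned int flags)
{
	enum zone zone = zone_table[flags & ZONEMASK];

	assert(zone != BAD_ZONE);	/* VM_BUG_ON-style runtime check */
	return zone;
}

int main(void)
{
	printf("HIGHMEM -> zone %d\n", lookup_zone(FLAG_HIGHMEM));
	/* lookup_zone(FLAG_DMA | FLAG_HIGHMEM) would trip the assert */
	return 0;
}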
On Thu, 19 Mar 2009, Christoph Lameter wrote:
> It would work if we could check for BAD_ZONE with a VM_BUG_ON or a
> BUILD_BUG_ON. If I get some time I will look into this.
Here is such a patch. It boots on my machine and I am working with that kernel
now. There is a slight gcc problem in that the table is likely repeated
for each compilation unit. Anyone know how to fix that?
Subject: Use a table lookup for gfp_zone and check for errors in flags passed to the page allocator
Use a table to lookup the zone to use given gfp_flags using gfp_zone().
This simplifies the code in gfp_zone() and also keeps the ability of the compiler to
use constant folding to get rid of gfp_zone processing.
One problem with this patch is that we define a static const array in gfp.h. This results
in every compilation unit reserving its own space for the array. There must be some
trick to get the compiler to allocate this only once. The contents of the array
must be described in the header file otherwise the compiler will not be able to
determine the value of a lookup in the table.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h 2009-03-19 11:43:32.000000000 -0500
+++ linux-2.6/include/linux/gfp.h 2009-03-19 11:48:38.000000000 -0500
@@ -19,7 +19,8 @@
#define __GFP_DMA ((__force gfp_t)0x01u)
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
#define __GFP_DMA32 ((__force gfp_t)0x04u)
-
+#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */
+#define GFP_ZONEMASK ((__force gfp_t)0x0fu)
/*
* Action modifiers - doesn't change the zoning
*
@@ -49,7 +50,6 @@
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
-#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */
#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -111,24 +111,56 @@
((gfp_flags & __GFP_RECLAIMABLE) != 0);
}
-static inline enum zone_type gfp_zone(gfp_t flags)
-{
+#ifdef CONFIG_ZONE_HIGHMEM
+#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
+#else
+#define OPT_ZONE_HIGHMEM ZONE_NORMAL
+#endif
+
#ifdef CONFIG_ZONE_DMA
- if (flags & __GFP_DMA)
- return ZONE_DMA;
+#define OPT_ZONE_DMA ZONE_DMA
+#else
+#define OPT_ZONE_DMA ZONE_NORMAL
#endif
+
#ifdef CONFIG_ZONE_DMA32
- if (flags & __GFP_DMA32)
- return ZONE_DMA32;
+#define OPT_ZONE_DMA32 ZONE_DMA32
+#else
+#define OPT_ZONE_DMA32 OPT_ZONE_DMA
#endif
- if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
- (__GFP_HIGHMEM | __GFP_MOVABLE))
- return ZONE_MOVABLE;
-#ifdef CONFIG_HIGHMEM
- if (flags & __GFP_HIGHMEM)
- return ZONE_HIGHMEM;
+
+#define BAD_ZONE MAX_NR_ZONES
+
+static const enum zone_type gfp_zone_table[GFP_ZONEMASK + 1] = {
+ ZONE_NORMAL, /* 00 No flags set */
+ OPT_ZONE_DMA, /* 01 GFP_DMA */
+ OPT_ZONE_HIGHMEM, /* 02 GFP_HIGHMEM */
+ BAD_ZONE, /* 03 GFP_HIGHMEM GFP_DMA */
+ OPT_ZONE_DMA32, /* 04 GFP_DMA32 */
+ BAD_ZONE, /* 05 GFP_DMA32 GFP_DMA */
+ BAD_ZONE, /* 06 GFP_DMA32 GFP_HIGHMEM */
+ BAD_ZONE, /* 07 GFP_DMA32 GFP_HIGHMEM GFP_DMA */
+ ZONE_NORMAL, /* 08 ZONE_MOVABLE */
+ OPT_ZONE_DMA, /* 09 MOVABLE + DMA */
+ ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
+ BAD_ZONE, /* 0B MOVABLE + HIGHMEM + DMA */
+ OPT_ZONE_DMA32, /* 0C MOVABLE + DMA32 */
+ BAD_ZONE, /* 0D MOVABLE + DMA32 + DMA */
+ BAD_ZONE, /* 0E MOVABLE + DMA32 + HIGHMEM */
+ BAD_ZONE /* 0F MOVABLE + DMA32 + HIGHMEM + DMA */
+};
+
+static inline enum zone_type gfp_zone(gfp_t flags)
+{
+ enum zone_type zone = gfp_zone_table[flags & 0xf];
+
+ if (__builtin_constant_p(zone))
+ BUILD_BUG_ON(zone == BAD_ZONE);
+#ifdef CONFIG_DEBUG_VM
+ else
+ BUG_ON(zone == BAD_ZONE);
#endif
- return ZONE_NORMAL;
+ return zone;
}
/*
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2009-03-19 11:47:00.000000000 -0500
+++ linux-2.6/include/linux/mmzone.h 2009-03-19 11:47:54.000000000 -0500
@@ -240,7 +240,8 @@
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
- __MAX_NR_ZONES
+ __MAX_NR_ZONES,
+ BAD_ZONE
};
#ifndef __GENERATING_BOUNDS_H
On Thu, Mar 19, 2009 at 12:53:34PM -0400, Christoph Lameter wrote:
> On Thu, 19 Mar 2009, Christoph Lameter wrote:
>
> > It would work if we could check for BAD_ZONE with a VM_BUG_ON or a
> > BUILD_BUG_ON. If I get some time I will look into this.
>
> Here is such a patch. Boots on my machine and working with that kernel
> now. There is a slight gcc problem in that the table is likely repeated
> for each compilation unit. Anyone know how to fix that?
>
I ran into exactly that problem and ended up shoving the table into
page_alloc.c but then there is no benefit from having the table statically
declared because there is no constant folding.
Just to confirm: with your patch, gfp_zone_table does end up in different
compilation units:
$ readelf -s vmlinux | grep gfp_zone_table
5479: c03a9ea0 64 OBJECT LOCAL DEFAULT 5 gfp_zone_table
5537: c03a9f20 64 OBJECT LOCAL DEFAULT 5 gfp_zone_table
5753: c03a9fe0 64 OBJECT LOCAL DEFAULT 5 gfp_zone_table
> Subject: Use a table lookup for gfp_zone and check for errors in flags passed to the page allocator
>
> Use a table to lookup the zone to use given gfp_flags using gfp_zone().
>
> This simplifies the code in gfp_zone() and also keeps the ability of the compiler to
> use constant folding to get rid of gfp_zone processing.
>
> One problem with this patch is that we define a static const array in gfp.h. This results
> in every compilation unit to reserve its own space for the array. There must be some
> trick to get the compiler to allocate this only once. The contents of the array
> must be described in the header file otherwise the compiler will not be able to
> determine the value of a lookup in the table.
>
Yep, that is exactly the problem I hit but I didn't find a suitable answer.
> Signed-off-by: Christoph Lameter <[email protected]>
>
> Index: linux-2.6/include/linux/gfp.h
> ===================================================================
> --- linux-2.6.orig/include/linux/gfp.h 2009-03-19 11:43:32.000000000 -0500
> +++ linux-2.6/include/linux/gfp.h 2009-03-19 11:48:38.000000000 -0500
> @@ -19,7 +19,8 @@
> #define __GFP_DMA ((__force gfp_t)0x01u)
> #define __GFP_HIGHMEM ((__force gfp_t)0x02u)
> #define __GFP_DMA32 ((__force gfp_t)0x04u)
> -
> +#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */
> +#define GFP_ZONEMASK ((__force gfp_t)0x0fu)
To avoid magic number syndrome, you could define GFP_ZONEMASK as
(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32 | __GFP_MOVABLE)
> /*
> * Action modifiers - doesn't change the zoning
> *
> @@ -49,7 +50,6 @@
> #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
> #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
> #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
> -#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */
>
> #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
> #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
> @@ -111,24 +111,56 @@
> ((gfp_flags & __GFP_RECLAIMABLE) != 0);
> }
>
> -static inline enum zone_type gfp_zone(gfp_t flags)
> -{
> +#ifdef CONFIG_ZONE_HIGHMEM
> +#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
> +#else
> +#define OPT_ZONE_HIGHMEM ZONE_NORMAL
> +#endif
> +
> #ifdef CONFIG_ZONE_DMA
> - if (flags & __GFP_DMA)
> - return ZONE_DMA;
> +#define OPT_ZONE_DMA ZONE_DMA
> +#else
> +#define OPT_ZONE_DMA ZONE_NORMAL
> #endif
> +
> #ifdef CONFIG_ZONE_DMA32
> - if (flags & __GFP_DMA32)
> - return ZONE_DMA32;
> +#define OPT_ZONE_DMA32 ZONE_DMA32
> +#else
> +#define OPT_ZONE_DMA32 OPT_ZONE_DMA
> #endif
> - if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
> - (__GFP_HIGHMEM | __GFP_MOVABLE))
> - return ZONE_MOVABLE;
> -#ifdef CONFIG_HIGHMEM
> - if (flags & __GFP_HIGHMEM)
> - return ZONE_HIGHMEM;
> +
> +#define BAD_ZONE MAX_NR_ZONES
> +
> +static const enum zone_type gfp_zone_table[GFP_ZONEMASK + 1] = {
> + ZONE_NORMAL, /* 00 No flags set */
> + OPT_ZONE_DMA, /* 01 GFP_DMA */
> + OPT_ZONE_HIGHMEM, /* 02 GFP_HIGHMEM */
> + BAD_ZONE, /* 03 GFP_HIGHMEM GFP_DMA */
> + OPT_ZONE_DMA32, /* 04 GFP_DMA32 */
> + BAD_ZONE, /* 05 GFP_DMA32 GFP_DMA */
> + BAD_ZONE, /* 06 GFP_DMA32 GFP_HIGHMEM */
> + BAD_ZONE, /* 07 GFP_DMA32 GFP_HIGHMEM GFP_DMA */
> + ZONE_NORMAL, /* 08 ZONE_MOVABLE */
> + OPT_ZONE_DMA, /* 09 MOVABLE + DMA */
> + ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */
> + BAD_ZONE, /* 0B MOVABLE + HIGHMEM + DMA */
> + OPT_ZONE_DMA32, /* 0C MOVABLE + DMA32 */
> + BAD_ZONE, /* 0D MOVABLE + DMA32 + DMA */
> + BAD_ZONE, /* 0E MOVABLE + DMA32 + HIGHMEM */
> + BAD_ZONE /* 0F MOVABLE + DMA32 + HIGHMEM + DMA */
> +};
> +
> +static inline enum zone_type gfp_zone(gfp_t flags)
> +{
> + enum zone_type zone = gfp_zone_table[flags & 0xf];
> +
This should be flags & GFP_ZONEMASK here rather than the magic 0xf.
> + if (__builtin_constant_p(zone))
> + BUILD_BUG_ON(zone == BAD_ZONE);
> +#ifdef CONFIG_DEBUG_VM
> + else
> + BUG_ON(zone == BAD_ZONE);
> #endif
That could be made a bit prettier with
if (__builtin_constant_p(zone))
BUILD_BUG_ON(zone == BAD_ZONE);
VM_BUG_ON(zone == BAD_ZONE);
> - return ZONE_NORMAL;
> + return zone;
> }
>
> /*
> Index: linux-2.6/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mmzone.h 2009-03-19 11:47:00.000000000 -0500
> +++ linux-2.6/include/linux/mmzone.h 2009-03-19 11:47:54.000000000 -0500
> @@ -240,7 +240,8 @@
> ZONE_HIGHMEM,
> #endif
> ZONE_MOVABLE,
> - __MAX_NR_ZONES
> + __MAX_NR_ZONES,
> + BAD_ZONE
> };
>
> #ifndef __GENERATING_BOUNDS_H
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Thu, 19 Mar 2009, Mel Gorman wrote:
> I ran into exactly that problem and ended up shoving the table into
> page_alloc.c, but then there is no benefit from having the table statically
> declared because there is no constant folding.
Right. The table must be defined in the .h file. Just a matter of figuring
out how to convince the compiler/linker to do the right thing.
> > + if (__builtin_constant_p(zone))
> > + BUILD_BUG_ON(zone == BAD_ZONE);
> > +#ifdef CONFIG_DEBUG_VM
> > + else
> > + BUG_ON(zone == BAD_ZONE);
> > #endif
>
> That could be made a bit prettier with
>
> if (__builtin_constant_p(zone))
> BUILD_BUG_ON(zone == BAD_ZONE);
> VM_BUG_ON(zone == BAD_ZONE);
VM_BUG_ON is not available here. It has to be that ugly.
Some macros can get us around the problems:
Subject: Use a table lookup for gfp_zone and check for errors in flags passed to the page allocator
Use a table in gfp_zone() to look up the zone to use for a given set of gfp flags.
This simplifies the code in gfp_zone() and also preserves the compiler's ability
to constant-fold away the gfp_zone() processing.
We use some macro tricks here to convince the compiler to do the constant
folding whenever possible.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h 2009-03-19 11:43:32.000000000 -0500
+++ linux-2.6/include/linux/gfp.h 2009-03-19 13:32:48.000000000 -0500
@@ -19,7 +19,8 @@
#define __GFP_DMA ((__force gfp_t)0x01u)
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
#define __GFP_DMA32 ((__force gfp_t)0x04u)
-
+#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */
+#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
/*
* Action modifiers - doesn't change the zoning
*
@@ -49,7 +50,6 @@
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
-#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */
#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -111,24 +111,63 @@
((gfp_flags & __GFP_RECLAIMABLE) != 0);
}
-static inline enum zone_type gfp_zone(gfp_t flags)
-{
+#ifdef CONFIG_ZONE_HIGHMEM
+#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
+#else
+#define OPT_ZONE_HIGHMEM ZONE_NORMAL
+#endif
+
#ifdef CONFIG_ZONE_DMA
- if (flags & __GFP_DMA)
- return ZONE_DMA;
+#define OPT_ZONE_DMA ZONE_DMA
+#else
+#define OPT_ZONE_DMA ZONE_NORMAL
#endif
+
#ifdef CONFIG_ZONE_DMA32
- if (flags & __GFP_DMA32)
- return ZONE_DMA32;
+#define OPT_ZONE_DMA32 ZONE_DMA32
+#else
+#define OPT_ZONE_DMA32 OPT_ZONE_DMA
#endif
- if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
- (__GFP_HIGHMEM | __GFP_MOVABLE))
- return ZONE_MOVABLE;
-#ifdef CONFIG_HIGHMEM
- if (flags & __GFP_HIGHMEM)
- return ZONE_HIGHMEM;
+
+#define GFP_ZONE_TABLE \
+const enum zone_type gfp_zone_table[GFP_ZONEMASK + 1] = { \
+ ZONE_NORMAL, /* 00 No flags set */ \
+ OPT_ZONE_DMA, /* 01 GFP_DMA */ \
+ OPT_ZONE_HIGHMEM, /* 02 GFP_HIGHMEM */ \
+ BAD_ZONE, /* 03 GFP_HIGHMEM GFP_DMA */ \
+ OPT_ZONE_DMA32, /* 04 GFP_DMA32 */ \
+ BAD_ZONE, /* 05 GFP_DMA32 GFP_DMA */ \
+ BAD_ZONE, /* 06 GFP_DMA32 GFP_HIGHMEM */ \
+ BAD_ZONE, /* 07 GFP_DMA32 GFP_HIGHMEM GFP_DMA */ \
+ ZONE_NORMAL, /* 08 ZONE_MOVABLE */ \
+ OPT_ZONE_DMA, /* 09 MOVABLE + DMA */ \
+ ZONE_MOVABLE, /* 0A MOVABLE + HIGHMEM */ \
+ BAD_ZONE, /* 0B MOVABLE + HIGHMEM + DMA */ \
+ OPT_ZONE_DMA32, /* 0C MOVABLE + DMA32 */ \
+ BAD_ZONE, /* 0D MOVABLE + DMA32 + DMA */ \
+ BAD_ZONE, /* 0E MOVABLE + DMA32 + HIGHMEM */ \
+ BAD_ZONE /* 0F MOVABLE + DMA32 + HIGHMEM + DMA */\
+};
+
+extern const enum zone_type gfp_zone_table[GFP_ZONEMASK + 1];
+
+static inline enum zone_type gfp_zone(gfp_t flags)
+{
+
+ if (__builtin_constant_p(flags)) {
+ GFP_ZONE_TABLE
+ enum zone_type zone = gfp_zone_table[flags & GFP_ZONEMASK];
+
+ BUILD_BUG_ON(zone == BAD_ZONE);
+ return zone;
+ } else {
+
+ enum zone_type zone = gfp_zone_table[flags & GFP_ZONEMASK];
+#ifdef CONFIG_DEBUG_VM
+ BUG_ON(zone == BAD_ZONE);
#endif
- return ZONE_NORMAL;
+ return zone;
+ }
}
/*
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2009-03-19 11:47:00.000000000 -0500
+++ linux-2.6/include/linux/mmzone.h 2009-03-19 11:47:54.000000000 -0500
@@ -240,7 +240,8 @@
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
- __MAX_NR_ZONES
+ __MAX_NR_ZONES,
+ BAD_ZONE
};
#ifndef __GENERATING_BOUNDS_H
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2009-03-19 13:28:35.000000000 -0500
+++ linux-2.6/mm/page_alloc.c 2009-03-19 13:32:21.000000000 -0500
@@ -67,6 +67,9 @@
};
EXPORT_SYMBOL(node_states);
+GFP_ZONE_TABLE
+EXPORT_SYMBOL(gfp_zone_table);
+
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;
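To illustrate what the GFP_ZONE_TABLE trick above is aiming for, here is a
small standalone userspace sketch of the same idea (simplified, and with
made-up names: a static local table stands in for the block-scope
GFP_ZONE_TABLE expansion, and shared_table for the copy in page_alloc.c):

	/* sketch only -- gcc-specific because of __builtin_constant_p() */
	#include <stdio.h>

	#define TABLE_INIT { 10, 11, 12, 13 }

	/* the single out-of-line copy, analogous to the one in page_alloc.c */
	const int shared_table[4] = TABLE_INIT;

	static inline int table_lookup(unsigned int idx)
	{
		if (__builtin_constant_p(idx)) {
			/* the initialiser is visible here, so for a
			 * constant index the load can be folded away */
			static const int local_table[4] = TABLE_INIT;
			return local_table[idx & 3];
		}
		/* runtime path: a real load from the shared table */
		return shared_table[idx & 3];
	}

	int main(int argc, char **argv)
	{
		printf("%d\n", table_lookup(2));                  /* foldable */
		printf("%d\n", table_lookup((unsigned int)argc)); /* runtime */
		return 0;
	}

Built with gcc -O2, the first call should reduce to a constant while the
second goes through shared_table; that is the same split the
__builtin_constant_p() branch in gfp_zone() is trying to get, with the added
intent that a BAD_ZONE result for a constant flag combination is caught by
the BUILD_BUG_ON().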