LinuxLists.cc - [RFC 1/7] cpuset write dirty map

2007-06-01 06:04:26

by Ethan Solomita

[permalink] [raw]

Subject: [RFC 1/7] cpuset write dirty map

Add a dirty map to struct address_space

In a NUMA system it is helpful to know where the dirty pages of a mapping
are located. That way we will be able to implement writeout for applications
that are constrained to a portion of the memory of the system as required by
cpusets.

This patch implements the management of dirty node maps for an address
space through the following functions:

cpuset_clear_dirty_nodes(mapping) Clear the map of dirty nodes

cpuset_update_dirty_nodes(mapping, page) Record a node in the dirty nodes map

cpuset_init_dirty_nodes(mapping) First time init of the map

The dirty map may be stored either directly in the mapping (for NUMA
systems with no more than BITS_PER_LONG nodes) or separately allocated
for systems with a large number of nodes (e.g. IA64 with 1024 nodes).

Updating the dirty map may involve allocating it first for large
configurations. Therefore we protect the allocation and setting
of a node in the map through the tree_lock. The tree_lock is
already taken when a page is dirtied so there is no additional
locking overhead if we insert the updating of the nodemask there.

The dirty map is only cleared (or freed) when the inode is cleared.
At that point no pages are attached to the inode anymore and therefore it can
be done without any locking. The dirty map therefore records all nodes that
have been used for dirty pages by that inode until the inode is no longer
used.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 0/fs/buffer.c 1/fs/buffer.c
--- 0/fs/buffer.c 2007-05-29 17:42:07.000000000 -0700
+++ 1/fs/buffer.c 2007-05-29 17:44:33.000000000 -0700
@@ -41,6 +41,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/cpuset.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

@@ -710,6 +711,7 @@ static int __set_page_dirty(struct page
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
+ cpuset_update_dirty_nodes(mapping, page);
write_unlock_irq(&mapping->tree_lock);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

diff -uprN -X 0/Documentation/dontdiff 0/fs/fs-writeback.c 1/fs/fs-writeback.c
--- 0/fs/fs-writeback.c 2007-05-29 17:42:07.000000000 -0700
+++ 1/fs/fs-writeback.c 2007-05-29 18:13:48.000000000 -0700
@@ -22,6 +22,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
+#include <linux/cpuset.h>
#include "internal.h"

int sysctl_inode_debug __read_mostly;
@@ -483,6 +484,12 @@ int generic_sync_sb_inodes(struct super_
continue; /* blockdev has wrong queue */
}

+ if (!cpuset_intersects_dirty_nodes(mapping, wbc->nodes)) {
+ /* No pages on the nodes under writeback */
+ redirty_head(inode);
+ continue;
+ }
+
/* Was this inode dirtied after sync_sb_inodes was called? */
if (time_after(inode->dirtied_when, start))
break;
diff -uprN -X 0/Documentation/dontdiff 0/fs/inode.c 1/fs/inode.c
--- 0/fs/inode.c 2007-05-29 17:42:07.000000000 -0700
+++ 1/fs/inode.c 2007-05-29 17:44:33.000000000 -0700
@@ -22,6 +22,7 @@
#include <linux/bootmem.h>
#include <linux/inotify.h>
#include <linux/mount.h>
+#include <linux/cpuset.h>

/*
* This is needed for the following functions:
@@ -148,6 +149,7 @@ static struct inode *alloc_inode(struct
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
+ cpuset_init_dirty_nodes(mapping);

/*
* If the block_device provides a backing_dev_info for client
@@ -255,6 +257,7 @@ void clear_inode(struct inode *inode)
bd_forget(inode);
if (S_ISCHR(inode->i_mode) && inode->i_cdev)
cd_forget(inode);
+ cpuset_clear_dirty_nodes(inode->i_mapping);
inode->i_state = I_CLEAR;
}

diff -uprN -X 0/Documentation/dontdiff 0/include/linux/cpuset.h 1/include/linux/cpuset.h
--- 0/include/linux/cpuset.h 2007-05-29 17:40:07.000000000 -0700
+++ 1/include/linux/cpuset.h 2007-05-29 17:44:33.000000000 -0700
@@ -75,6 +75,45 @@ static inline int cpuset_do_slab_mem_spr

extern void cpuset_track_online_nodes(void);

+/*
+ * We need macros since struct address_space is not defined yet
+ */
+#if MAX_NUMNODES <= BITS_PER_LONG
+#define cpuset_update_dirty_nodes(__mapping, __page) \
+ do { \
+ int node = page_to_nid(__page); \
+ if (!node_isset(node, (__mapping)->dirty_nodes)) \
+ node_set(node, (__mapping)->dirty_nodes); \
+ } while (0)
+
+#define cpuset_clear_dirty_nodes(__mapping) \
+ (__mapping)->dirty_nodes = NODE_MASK_NONE
+
+#define cpuset_init_dirty_nodes(__mapping) \
+ (__mapping)->dirty_nodes = NODE_MASK_NONE
+
+#define cpuset_intersects_dirty_nodes(__mapping, __nodemask_ptr) \
+ (!(__nodemask_ptr) || \
+ nodes_intersects((__mapping)->dirty_nodes, \
+ *(__nodemask_ptr)))
+
+#else
+
+#define cpuset_init_dirty_nodes(__mapping) \
+ (__mapping)->dirty_nodes = NULL
+
+struct address_space;
+
+extern void cpuset_update_dirty_nodes(struct address_space *a,
+ struct page *p);
+
+extern void cpuset_clear_dirty_nodes(struct address_space *a);
+
+extern int cpuset_intersects_dirty_nodes(struct address_space *a,
+ nodemask_t *mask);
+
+#endif
+
#else /* !CONFIG_CPUSETS */

static inline int cpuset_init_early(void) { return 0; }
@@ -146,6 +185,26 @@ static inline int cpuset_do_slab_mem_spr

static inline void cpuset_track_online_nodes(void) {}

+struct address_space;
+
+static inline void cpuset_update_dirty_nodes(struct address_space *a,
+ struct page *p) {}
+
+static inline void cpuset_clear_dirty_nodes(struct address_space *a) {}
+
+static inline void cpuset_init_dirty_nodes(struct address_space *a) {}
+
+static inline int cpuset_dirty_node_set(struct inode *i, int node)
+{
+ return 1;
+}
+
+static inline int cpuset_intersects_dirty_nodes(struct address_space *a,
+ nodemask_t *n)
+{
+ return 1;
+}
+
#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */
diff -uprN -X 0/Documentation/dontdiff 0/include/linux/fs.h 1/include/linux/fs.h
--- 0/include/linux/fs.h 2007-05-29 17:42:07.000000000 -0700
+++ 1/include/linux/fs.h 2007-05-29 17:44:33.000000000 -0700
@@ -468,6 +468,13 @@ struct address_space {
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
+#ifdef CONFIG_CPUSETS
+#if MAX_NUMNODES <= BITS_PER_LONG
+ nodemask_t dirty_nodes; /* nodes with dirty pages */
+#else
+ nodemask_t *dirty_nodes; /* pointer to map if dirty */
+#endif
+#endif
} __attribute__((aligned(sizeof(long))));
/*
* On most architectures that alignment is already the case; but
diff -uprN -X 0/Documentation/dontdiff 0/include/linux/writeback.h 1/include/linux/writeback.h
--- 0/include/linux/writeback.h 2007-05-29 17:42:08.000000000 -0700
+++ 1/include/linux/writeback.h 2007-05-30 11:20:16.000000000 -0700
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned range_cyclic:1; /* range_start is cyclic */

void *fs_private; /* For use by ->writepages() */
+ nodemask_t *nodes; /* Set of nodes of interest */
};

/*
diff -uprN -X 0/Documentation/dontdiff 0/kernel/cpuset.c 1/kernel/cpuset.c
--- 0/kernel/cpuset.c 2007-05-29 17:42:08.000000000 -0700
+++ 1/kernel/cpuset.c 2007-05-29 17:44:33.000000000 -0700
@@ -4,7 +4,7 @@
* Processor and Memory placement constraints for sets of tasks.
*
* Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ * Copyright (C) 2004-2007 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
@@ -12,6 +12,7 @@
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
+ * 2007 Cpuset writeback by Christoph Lameter.
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
@@ -2482,6 +2483,63 @@ int cpuset_mem_spread_node(void)
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

+#if MAX_NUMNODES > BITS_PER_LONG
+
+/*
+ * Special functions for NUMA systems with a large number of nodes.
+ * The nodemask is pointed to from the address space structures.
+ * The attachment of the dirty_node mask is protected by the
+ * tree_lock. The nodemask is freed only when the inode is cleared
+ * (and therefore unused, thus no locking necessary).
+ */
+void cpuset_update_dirty_nodes(struct address_space *mapping,
+ struct page *page)
+{
+ nodemask_t *nodes = mapping->dirty_nodes;
+ int node = page_to_nid(page);
+
+ if (!nodes) {
+ nodes = kmalloc(sizeof(nodemask_t), GFP_ATOMIC);
+ if (!nodes)
+ return;
+
+ *nodes = NODE_MASK_NONE;
+ mapping->dirty_nodes = nodes;
+ }
+
+ if (!node_isset(node, *nodes))
+ node_set(node, *nodes);
+}
+
+void cpuset_clear_dirty_nodes(struct address_space *mapping)
+{
+ nodemask_t *nodes = mapping->dirty_nodes;
+
+ if (nodes) {
+ mapping->dirty_nodes = NULL;
+ kfree(nodes);
+ }
+}
+
+/*
+ * Called without the tree_lock. The nodemask is only freed when the inode
+ * is cleared and therefore this is safe.
+ */
+int cpuset_intersects_dirty_nodes(struct address_space *mapping,
+ nodemask_t *mask)
+{
+ nodemask_t *dirty_nodes = mapping->dirty_nodes;
+
+ if (!mask)
+ return 1;
+
+ if (!dirty_nodes)
+ return 0;
+
+ return nodes_intersects(*dirty_nodes, *mask);
+}
+#endif
+
/**
* cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
* @p: pointer to task_struct of some other task.
diff -uprN -X 0/Documentation/dontdiff 0/mm/page-writeback.c 1/mm/page-writeback.c
--- 0/mm/page-writeback.c 2007-05-29 17:42:08.000000000 -0700
+++ 1/mm/page-writeback.c 2007-05-29 17:44:33.000000000 -0700
@@ -33,6 +33,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <linux/cpuset.h>

/*
* The maximum number of pages to writeout in a single bdflush/kupdate
@@ -834,6 +835,7 @@ int __set_page_dirty_nobuffers(struct pa
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
+ cpuset_update_dirty_nodes(mapping, page);
write_unlock_irq(&mapping->tree_lock);
if (mapping->host) {
/* !PageAnon && !swapper_space */

2007-06-01 06:11:21

by Ethan Solomita

[permalink] [raw]

Subject: [RFC 2/7] cpuset write pdflush nodemask

pdflush: Allow the passing of a nodemask parameter

If we want to support nodeset specific writeout then we need a way
to communicate the set of nodes that an operation should affect.

So add a nodemask_t parameter to the pdflush functions and also
store the nodemask in the pdflush control structure.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 1/fs/buffer.c 2/fs/buffer.c
--- 1/fs/buffer.c 2007-05-29 17:44:33.000000000 -0700
+++ 2/fs/buffer.c 2007-05-30 11:31:22.000000000 -0700
@@ -359,7 +359,7 @@ static void free_more_memory(void)
struct zone **zones;
pg_data_t *pgdat;

- wakeup_pdflush(1024);
+ wakeup_pdflush(1024, NULL);
yield();

for_each_online_pgdat(pgdat) {
diff -uprN -X 0/Documentation/dontdiff 1/fs/super.c 2/fs/super.c
--- 1/fs/super.c 2007-05-29 17:43:00.000000000 -0700
+++ 2/fs/super.c 2007-05-30 11:31:22.000000000 -0700
@@ -615,7 +615,7 @@ int do_remount_sb(struct super_block *sb
return 0;
}

-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(unsigned long foo, nodemask_t *bar)
{
struct super_block *sb;

@@ -643,7 +643,7 @@ static void do_emergency_remount(unsigne

void emergency_remount(void)
{
- pdflush_operation(do_emergency_remount, 0);
+ pdflush_operation(do_emergency_remount, 0, NULL);
}

/*
diff -uprN -X 0/Documentation/dontdiff 1/fs/sync.c 2/fs/sync.c
--- 1/fs/sync.c 2007-05-29 17:43:00.000000000 -0700
+++ 2/fs/sync.c 2007-05-30 11:31:22.000000000 -0700
@@ -21,9 +21,9 @@
* sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
*/
-static void do_sync(unsigned long wait)
+static void do_sync(unsigned long wait, nodemask_t *unused)
{
- wakeup_pdflush(0);
+ wakeup_pdflush(0, NULL);
sync_inodes(0); /* All mappings, inodes and their blockdevs */
DQUOT_SYNC(NULL);
sync_supers(); /* Write the superblocks */
@@ -38,13 +38,13 @@ static void do_sync(unsigned long wait)

asmlinkage long sys_sync(void)
{
- do_sync(1);
+ do_sync(1, NULL);
return 0;
}

void emergency_sync(void)
{
- pdflush_operation(do_sync, 0);
+ pdflush_operation(do_sync, 0, NULL);
}

/*
diff -uprN -X 0/Documentation/dontdiff 1/include/linux/writeback.h 2/include/linux/writeback.h
--- 1/include/linux/writeback.h 2007-05-30 11:20:16.000000000 -0700
+++ 2/include/linux/writeback.h 2007-05-30 11:31:22.000000000 -0700
@@ -86,7 +86,7 @@ static inline void wait_on_inode(struct
/*
* mm/page-writeback.c
*/
-int wakeup_pdflush(long nr_pages);
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
void laptop_io_completion(void);
void laptop_sync_completion(void);
void throttle_vm_writeout(gfp_t gfp_mask);
@@ -117,7 +117,8 @@ balance_dirty_pages_ratelimited(struct a
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
void *data);

-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *nodes),
+ unsigned long arg0, nodemask_t *nodes);
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int write_cache_pages(struct address_space *mapping,
diff -uprN -X 0/Documentation/dontdiff 1/mm/page-writeback.c 2/mm/page-writeback.c
--- 1/mm/page-writeback.c 2007-05-29 17:44:33.000000000 -0700
+++ 2/mm/page-writeback.c 2007-05-30 11:31:22.000000000 -0700
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */

-static void background_writeout(unsigned long _min_pages);
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes);

/*
* Work out the current dirty-memory clamping and background writeout
@@ -272,7 +272,7 @@ static void balance_dirty_pages(struct a
*/
if ((laptop_mode && pages_written) ||
(!laptop_mode && (nr_reclaimable > background_thresh)))
- pdflush_operation(background_writeout, 0);
+ pdflush_operation(background_writeout, 0, NULL);
}

void set_page_dirty_balance(struct page *page)
@@ -362,7 +362,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
-static void background_writeout(unsigned long _min_pages)
+static void background_writeout(unsigned long _min_pages, nodemask_t *unused)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
@@ -402,12 +402,12 @@ static void background_writeout(unsigned
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/
-int wakeup_pdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes)
{
if (nr_pages == 0)
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
+ return pdflush_operation(background_writeout, nr_pages, nodes);
}

static void wb_timer_fn(unsigned long unused);
@@ -431,7 +431,7 @@ static DEFINE_TIMER(laptop_mode_wb_timer
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
-static void wb_kupdate(unsigned long arg)
+static void wb_kupdate(unsigned long arg, nodemask_t *unused)
{
unsigned long oldest_jif;
unsigned long start_jif;
@@ -491,18 +491,18 @@ int dirty_writeback_centisecs_handler(ct

static void wb_timer_fn(unsigned long unused)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
+ if (pdflush_operation(wb_kupdate, 0, NULL) < 0)
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}

-static void laptop_flush(unsigned long unused)
+static void laptop_flush(unsigned long unused, nodemask_t *unused2)
{
sys_sync();
}

static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ pdflush_operation(laptop_flush, 0, NULL);
}

/*
diff -uprN -X 0/Documentation/dontdiff 1/mm/pdflush.c 2/mm/pdflush.c
--- 1/mm/pdflush.c 2007-05-29 17:43:00.000000000 -0700
+++ 2/mm/pdflush.c 2007-05-30 11:31:22.000000000 -0700
@@ -83,10 +83,12 @@ static unsigned long last_empty_jifs;
*/
struct pdflush_work {
struct task_struct *who; /* The thread */
- void (*fn)(unsigned long); /* A callback function */
+ void (*fn)(unsigned long, nodemask_t *); /* A callback function */
unsigned long arg0; /* An argument to the callback */
struct list_head list; /* On pdflush_list, when idle */
unsigned long when_i_went_to_sleep;
+ int have_nodes; /* Nodes were specified */
+ nodemask_t nodes; /* Nodes of interest */
};

static int __pdflush(struct pdflush_work *my_work)
@@ -123,7 +125,8 @@ static int __pdflush(struct pdflush_work
}
spin_unlock_irq(&pdflush_lock);

- (*my_work->fn)(my_work->arg0);
+ (*my_work->fn)(my_work->arg0,
+ my_work->have_nodes ? &my_work->nodes : NULL);

/*
* Thread creation: For how long have there been zero
@@ -197,7 +200,8 @@ static int pdflush(void *dummy)
* Returns zero if it indeed managed to find a worker thread, and passed your
* payload to it.
*/
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *),
+ unsigned long arg0, nodemask_t *nodes)
{
unsigned long flags;
int ret = 0;
@@ -217,6 +221,11 @@ int pdflush_operation(void (*fn)(unsigne
last_empty_jifs = jiffies;
pdf->fn = fn;
pdf->arg0 = arg0;
+ if (nodes) {
+ pdf->nodes = *nodes;
+ pdf->have_nodes = 1;
+ } else
+ pdf->have_nodes = 0;
wake_up_process(pdf->who);
spin_unlock_irqrestore(&pdflush_lock, flags);
}
diff -uprN -X 0/Documentation/dontdiff 1/mm/vmscan.c 2/mm/vmscan.c
--- 1/mm/vmscan.c 2007-05-29 17:43:00.000000000 -0700
+++ 2/mm/vmscan.c 2007-05-30 11:31:22.000000000 -0700
@@ -1198,7 +1198,7 @@ unsigned long try_to_free_pages(struct z
*/
if (total_scanned > sc.swap_cluster_max +
sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned, NULL);
sc.may_writepage = 1;
}

2007-06-01 06:12:31

by Ethan Solomita

[permalink] [raw]

Subject: [RFC 3/7] cpuset write throttle

Make page writeback obey cpuset constraints

Currently dirty throttling does not work properly in a cpuset.

If, for example, a cpuset contains only 1/10th of available memory then all of
the memory of a cpuset can be dirtied without any writes being triggered.
If all of the cpuset's memory is dirty then only 10% of total memory is dirty.
The background writeback threshold is usually set at 10% and the synchronous
threshold at 40%. So we are still below the global limits while the dirty
ratio in the cpuset is 100%! Writeback throttling and background writeout
do not work at all in such scenarios.

This patch makes dirty writeout cpuset aware. When determining the
dirty limits in get_dirty_limits() we calculate values based on the
nodes that are reachable from the current process (that has been
dirtying the page). Then we can trigger writeout based on the
dirty ratio of the memory in the cpuset.

We trigger writeout in a cpuset-specific way. We go through the dirty
inodes and search for inodes that have dirty pages on the nodes of the
active cpuset. If an inode fulfills that requirement then we begin writeout
of the dirty pages of that inode.

Adding up all the counters for each node in a cpuset may seem to be quite
an expensive operation (in particular for large cpusets with hundreds of
nodes) compared to just accessing the global counters if we do not have
a cpuset. However, please remember that the global counters were only
introduced recently. Before 2.6.18 we did add up per processor
counters for each processor on each invocation of get_dirty_limits().
We now add per node information which I think is equal or less effort
since there are fewer nodes than processors.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 2/mm/page-writeback.c 3/mm/page-writeback.c
--- 2/mm/page-writeback.c 2007-05-30 11:31:22.000000000 -0700
+++ 3/mm/page-writeback.c 2007-05-30 11:34:26.000000000 -0700
@@ -103,6 +103,14 @@ EXPORT_SYMBOL(laptop_mode);

static void background_writeout(unsigned long _min_pages, nodemask_t *nodes);

+struct dirty_limits {
+ long thresh_background;
+ long thresh_dirty;
+ unsigned long nr_dirty;
+ unsigned long nr_unstable;
+ unsigned long nr_writeback;
+};
+
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
@@ -121,13 +129,15 @@ static void background_writeout(unsigned
* clamping level.
*/

-static unsigned long highmem_dirtyable_memory(unsigned long total)
+static unsigned long highmem_dirtyable_memory(nodemask_t *nodes, unsigned long total)
{
#ifdef CONFIG_HIGHMEM
int node;
unsigned long x = 0;

- for_each_online_node(node) {
+ if (nodes == NULL)
+ nodes = &node_online_mask;
+ for_each_node_mask(node, *nodes) {
struct zone *z =
&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];

@@ -154,13 +164,13 @@ static unsigned long determine_dirtyable
x = global_page_state(NR_FREE_PAGES)
+ global_page_state(NR_INACTIVE)
+ global_page_state(NR_ACTIVE);
- x -= highmem_dirtyable_memory(x);
+ x -= highmem_dirtyable_memory(NULL, x);
return x + 1; /* Ensure that we never return 0 */
}

-static void
-get_dirty_limits(long *pbackground, long *pdirty,
- struct address_space *mapping)
+static int
+get_dirty_limits(struct dirty_limits *dl, struct address_space *mapping,
+ nodemask_t *nodes)
{
int background_ratio; /* Percentages */
int dirty_ratio;
@@ -168,12 +178,60 @@ get_dirty_limits(long *pbackground, long
long background;
long dirty;
unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long dirtyable_memory;
+ unsigned long nr_mapped;
struct task_struct *tsk;
+ int is_subset = 0;
+
+#ifdef CONFIG_CPUSETS
+ if (unlikely(nodes &&
+ !nodes_subset(node_online_map, *nodes))) {
+ int node;
+
+ /*
+ * Calculate the limits relative to the current cpuset.
+ *
+ * We do not disregard highmem because all nodes (except
+ * maybe node 0) have either all memory in HIGHMEM (32 bit) or
+ * all memory in non HIGHMEM (64 bit). If we would disregard
+ * highmem then cpuset throttling would not work on 32 bit.
+ */
+ is_subset = 1;
+ memset(dl, 0, sizeof(struct dirty_limits));
+ dirtyable_memory = 0;
+ nr_mapped = 0;
+ for_each_node_mask(node, *nodes) {
+ if (!node_online(node))
+ continue;
+ dl->nr_dirty += node_page_state(node, NR_FILE_DIRTY);
+ dl->nr_unstable +=
+ node_page_state(node, NR_UNSTABLE_NFS);
+ dl->nr_writeback +=
+ node_page_state(node, NR_WRITEBACK);
+ dirtyable_memory +=
+ node_page_state(node, NR_ACTIVE) +
+ node_page_state(node, NR_INACTIVE) +
+ node_page_state(node, NR_FREE_PAGES);
+ nr_mapped += node_page_state(node, NR_FILE_MAPPED) +
+ node_page_state(node, NR_ANON_PAGES);
+ }
+ dirtyable_memory -= highmem_dirtyable_memory(nodes,
+ dirtyable_memory);
+ } else
+#endif
+ {
+ /* Global limits */
+ dl->nr_dirty = global_page_state(NR_FILE_DIRTY);
+ dl->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ dl->nr_writeback = global_page_state(NR_WRITEBACK);
+ dirtyable_memory = determine_dirtyable_memory();
+ nr_mapped = global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES);
+ }

unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
- available_memory;
-
+ vm_total_pages;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
dirty_ratio = unmapped_ratio / 2;
@@ -185,15 +243,16 @@ get_dirty_limits(long *pbackground, long
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;

- background = (background_ratio * available_memory) / 100;
- dirty = (dirty_ratio * available_memory) / 100;
+ background = (background_ratio * dirtyable_memory) / 100;
+ dirty = (dirty_ratio * dirtyable_memory) / 100;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
dirty += dirty / 4;
}
- *pbackground = background;
- *pdirty = dirty;
+ dl->thresh_background = background;
+ dl->thresh_dirty = dirty;
+ return is_subset;
}

/*
@@ -206,8 +265,7 @@ get_dirty_limits(long *pbackground, long
static void balance_dirty_pages(struct address_space *mapping)
{
long nr_reclaimable;
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();

@@ -222,11 +280,12 @@ static void balance_dirty_pages(struct a
.range_cyclic = 1,
};

- get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
- dirty_thresh)
+ if (get_dirty_limits(&dl, mapping,
+ &cpuset_current_mems_allowed))
+ wbc.nodes = &cpuset_current_mems_allowed;
+ nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+ if (nr_reclaimable + dl.nr_writeback <=
+ dl.thresh_dirty)
break;

if (!dirty_exceeded)
@@ -240,13 +299,10 @@ static void balance_dirty_pages(struct a
*/
if (nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable +
- global_page_state(NR_WRITEBACK)
- <= dirty_thresh)
+ get_dirty_limits(&dl, mapping,
+ &cpuset_current_mems_allowed);
+ nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+ if (nr_reclaimable + dl.nr_writeback <= dl.thresh_dirty)
break;
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
@@ -255,8 +311,8 @@ static void balance_dirty_pages(struct a
congestion_wait(WRITE, HZ/10);
}

- if (nr_reclaimable + global_page_state(NR_WRITEBACK)
- <= dirty_thresh && dirty_exceeded)
+ if (nr_reclaimable + dl.nr_writeback
+ <= dl.thresh_dirty && dirty_exceeded)
dirty_exceeded = 0;

if (writeback_in_progress(bdi))
@@ -271,8 +327,9 @@ static void balance_dirty_pages(struct a
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
- pdflush_operation(background_writeout, 0, NULL);
+ (!laptop_mode && (nr_reclaimable > dl.thresh_background)))
+ pdflush_operation(background_writeout, 0,
+ &cpuset_current_mems_allowed);
}

void set_page_dirty_balance(struct page *page)
@@ -329,8 +386,7 @@ EXPORT_SYMBOL(balance_dirty_pages_rateli

void throttle_vm_writeout(gfp_t gfp_mask)
{
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;

if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
/*
@@ -342,27 +398,26 @@ void throttle_vm_writeout(gfp_t gfp_mask
return;
}

- for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ for ( ; ; ) {
+ get_dirty_limits(&dl, NULL, &node_online_map);

- /*
- * Boost the allowable dirty threshold a bit for page
- * allocators so they don't get DoS'ed by heavy writers
- */
- dirty_thresh += dirty_thresh / 10; /* wheeee... */
-
- if (global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK) <= dirty_thresh)
- break;
- congestion_wait(WRITE, HZ/10);
- }
+ /*
+ * Boost the allowable dirty threshold a bit for page
+ * allocators so they don't get DoS'ed by heavy writers
+ */
+ dl.thresh_dirty += dl.thresh_dirty / 10; /* wheeee... */
+
+ if (dl.nr_unstable + dl.nr_writeback <= dl.thresh_dirty)
+ break;
+ congestion_wait(WRITE, HZ/10);
+ }
}

/*
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
-static void background_writeout(unsigned long _min_pages, nodemask_t *unused)
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
@@ -375,12 +430,11 @@ static void background_writeout(unsigned
};

for ( ; ; ) {
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;

- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
+ if (get_dirty_limits(&dl, NULL, nodes))
+ wbc.nodes = nodes;
+ if (dl.nr_dirty + dl.nr_unstable < dl.thresh_background
&& min_pages <= 0)
break;
wbc.encountered_congestion = 0;

2007-06-01 06:13:29

by Ethan Solomita

[permalink] [raw]

Subject: [RFC 4/7] cpuset write vmscan

Direct reclaim: cpuset aware writeout

During direct reclaim we traverse down a zonelist and are carefully
checking each zone if it's a member of the active cpuset. But then we call
pdflush without enforcing the same restrictions. In a larger system this
may have the effect of a massive amount of pages being dirtied and then either

A. No writeout occurs because global dirty limits have not been reached

or

B. Writeout starts randomly for some dirty inode in the system. Pdflush
may just write out data for nodes in another cpuset and miss doing
proper dirty handling for the current cpuset.

In both cases dirty pages in the zones of interest may not be affected
and writeout may not occur as necessary.

Fix that by restricting pdflush to the active cpuset. Writeout will occur
from direct reclaim the same way as without a cpuset.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 3/mm/vmscan.c 4/mm/vmscan.c
--- 3/mm/vmscan.c 2007-05-30 11:34:21.000000000 -0700
+++ 4/mm/vmscan.c 2007-05-30 11:36:17.000000000 -0700
@@ -1198,7 +1198,8 @@ unsigned long try_to_free_pages(struct z
*/
if (total_scanned > sc.swap_cluster_max +
sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned, NULL);
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned,
+ &cpuset_current_mems_allowed);
sc.may_writepage = 1;
}

2007-06-01 06:15:46

by Ethan Solomita

[permalink] [raw]

Subject: [RFC 5/7] cpuset write vm writeout

Throttle VM writeout in a cpuset aware way

This bases the vm throttling from the reclaim path on the dirty ratio
of the cpuset. Note that a cpuset is only effective if shrink_zone is called
from direct reclaim.

kswapd has a cpuset context that includes the whole machine. VM throttling
will only work during synchronous reclaim and not from kswapd.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 4/include/linux/writeback.h
5/include/linux/writeback.h
--- 4/include/linux/writeback.h 2007-05-30 11:36:14.000000000 -0700
+++ 5/include/linux/writeback.h 2007-05-30 11:37:01.000000000 -0700
@@ -89,7 +89,7 @@ static inline void wait_on_inode(struct
int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
void laptop_io_completion(void);
void laptop_sync_completion(void);
-void throttle_vm_writeout(gfp_t gfp_mask);
+void throttle_vm_writeout(nodemask_t *nodes,gfp_t gfp_mask);

/* These are exported to sysctl. */
extern int dirty_background_ratio;
diff -uprN -X 0/Documentation/dontdiff 4/mm/page-writeback.c
5/mm/page-writeback.c
--- 4/mm/page-writeback.c 2007-05-30 11:36:15.000000000 -0700
+++ 5/mm/page-writeback.c 2007-05-30 11:37:01.000000000 -0700
@@ -384,7 +384,7 @@ void balance_dirty_pages_ratelimited_nr(
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);

-void throttle_vm_writeout(gfp_t gfp_mask)
+void throttle_vm_writeout(nodemask_t *nodes, gfp_t gfp_mask)
{
struct dirty_limits dl;

@@ -399,7 +399,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
}

for ( ; ; ) {
- get_dirty_limits(&dl, NULL, &node_online_map);
+ get_dirty_limits(&dl, NULL, nodes);

/*
* Boost the allowable dirty threshold a bit for page
diff -uprN -X 0/Documentation/dontdiff 4/mm/vmscan.c 5/mm/vmscan.c
--- 4/mm/vmscan.c 2007-05-30 11:36:17.000000000 -0700
+++ 5/mm/vmscan.c 2007-05-30 11:37:01.000000000 -0700
@@ -1079,7 +1079,7 @@ static unsigned long shrink_zone(int pri
}
}

- throttle_vm_writeout(sc->gfp_mask);
+ throttle_vm_writeout(&cpuset_current_mems_allowed, sc->gfp_mask);

atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;

2007-06-01 06:16:33

by Ethan Solomita

[permalink] [raw]

Subject: [corrected][RFC 5/7] cpuset write vm writeout

Throttle VM writeout in a cpuset aware way

This bases the vm throttling from the reclaim path on the dirty ratio
of the cpuset. Note that a cpuset is only effective if shrink_zone is called
from direct reclaim.

kswapd has a cpuset context that includes the whole machine. VM throttling
will only work during synchronous reclaim and not from kswapd.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 4/include/linux/writeback.h 5/include/linux/writeback.h
--- 4/include/linux/writeback.h 2007-05-30 11:36:14.000000000 -0700
+++ 5/include/linux/writeback.h 2007-05-30 11:37:01.000000000 -0700
@@ -89,7 +89,7 @@ static inline void wait_on_inode(struct
int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
void laptop_io_completion(void);
void laptop_sync_completion(void);
-void throttle_vm_writeout(gfp_t gfp_mask);
+void throttle_vm_writeout(nodemask_t *nodes,gfp_t gfp_mask);

/* These are exported to sysctl. */
extern int dirty_background_ratio;
diff -uprN -X 0/Documentation/dontdiff 4/mm/page-writeback.c 5/mm/page-writeback.c
--- 4/mm/page-writeback.c 2007-05-30 11:36:15.000000000 -0700
+++ 5/mm/page-writeback.c 2007-05-30 11:37:01.000000000 -0700
@@ -384,7 +384,7 @@ void balance_dirty_pages_ratelimited_nr(
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);

-void throttle_vm_writeout(gfp_t gfp_mask)
+void throttle_vm_writeout(nodemask_t *nodes, gfp_t gfp_mask)
{
struct dirty_limits dl;

@@ -399,7 +399,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
}

for ( ; ; ) {
- get_dirty_limits(&dl, NULL, &node_online_map);
+ get_dirty_limits(&dl, NULL, nodes);

/*
* Boost the allowable dirty threshold a bit for page
diff -uprN -X 0/Documentation/dontdiff 4/mm/vmscan.c 5/mm/vmscan.c
--- 4/mm/vmscan.c 2007-05-30 11:36:17.000000000 -0700
+++ 5/mm/vmscan.c 2007-05-30 11:37:01.000000000 -0700
@@ -1079,7 +1079,7 @@ static unsigned long shrink_zone(int pri
}
}

- throttle_vm_writeout(sc->gfp_mask);
+ throttle_vm_writeout(&cpuset_current_mems_allowed, sc->gfp_mask);

atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;

2007-06-01 06:17:19

by Ethan Solomita

[permalink] [raw]

Subject: [RFC 6/7] cpuset write fixes

Remove unneeded local variable.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 5/mm/page-writeback.c 6/mm/page-writeback.c
--- 5/mm/page-writeback.c 2007-05-30 11:37:01.000000000 -0700
+++ 6/mm/page-writeback.c 2007-05-30 11:39:25.000000000 -0700
@@ -177,7 +177,6 @@ get_dirty_limits(struct dirty_limits *dl
int unmapped_ratio;
long background;
long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
unsigned long dirtyable_memory;
unsigned long nr_mapped;
struct task_struct *tsk;

2007-06-01 06:18:00

by Ethan Solomita

[permalink] [raw]

Subject: [RFC 7/7] cpuset dirty limits

Per cpuset dirty ratios

This implements dirty ratios per cpuset. Two new files are added
to the cpuset directories:

background_dirty_ratio Percentage at which background writeback starts

throttle_dirty_ratio Percentage at which the application is throttled
and we start synchronous writeout.

Both variables are set to -1 by default which means that the global
limits (/proc/sys/vm/dirty_ratio and /proc/sys/vm/dirty_background_ratio)
are used for a cpuset.

Originally by Christoph Lameter <[email protected]>

Signed-off-by: Ethan Solomita <[email protected]>

---

diff -uprN -X 0/Documentation/dontdiff 6/include/linux/cpuset.h 7/include/linux/cpuset.h
--- 6/include/linux/cpuset.h 2007-05-30 11:39:17.000000000 -0700
+++ 7/include/linux/cpuset.h 2007-05-30 11:39:48.000000000 -0700
@@ -75,6 +75,7 @@ static inline int cpuset_do_slab_mem_spr

extern void cpuset_track_online_nodes(void);

+extern void cpuset_get_current_ratios(int *background, int *ratio);
/*
* We need macros since struct address_space is not defined yet
*/
diff -uprN -X 0/Documentation/dontdiff 6/kernel/cpuset.c 7/kernel/cpuset.c
--- 6/kernel/cpuset.c 2007-05-30 11:39:17.000000000 -0700
+++ 7/kernel/cpuset.c 2007-05-30 11:39:48.000000000 -0700
@@ -49,6 +49,7 @@
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/writeback.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
@@ -99,6 +100,9 @@ struct cpuset {
int mems_generation;

struct fmeter fmeter; /* memory_pressure filter */
+
+ int background_dirty_ratio;
+ int throttle_dirty_ratio;
};

/* bits in struct cpuset flags field */
@@ -176,6 +180,8 @@ static struct cpuset top_cpuset = {
.count = ATOMIC_INIT(0),
.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
.children = LIST_HEAD_INIT(top_cpuset.children),
+ .background_dirty_ratio = -1,
+ .throttle_dirty_ratio = -1,
};

static struct vfsmount *cpuset_mount;
@@ -1030,6 +1036,21 @@ static int update_flag(cpuset_flagbits_t
return 0;
}

+static int update_int(int *cs_int, char *buf, int min, int max)
+{
+ char *endp;
+ int val;
+
+ val = simple_strtol(buf, &endp, 10);
+ if (val < min || val > max)
+ return -EINVAL;
+
+ mutex_lock(&callback_mutex);
+ *cs_int = val;
+ mutex_unlock(&callback_mutex);
+ return 0;
+}
+
/*
* Frequency meter - How fast is some event occurring?
*
@@ -1238,6 +1259,8 @@ typedef enum {
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
FILE_TASKLIST,
+ FILE_THROTTLE_DIRTY_RATIO,
+ FILE_BACKGROUND_DIRTY_RATIO,
} cpuset_filetype_t;

static ssize_t cpuset_common_file_write(struct file *file,
@@ -1308,6 +1331,12 @@ static ssize_t cpuset_common_file_write(
case FILE_TASKLIST:
retval = attach_task(cs, buffer, &pathbuf);
break;
+ case FILE_BACKGROUND_DIRTY_RATIO:
+ retval = update_int(&cs->background_dirty_ratio, buffer, -1, 100);
+ break;
+ case FILE_THROTTLE_DIRTY_RATIO:
+ retval = update_int(&cs->throttle_dirty_ratio, buffer, -1, 100);
+ break;
default:
retval = -EINVAL;
goto out2;
@@ -1420,6 +1449,12 @@ static ssize_t cpuset_common_file_read(s
case FILE_SPREAD_SLAB:
*s++ = is_spread_slab(cs) ? '1' : '0';
break;
+ case FILE_BACKGROUND_DIRTY_RATIO:
+ s += sprintf(s, "%d", cs->background_dirty_ratio);
+ break;
+ case FILE_THROTTLE_DIRTY_RATIO:
+ s += sprintf(s, "%d", cs->throttle_dirty_ratio);
+ break;
default:
retval = -EINVAL;
goto out;
@@ -1788,6 +1823,16 @@ static struct cftype cft_spread_slab = {
.private = FILE_SPREAD_SLAB,
};

+static struct cftype cft_background_dirty_ratio = {
+ .name = "background_dirty_ratio",
+ .private = FILE_BACKGROUND_DIRTY_RATIO,
+};
+
+static struct cftype cft_throttle_dirty_ratio = {
+ .name = "throttle_dirty_ratio",
+ .private = FILE_THROTTLE_DIRTY_RATIO,
+};
+
static int cpuset_populate_dir(struct dentry *cs_dentry)
{
int err;
@@ -1810,6 +1855,10 @@ static int cpuset_populate_dir(struct de
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_background_dirty_ratio)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_throttle_dirty_ratio)) < 0)
+ return err;
if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
return err;
return 0;
@@ -1849,6 +1898,8 @@ static long cpuset_create(struct cpuset
INIT_LIST_HEAD(&cs->children);
cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
+ cs->background_dirty_ratio = parent->background_dirty_ratio;
+ cs->throttle_dirty_ratio = parent->throttle_dirty_ratio;

cs->parent = parent;

@@ -2483,8 +2534,30 @@ int cpuset_mem_spread_node(void)
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

-#if MAX_NUMNODES > BITS_PER_LONG
+/*
+ * Determine the dirty ratios for the currently active cpuset
+ */
+void cpuset_get_current_ratios(int *background_ratio, int *throttle_ratio)
+{
+ int background = -1;
+ int throttle = -1;
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+ background = tsk->cpuset->background_dirty_ratio;
+ throttle = tsk->cpuset->throttle_dirty_ratio;
+ task_unlock(tsk);

+ if (background == -1)
+ background = dirty_background_ratio;
+ if (throttle == -1)
+ throttle = vm_dirty_ratio;
+
+ *background_ratio = background;
+ *throttle_ratio = throttle;
+}
+
+#if MAX_NUMNODES > BITS_PER_LONG
/*
* Special functions for NUMA systems with a large number of nodes.
* The nodemask is pointed to from the address space structures.
diff -uprN -X 0/Documentation/dontdiff 6/mm/page-writeback.c 7/mm/page-writeback.c
--- 6/mm/page-writeback.c 2007-05-30 11:39:25.000000000 -0700
+++ 7/mm/page-writeback.c 2007-05-30 11:39:48.000000000 -0700
@@ -216,6 +216,7 @@ get_dirty_limits(struct dirty_limits *dl
}
dirtyable_memory -= highmem_dirtyable_memory(nodes,
dirtyable_memory);
+ cpuset_get_current_ratios(&background_ratio, &dirty_ratio);
} else
#endif
{
@@ -226,19 +227,19 @@ get_dirty_limits(struct dirty_limits *dl
dirtyable_memory = determine_dirtyable_memory();
nr_mapped = global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES);
+ dirty_ratio = vm_dirty_ratio;
+ background_ratio = dirty_background_ratio;
}

unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
vm_total_pages;
- dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
dirty_ratio = unmapped_ratio / 2;

if (dirty_ratio < 5)
dirty_ratio = 5;

- background_ratio = dirty_background_ratio;
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;

2007-06-04 18:39:50

by Christoph Lameter

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Thu, 31 May 2007, Ethan Solomita wrote:

> The dirty map is only cleared (or freed) when the inode is cleared.
> At that point no pages are attached to the inode anymore and therefore it can
> be done without any locking. The dirty map therefore records all nodes that
> have been used for dirty pages by that inode until the inode is no longer
> used.
>
> Originally by Christoph Lameter <[email protected]>

You should preserve my Signed-off-by: since I wrote most of this. Is there
a changelog?

2007-06-04 19:39:15

by Ethan Solomita

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

Christoph Lameter wrote:
> On Thu, 31 May 2007, Ethan Solomita wrote:
>
>> The dirty map is only cleared (or freed) when the inode is cleared.
>> At that point no pages are attached to the inode anymore and therefore it can
>> be done without any locking. The dirty map therefore records all nodes that
>> have been used for dirty pages by that inode until the inode is no longer
>> used.
>>
>> Originally by Christoph Lameter <[email protected]>
>
> You should preserve my Signed-off-by: since I wrote most of this. Is there
> a changelog?
>

I wasn't sure of the etiquette -- I'd thought that by saying you had
signed it off that meant you were accepting my modifications, and didn't
want to presume. But I will change it if you like. No slight intended.

Unfortunately I don't have a changelog, and since I've since forward
ported the changes it would be hard to produce. If you want to review it
you should probably review it all, because the forward porting may have
introduced issues.
-- Ethan

2007-06-04 19:52:20

by Christoph Lameter

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Mon, 4 Jun 2007, Ethan Solomita wrote:

> > You should preserve my Signed-off-by: since I wrote most of this. Is there
> > a changelog?
> >
>
> I wasn't sure of the etiquette -- I'd thought that by saying you had
> signed it off that meant you were accepting my modifications, and didn't
> want to presume. But I will change it if you like. No slight intended.
>
> Unfortunately I don't have a changelog, and since I've since forward
> ported the changes it would be hard to produce. If you want to review it
> you should probably review it all, because the forward porting may have
> introduced issues.

I glanced over it and it looks okay. Please cc me on future submissions.

What testing was done? Would you include the results of tests in your next
post?

2007-06-25 20:22:23

by Ethan Solomita

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

Christoph Lameter wrote:
>
> What testing was done? Would you include the results of tests in your next
> post?

Sorry for the delay in responding -- I was chasing phantom failures.

I created a stress test which involved using cpusets and mems_allowed
to split memory so that all daemons had memory set aside for them, and
my memory stress test had a separate set of memory. The stress test was
mmaping 7GB of a very large file on disk. It then scans the entire 7GB
of memory reading and modifying each byte. 7GB is more than the amount
of physical memory made available to the stress test.

Using iostat I can see the initial period of reading from disk,
followed by a period of simultaneous reads and writes as dirty bytes are
pushed to make room for new reads.

In a separate log-in, in the other cpuset, I am running:

while `true`; do date | tee -a date.txt; sleep 5; done

date.txt resides on the same disk as the large file mentioned above.
The above while-loop serves the dual purpose of providing me visual
clues of progress along with the opportunity for the "tee" command to
become throttled writing to the disk.

The effect of this patchset is straightforward. Without it there are
long hangs between appearances of the date. With it the dates are all 5
(or sometimes 6) seconds apart.

I also added printks to the kernel to verify that, without these
patches, the tee was being throttled (along with lots of other things),
and with the patch only pdflush is being throttled.
-- Ethan

2007-06-26 19:17:05

by Christoph Lameter

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Mon, 25 Jun 2007, Ethan Solomita wrote:

> The effect of this patchset is straightforward. Without it there are
> long hangs between appearances of the date. With it the dates are all 5
> (or sometimes 6) seconds apart.
>
> I also added printks to the kernel to verify that, without these
> patches, the tee was being throttled (along with lots of other things),
> and with the patch only pdflush is being throttled.

That sounds good. Andrew: Any chance that we can get this patchset merged?

2007-06-26 22:22:33

by Andrew Morton

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Tue, 26 Jun 2007 12:16:50 -0700 (PDT)
Christoph Lameter <[email protected]> wrote:

> On Mon, 25 Jun 2007, Ethan Solomita wrote:
>
> > The effect of this patchset is straightforward. Without it there are
> > long hangs between appearances of the date. With it the dates are all 5
> > (or sometimes 6) seconds apart.
> >
> > I also added printks to the kernel to verify that, without these
> > patches, the tee was being throttled (along with lots of other things),
> > and with the patch only pdflush is being throttled.
>
> That sounds good. Andrew: Any chance that we can get this patchset merged?
>

Is in my queue somewhere. Could be that by the time I get to it it will
need refreshing (again), we'll see.

One open question is the interaction between these changes and with Peter's
per-device-dirty-throttling changes. They also are in my queue somewhere.
Having a 100:1 coder:reviewer ratio doesn't exactly make for swift
progress.

2007-06-27 03:18:55

by Christoph Lameter

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Tue, 26 Jun 2007, Andrew Morton wrote:

> Is in my queue somewhere. Could be that by the time I get to it it will
> need refreshing (again), we'll see.
>
> One open question is the interaction between these changes and with Peter's
> per-device-dirty-throttling changes. They also are in my queue somewhere.
> Having a 100:1 coder:reviewer ratio doesn't exactly make for swift
> progress.

Hmmmm.. How can we help? I can look at some aspects of Peter's per device
throttling.

2007-06-27 09:14:36

by Andrew Morton

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Tue, 26 Jun 2007 20:18:36 -0700 (PDT) Christoph Lameter <[email protected]> wrote:

> On Tue, 26 Jun 2007, Andrew Morton wrote:
>
> > Is in my queue somewhere. Could be that by the time I get to it it will
> > need refreshing (again), we'll see.
> >
> > One open question is the interaction between these changes and with Peter's
> > per-device-dirty-throttling changes. They also are in my queue somewhere.
> > Having a 100:1 coder:reviewer ratio doesn't exactly make for swift
> > progress.
>
> Hmmmm.. How can we help? I can look at some aspects of Peter's per device
> throttling.

That can't hurt.

I'm more concerned about all of Mel's code in -mm actually. I don't recall
anyone doing a full review recently and I'm still not sure that this is the
overall direction in which we wish to go. Last time I asked this everyone
seemed a bit waffly and non-committal.

2007-06-27 12:45:10

by Christoph Lameter

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Wed, 27 Jun 2007, Andrew Morton wrote:

> I'm more concerned about all of Mel's code in -mm actually. I don't recall
> anyone doing a full review recently and I'm still not sure that this is the
> overall direction in which we wish to go. Last time I asked this everyone
> seemed a bit waffly and non-committal.

I have looked over this several times lately and it all seems quite okay.
Still cannot find a justification for the movable zone (never had to
use it in testing) but it seems that it makes memory unplug easier. The
antifrag patchset together with a page migration patch simplifies the
unplug patchset significantly.

I think the antifrag code is a significant step forward and will enable
lots of other features (memory unplug, larger page use in SLUB, huge page
allocs after boot). It may be useful to put memory compaction and memory
unplug in at the same time (I think we can get there even for .23) so that
we have a full package. With compaction we can finally recover from loads
that typically cause memory to be split in a lot of disjoint pieces and
get to a situation where we can dynamically reconfigure the number of huge
pages at run time (Our customers currently reboot to do this which is a
pain). Compaction increases the chance of I/O controllers being able to
merge I/O requests since contiguous pages can be served by the page
allocator again. Antifrag almost gets there but I can still construct
allocation scenarios that fragment memory significantly.

Also compaction is a requirement if we ever want to support larger
blocksizes. That would allow the removal of various layers that are now
needed to compensate for not supporting larger pages.

The whole approach is useful to increase performance. We have seen
several percentage points of performance wins with SLUB when allowing
larger page sizes. The use of huge pages is also mainly done for
performance reasons. The large blocksize patch has shown a 50% performance
increase even in its prototype form where we certainly have not solved
server performance issues.

Even without large blocksize: The ability to restore the capability of the
page allocator to serve pages that are in sequence can be used to shorten
the scatter gather lists in the I/O layer speeding up I/O.

I think this is an important contribution that will move a lot of other
issues forward.

2007-06-27 18:18:06

by Ethan Solomita

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

Andrew Morton wrote:
>
> One open question is the interaction between these changes and with Peter's
> per-device-dirty-throttling changes. They also are in my queue somewhere.

I looked over it at one point. Most of the code doesn't conflict, but I
believe that the code path which calculates the dirty limits will need
some merging. Doable but non-trivial.
-- Ethan

2007-06-27 21:11:09

by mel

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On (27/06/07 05:44), Christoph Lameter didst pronounce:
> On Wed, 27 Jun 2007, Andrew Morton wrote:
>
> > I'm more concerned about all of Mel's code in -mm actually. I don't recall
> > anyone doing a full review recently and I'm still not sure that this is the
> > overall direction in which we wish to go. Last time I asked this everyone
> > seemed a bit waffly and non-committal.
>
> I have looked over this several times lately and it all seems quite okay.
> Still cannot find a justification for the movable zone (never had to
> use it in testing) but it seems that it makes memory unplug easier.

As well as helping memory hot-remove, it provides a known lower-limit that
the hugepage pool can be resized to at runtime. Grouping pages by mobility
on its own does not give a known number of pageblocks that could be used
for hugepage allocation. On its own, it provides a high probability that
the hugepage pool can be grown but how large depends on the workload. With
the zone, an administrator can set aside X amount of memory that can be used
for hugepages if necessary and base pages otherwise.

> The
> antifrag patchset together with a page migration patch simplifies the
> unplug patchset significantly.
>
> I think the antifrag code is a significant step forward and will enable
> lots of other features (memory unplug, larger page use in SLUB, huge page
> allocs after boot). It may be useful to put memory compaction and memory
> unplug in at the same time (I think we can get there even for .23) so that
> we have a full package.

The memory unplug patches already migrate pages without the help of memory
compaction. Memory compaction needs a bit more work before I try sending
it towards -mm for wider testing. It's important to get right because
with compaction, the migration code is getting a lot more exercise than it
does today. However, memory compaction needing more work should not hold
back memory unplug nor does it impact on the utility of grouping pages
by mobility or the movable zone.

> With compaction we can finally recover from loads
> that typically cause memory to be split in a lot of disjoint pieces and
> get to a situation where we can dynamically reconfigure the number of huge
> pages at run time (Our customers currently reboot to do this which is a
> pain). Compaction increases the chance of I/O controllers being able to
> merge I/O requests since contiguous pages can be served by the page
> allocator again. Antifrag almost gets there but I can still construct
> allocation scenarios that fragment memory significantly.
>

The number of scenarios that cause fragmentation has dropped over time.
It's something I see as constantly improving over time.

> Also compaction is a requirement if we ever want to support larger
> blocksizes. That would allow the removal of various layers that are now
> needed to compensate for not supporting larger pages.
>
> The whole approach is useful to increase performance. We have seen
> several percentage points of performance wins with SLUB when allowing
> larger page sizes. The use of huge pages is also mainly done for
> performance reasons. The large blocksize patch has shown a 50% performance
> increase even in its prototype form where we certainly have not solved
> server performance issues.
>
> Even without large blocksize: The ability to restore the capability of the
> page allocator to serve pages that are in sequence can be used to shorten
> the scatter gather lists in the I/O layer speeding up I/O.
>
> I think this is an important contribution that will move a lot of other
> issues forward.
>

Thanks Christoph for your reviewing of the code, your comments here and the
quite rigorous exercise of the anti-fragmentation code. I too believe
there are performance gains to be had through the use of larger pages
and working on this does not preclude work on improving the use of base
pages.

For larger pages to be used though, external fragmentation has to be
addressed in some fashion and no other viable implementation exists
besides anti-fragmentation right now.

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2007-06-27 21:38:26

by Christoph Lameter

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

On Wed, 27 Jun 2007, Ethan Solomita wrote:

> I looked over it at one point. Most of the code doesn't conflict, but I
> believe that the code path which calculates the dirty limits will need
> some merging. Doable but non-trivial.
> -- Ethan

I hope you will keep on updating the patchset and posting it against
current mm?

2007-07-01 02:57:28

by Ethan Solomita

[permalink] [raw]

Subject: Re: [RFC 1/7] cpuset write dirty map

Christoph Lameter wrote:
> On Wed, 27 Jun 2007, Ethan Solomita wrote:
>
>> I looked over it at one point. Most of the code doesn't conflict, but I
>> believe that the code path which calculates the dirty limits will need
>> some merging. Doable but non-trivial.
>> -- Ethan
>
> I hope you will keep on updating the patchset and posting it against
> current mm?
>

I have no new changes, but I can update it against the current mm. Or
did the per-bdi throttling change get taken by Andrew?
-- Ethan