Currently cpusets are not able to do proper writeback since
dirty ratio calculations and writeback are all done for the system
as a whole. This may result in a large percentage of a cpuset
to become dirty without writeout being triggered. Under NFS
this can lead to OOM conditions.
Writeback will occur during the LRU scans. But such writeout
is not effective since we write page by page and not in inode page
order (regular writeback).
In order to fix the problem we first of all introduce a method to
establish a map of nodes that contain dirty pages for each
inode mapping.
Secondly we modify the dirty limit calculation to be based
on the acctive cpuset.
If we are in a cpuset then we select only inodes for writeback
that have pages on the nodes of the cpuset.
After we have the cpuset throttling in place we can then make
further fixups:
A. We can do inode based writeout from direct reclaim
avoiding single page writes to the filesystem.
B. We add a new counter NR_UNRECLAIMABLE that is subtracted
from the available pages in a node. This allows us to
accurately calculate the dirty ratio even if large portions
of the node have been allocated for huge pages or for
slab pages.
There are a couple of points where some better ideas could be used:
1. The nodemask expands the inode structure significantly if the
architecture allows a high number of nodes. This is only an issue
for IA64. For that platform we expand the inode structure by 128 byte
(to support 1024 nodes). The last patch attempts to address the issue
by using the knowledge about the maximum possible number of nodes
determined on bootup to shrink the nodemask.
2. The calculation of the per cpuset limits can require looping
over a number of nodes which may bring the performance of get_dirty_limits
near pre 2.6.18 performance (before the introduction of the ZVC counters)
(only for cpuset based limit calculation). There is no way of keeping these
counters per cpuset since cpusets may overlap.
Paul probably needs to go through this and may want additional fixes to
keep things in harmony with cpusets.
Tested on:
IA64 NUMA 128p, 12p
Compiles on:
i386 SMP
x86_64 UP
Replace highest_possible_node_id() with nr_node_ids
highest_possible_node_id() is used to calculate the last possible node id
so that the network subsystem can figure out how to size per node arrays.
I think having the ability to determine the maximum amount of nodes in
a system at runtime is useful but then we should name this entry
correspondingly and also only calculate the value once on bootup.
This patch introduces nr_node_ids and replaces the use of
highest_possible_node_id(). nr_node_ids is calculated on bootup when
the page allocators pagesets are initialized.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc4-mm1/include/linux/nodemask.h
===================================================================
--- linux-2.6.20-rc4-mm1.orig/include/linux/nodemask.h 2007-01-06 21:45:51.000000000 -0800
+++ linux-2.6.20-rc4-mm1/include/linux/nodemask.h 2007-01-12 12:59:50.000000000 -0800
@@ -352,7 +352,7 @@
#define node_possible(node) node_isset((node), node_possible_map)
#define first_online_node first_node(node_online_map)
#define next_online_node(nid) next_node((nid), node_online_map)
-int highest_possible_node_id(void);
+extern int nr_node_ids;
#else
#define num_online_nodes() 1
#define num_possible_nodes() 1
@@ -360,7 +360,7 @@
#define node_possible(node) ((node) == 0)
#define first_online_node 0
#define next_online_node(nid) (MAX_NUMNODES)
-#define highest_possible_node_id() 0
+#define nr_node_ids 1
#endif
#define any_online_node(mask) \
Index: linux-2.6.20-rc4-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.20-rc4-mm1.orig/mm/page_alloc.c 2007-01-12 12:58:26.000000000 -0800
+++ linux-2.6.20-rc4-mm1/mm/page_alloc.c 2007-01-12 12:59:50.000000000 -0800
@@ -679,6 +679,26 @@
return i;
}
+#if MAX_NUMNODES > 1
+int nr_node_ids __read_mostly;
+EXPORT_SYMBOL(nr_node_ids);
+
+/*
+ * Figure out the number of possible node ids.
+ */
+static void __init setup_nr_node_ids(void)
+{
+ unsigned int node;
+ unsigned int highest = 0;
+
+ for_each_node_mask(node, node_possible_map)
+ highest = node;
+ nr_node_ids = highest + 1;
+}
+#else
+static void __init setup_nr_node_ids(void) {}
+#endif
+
#ifdef CONFIG_NUMA
/*
* Called from the slab reaper to drain pagesets on a particular node that
@@ -3318,6 +3338,7 @@
min_free_kbytes = 65536;
setup_per_zone_pages_min();
setup_per_zone_lowmem_reserve();
+ setup_nr_node_ids();
return 0;
}
module_init(init_per_zone_pages_min)
@@ -3519,18 +3540,4 @@
EXPORT_SYMBOL(page_to_pfn);
#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
-#if MAX_NUMNODES > 1
-/*
- * Find the highest possible node id.
- */
-int highest_possible_node_id(void)
-{
- unsigned int node;
- unsigned int highest = 0;
- for_each_node_mask(node, node_possible_map)
- highest = node;
- return highest;
-}
-EXPORT_SYMBOL(highest_possible_node_id);
-#endif
Index: linux-2.6.20-rc4-mm1/net/sunrpc/svc.c
===================================================================
--- linux-2.6.20-rc4-mm1.orig/net/sunrpc/svc.c 2007-01-06 21:45:51.000000000 -0800
+++ linux-2.6.20-rc4-mm1/net/sunrpc/svc.c 2007-01-12 12:59:50.000000000 -0800
@@ -116,7 +116,7 @@
static int
svc_pool_map_init_percpu(struct svc_pool_map *m)
{
- unsigned int maxpools = highest_possible_processor_id()+1;
+ unsigned int maxpools = nr_node_ids;
unsigned int pidx = 0;
unsigned int cpu;
int err;
@@ -144,7 +144,7 @@
static int
svc_pool_map_init_pernode(struct svc_pool_map *m)
{
- unsigned int maxpools = highest_possible_node_id()+1;
+ unsigned int maxpools = nr_node_ids;
unsigned int pidx = 0;
unsigned int node;
int err;
Direct reclaim: cpuset aware writeout
During direct reclaim we traverse down a zonelist and are carefully
checking each zone if its a member of the active cpuset. But then we call
pdflush without enforcing the same restrictions. In a larger system this
may have the effect of a massive amount of pages being dirtied and then either
A. No writeout occurs because global dirty limits have not been reached
or
B. Writeout starts randomly for some dirty inode in the system. Pdflush
may just write out data for nodes in another cpuset and miss doing
proper dirty handling for the current cpuset.
In both cases dirty pages in the zones of interest may not be affected
and writeout may not occur as necessary.
Fix that by restricting pdflush to the active cpuset. Writeout will occur
from direct reclaim as in an SMP system.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc5/mm/vmscan.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/vmscan.c 2007-01-15 21:34:43.173887398 -0600
+++ linux-2.6.20-rc5/mm/vmscan.c 2007-01-15 21:37:26.605346439 -0600
@@ -1065,7 +1065,8 @@ unsigned long try_to_free_pages(struct z
*/
if (total_scanned > sc.swap_cluster_max +
sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned, NULL);
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned,
+ &cpuset_current_mems_allowed);
sc.may_writepage = 1;
}
Add a dirty map to the inode
In a NUMA system it is helpful to know where the dirty pages of a mapping
are located. That way we will be able to implement writeout for applications
that are constrained to a portion of the memory of the system as required by
cpusets.
Two functions are introduced to manage the dirty node map:
cpuset_clear_dirty_nodes() and cpuset_update_nodes(). Both are defined using
macros since the definition of struct inode may not be available in cpuset.h.
The dirty map is cleared when the inode is cleared. There is no
synchronization (except for atomic nature of node_set) for the dirty_map. The
only problem that could be done is that we do not write out an inode if a
node bit is not set. That is rare and will be impossibly rare if multiple pages
are involved. There is therefore a slight chance that we have missed a dirty
node if it just contains a single page. Which is likely tolerable.
This patch increases the size of struct inode for the NUMA case. For
most arches that only support up to 64 nodes this is simply adding one
unsigned long.
However, the default Itanium configuration allows for up to 1024 nodes.
On Itanium we add 128 byte per inode. A later patch will make the size of
the per node bit array dynamic so that the size of the inode slab caches
is properly sized.
Signed-off-by; Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc5/fs/fs-writeback.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/fs-writeback.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/fs/fs-writeback.c 2007-01-15 22:34:12.065241639 -0600
@@ -22,6 +22,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
+#include <linux/cpuset.h>
#include "internal.h"
/**
@@ -223,11 +224,13 @@ __sync_single_inode(struct inode *inode,
/*
* The inode is clean, inuse
*/
+ cpuset_clear_dirty_nodes(inode);
list_move(&inode->i_list, &inode_in_use);
} else {
/*
* The inode is clean, unused
*/
+ cpuset_clear_dirty_nodes(inode);
list_move(&inode->i_list, &inode_unused);
}
}
Index: linux-2.6.20-rc5/fs/inode.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/inode.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/fs/inode.c 2007-01-15 22:33:55.802081773 -0600
@@ -22,6 +22,7 @@
#include <linux/bootmem.h>
#include <linux/inotify.h>
#include <linux/mount.h>
+#include <linux/cpuset.h>
/*
* This is needed for the following functions:
@@ -134,6 +135,7 @@ static struct inode *alloc_inode(struct
inode->i_cdev = NULL;
inode->i_rdev = 0;
inode->dirtied_when = 0;
+ cpuset_clear_dirty_nodes(inode);
if (security_inode_alloc(inode)) {
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
Index: linux-2.6.20-rc5/include/linux/fs.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/fs.h 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/include/linux/fs.h 2007-01-15 22:33:55.876307100 -0600
@@ -589,6 +589,9 @@ struct inode {
void *i_security;
#endif
void *i_private; /* fs or device private pointer */
+#ifdef CONFIG_CPUSETS
+ nodemask_t dirty_nodes; /* Map of nodes with dirty pages */
+#endif
};
/*
Index: linux-2.6.20-rc5/mm/page-writeback.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/page-writeback.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/mm/page-writeback.c 2007-01-15 22:34:14.425802376 -0600
@@ -33,6 +33,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <linux/cpuset.h>
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
@@ -780,6 +781,7 @@ int __set_page_dirty_nobuffers(struct pa
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ cpuset_update_dirty_nodes(mapping->host, page);
}
return 1;
}
Index: linux-2.6.20-rc5/fs/buffer.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/buffer.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/fs/buffer.c 2007-01-15 22:34:14.459008443 -0600
@@ -42,6 +42,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/cpuset.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void invalidate_bh_lrus(void);
@@ -739,6 +740,7 @@ int __set_page_dirty_buffers(struct page
}
write_unlock_irq(&mapping->tree_lock);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ cpuset_update_dirty_nodes(mapping->host, page);
return 1;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
Index: linux-2.6.20-rc5/include/linux/cpuset.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/cpuset.h 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/include/linux/cpuset.h 2007-01-15 22:34:37.757947994 -0600
@@ -75,6 +75,19 @@ static inline int cpuset_do_slab_mem_spr
extern void cpuset_track_online_nodes(void);
+/*
+ * We need macros since struct inode is not defined yet
+ */
+#define cpuset_update_dirty_nodes(__inode, __page) \
+ node_set(page_to_nid(__page), (__inode)->dirty_nodes)
+
+#define cpuset_clear_dirty_nodes(__inode) \
+ (__inode)->dirty_nodes = NODE_MASK_NONE
+
+#define cpuset_intersects_dirty_nodes(__inode, __nodemask_ptr) \
+ (!(__nodemask_ptr) || nodes_intersects((__inode)->dirty_nodes, \
+ *(__nodemask_ptr)))
+
#else /* !CONFIG_CPUSETS */
static inline int cpuset_init_early(void) { return 0; }
@@ -146,6 +159,17 @@ static inline int cpuset_do_slab_mem_spr
static inline void cpuset_track_online_nodes(void) {}
+static inline void cpuset_update_dirty_nodes(struct inode *i,
+ struct page *page) {}
+
+static inline void cpuset_clear_dirty_nodes(struct inode *i) {}
+
+static inline int cpuset_intersects_dirty_nodes(struct inode *i,
+ nodemask_t *n)
+{
+ return 1;
+}
+
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
pdflush: Allow the passing of a nodemask parameter
If we want to support nodeset specific writeout then we need a way
to communicate the set of nodes that an operation should affect.
So add a nodemask_t parameter to the pdflush functions and also
store the nodemask in the pdflush control structure.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc5/include/linux/writeback.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/writeback.h 2007-01-15 21:34:38.564104333 -0600
+++ linux-2.6.20-rc5/include/linux/writeback.h 2007-01-15 21:34:43.135798088 -0600
@@ -81,7 +81,7 @@ static inline void wait_on_inode(struct
/*
* mm/page-writeback.c
*/
-int wakeup_pdflush(long nr_pages);
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
void laptop_io_completion(void);
void laptop_sync_completion(void);
void throttle_vm_writeout(void);
@@ -109,7 +109,8 @@ balance_dirty_pages_ratelimited(struct a
balance_dirty_pages_ratelimited_nr(mapping, 1);
}
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *nodes),
+ unsigned long arg0, nodemask_t *nodes);
extern int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
Index: linux-2.6.20-rc5/mm/page-writeback.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/page-writeback.c 2007-01-15 21:34:38.573870823 -0600
+++ linux-2.6.20-rc5/mm/page-writeback.c 2007-01-15 21:34:43.150447823 -0600
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-static void background_writeout(unsigned long _min_pages);
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes);
/*
* Work out the current dirty-memory clamping and background writeout
@@ -244,7 +244,7 @@ static void balance_dirty_pages(struct a
*/
if ((laptop_mode && pages_written) ||
(!laptop_mode && (nr_reclaimable > background_thresh)))
- pdflush_operation(background_writeout, 0);
+ pdflush_operation(background_writeout, 0, NULL);
}
void set_page_dirty_balance(struct page *page)
@@ -325,7 +325,7 @@ void throttle_vm_writeout(void)
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
-static void background_writeout(unsigned long _min_pages)
+static void background_writeout(unsigned long _min_pages, nodemask_t *unused)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
@@ -365,12 +365,12 @@ static void background_writeout(unsigned
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/
-int wakeup_pdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes)
{
if (nr_pages == 0)
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
+ return pdflush_operation(background_writeout, nr_pages, nodes);
}
static void wb_timer_fn(unsigned long unused);
@@ -394,7 +394,7 @@ static DEFINE_TIMER(laptop_mode_wb_timer
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
-static void wb_kupdate(unsigned long arg)
+static void wb_kupdate(unsigned long arg, nodemask_t *unused)
{
unsigned long oldest_jif;
unsigned long start_jif;
@@ -454,18 +454,18 @@ int dirty_writeback_centisecs_handler(ct
static void wb_timer_fn(unsigned long unused)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
+ if (pdflush_operation(wb_kupdate, 0, NULL) < 0)
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}
-static void laptop_flush(unsigned long unused)
+static void laptop_flush(unsigned long unused, nodemask_t *unused2)
{
sys_sync();
}
static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ pdflush_operation(laptop_flush, 0, NULL);
}
/*
Index: linux-2.6.20-rc5/mm/pdflush.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/pdflush.c 2007-01-15 21:34:38.582660664 -0600
+++ linux-2.6.20-rc5/mm/pdflush.c 2007-01-15 21:34:43.161190961 -0600
@@ -83,10 +83,12 @@ static unsigned long last_empty_jifs;
*/
struct pdflush_work {
struct task_struct *who; /* The thread */
- void (*fn)(unsigned long); /* A callback function */
+ void (*fn)(unsigned long, nodemask_t *); /* A callback function */
unsigned long arg0; /* An argument to the callback */
struct list_head list; /* On pdflush_list, when idle */
unsigned long when_i_went_to_sleep;
+ int have_nodes; /* Nodes were specified */
+ nodemask_t nodes; /* Nodes of interest */
};
static int __pdflush(struct pdflush_work *my_work)
@@ -123,7 +125,8 @@ static int __pdflush(struct pdflush_work
}
spin_unlock_irq(&pdflush_lock);
- (*my_work->fn)(my_work->arg0);
+ (*my_work->fn)(my_work->arg0,
+ my_work->have_nodes ? &my_work->nodes : NULL);
/*
* Thread creation: For how long have there been zero
@@ -197,7 +200,8 @@ static int pdflush(void *dummy)
* Returns zero if it indeed managed to find a worker thread, and passed your
* payload to it.
*/
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *),
+ unsigned long arg0, nodemask_t *nodes)
{
unsigned long flags;
int ret = 0;
@@ -217,6 +221,11 @@ int pdflush_operation(void (*fn)(unsigne
last_empty_jifs = jiffies;
pdf->fn = fn;
pdf->arg0 = arg0;
+ if (nodes) {
+ pdf->nodes = *nodes;
+ pdf->have_nodes = 1;
+ } else
+ pdf->have_nodes = 0;
wake_up_process(pdf->who);
spin_unlock_irqrestore(&pdflush_lock, flags);
}
Index: linux-2.6.20-rc5/mm/vmscan.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/vmscan.c 2007-01-15 21:34:38.592427153 -0600
+++ linux-2.6.20-rc5/mm/vmscan.c 2007-01-15 21:34:43.173887398 -0600
@@ -1065,7 +1065,7 @@ unsigned long try_to_free_pages(struct z
*/
if (total_scanned > sc.swap_cluster_max +
sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned, NULL);
sc.may_writepage = 1;
}
Index: linux-2.6.20-rc5/fs/buffer.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/buffer.c 2007-01-15 21:34:38.601216994 -0600
+++ linux-2.6.20-rc5/fs/buffer.c 2007-01-15 21:34:43.195373675 -0600
@@ -357,7 +357,7 @@ static void free_more_memory(void)
struct zone **zones;
pg_data_t *pgdat;
- wakeup_pdflush(1024);
+ wakeup_pdflush(1024, NULL);
yield();
for_each_online_pgdat(pgdat) {
Index: linux-2.6.20-rc5/fs/super.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/super.c 2007-01-15 21:34:38.610983483 -0600
+++ linux-2.6.20-rc5/fs/super.c 2007-01-15 21:34:43.208070111 -0600
@@ -618,7 +618,7 @@ int do_remount_sb(struct super_block *sb
return 0;
}
-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(unsigned long foo, nodemask_t *bar)
{
struct super_block *sb;
@@ -646,7 +646,7 @@ static void do_emergency_remount(unsigne
void emergency_remount(void)
{
- pdflush_operation(do_emergency_remount, 0);
+ pdflush_operation(do_emergency_remount, 0, NULL);
}
/*
Index: linux-2.6.20-rc5/fs/sync.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/sync.c 2007-01-15 21:34:38.622703271 -0600
+++ linux-2.6.20-rc5/fs/sync.c 2007-01-15 21:34:43.217836601 -0600
@@ -21,9 +21,9 @@
* sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
*/
-static void do_sync(unsigned long wait)
+static void do_sync(unsigned long wait, nodemask_t *unused)
{
- wakeup_pdflush(0);
+ wakeup_pdflush(0, NULL);
sync_inodes(0); /* All mappings, inodes and their blockdevs */
DQUOT_SYNC(NULL);
sync_supers(); /* Write the superblocks */
@@ -38,13 +38,13 @@ static void do_sync(unsigned long wait)
asmlinkage long sys_sync(void)
{
- do_sync(1);
+ do_sync(1, NULL);
return 0;
}
void emergency_sync(void)
{
- pdflush_operation(do_sync, 0);
+ pdflush_operation(do_sync, 0, NULL);
}
/*
Dynamically reduce the size of the nodemask_t in struct inode
The nodemask_t in struct inode can potentially waste a lot of memory if
MAX_NUMNODES is high. For IA64 MAX_NUMNODES is 1024 by default which
results in 128 bytes to be used for the nodemask. This means that the
memory use for inodes may increase significantly since they all now
include a dirty_map. These may be unecessarily large on smaller systems.
We placed the nodemask at the end of struct inode. This patch avoids
touching the later part of the nodemask if the actual maximum possible
node on the system is less than 1024. If MAX_NUMNODES is larger than
BITS_PER_LONG (and we may use more than one word for the nodemask) then
we calculate the number of bytes that may be taken off the end of
an inode. We can then create the inode caches without those bytes
effectively saving memory. On a IA64 system booting with a
maximum of 64 nodes we may save 120 of those 128 bytes per inode.
This is only done for filesystems that are typically used for NUMA
systems: xfs, nfs, ext3, ext4 and reiserfs. Other filesystems will
always use the full length of the inode.
This solution may be a bit hokey. I tried other approaches but this
one seemed to be the simplest with the least complications. Maybe someone
else can come up with a better solution?
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc5/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/xfs/linux-2.6/xfs_super.c 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/fs/xfs/linux-2.6/xfs_super.c 2007-01-15 22:35:07.596529498 -0600
@@ -370,7 +370,9 @@ xfs_fs_inode_init_once(
STATIC int
xfs_init_zones(void)
{
- xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
+ xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t)
+ - unused_numa_nodemask_bytes,
+ "xfs_vnode",
KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
KM_ZONE_SPREAD,
xfs_fs_inode_init_once);
Index: linux-2.6.20-rc5/include/linux/fs.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/fs.h 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/include/linux/fs.h 2007-01-15 22:35:07.621922373 -0600
@@ -591,6 +591,14 @@ struct inode {
void *i_private; /* fs or device private pointer */
#ifdef CONFIG_CPUSETS
nodemask_t dirty_nodes; /* Map of nodes with dirty pages */
+ /*
+ * Note that we may only use a portion of the bitmap in dirty_nodes
+ * if we have a large MAX_NUMNODES but the number of possible nodes
+ * is small in order to reduce the size of the inode.
+ *
+ * Bits after nr_node_ids (one node beyond the last possible
+ * node_id) may not be accessed.
+ */
#endif
};
Index: linux-2.6.20-rc5/fs/ext3/super.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/ext3/super.c 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/fs/ext3/super.c 2007-01-15 22:35:07.646338599 -0600
@@ -480,7 +480,8 @@ static void init_once(void * foo, struct
static int init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
- sizeof(struct ext3_inode_info),
+ sizeof(struct ext3_inode_info)
+ - unused_numa_nodemask_bytes,
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
init_once, NULL);
Index: linux-2.6.20-rc5/fs/inode.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/inode.c 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/fs/inode.c 2007-01-15 22:35:07.661964984 -0600
@@ -1399,7 +1399,8 @@ void __init inode_init(unsigned long mem
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
- sizeof(struct inode),
+ sizeof(struct inode)
+ - unused_numa_nodemask_bytes,
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD),
Index: linux-2.6.20-rc5/fs/reiserfs/super.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/reiserfs/super.c 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/fs/reiserfs/super.c 2007-01-15 22:35:07.685404561 -0600
@@ -525,11 +525,11 @@ static void init_once(void *foo, struct
static int init_inodecache(void)
{
reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
- sizeof(struct
- reiserfs_inode_info),
- 0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
- init_once, NULL);
+ sizeof(struct reiserfs_inode_info)
+ - unused_numa_nodemask_bytes,
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once, NULL);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
return 0;
Index: linux-2.6.20-rc5/include/linux/cpuset.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/cpuset.h 2007-01-15 22:34:37.000000000 -0600
+++ linux-2.6.20-rc5/include/linux/cpuset.h 2007-01-15 22:33:53.759908623 -0600
@@ -82,11 +82,13 @@ extern void cpuset_track_online_nodes(vo
node_set(page_to_nid(__page), (__inode)->dirty_nodes)
#define cpuset_clear_dirty_nodes(__inode) \
- (__inode)->dirty_nodes = NODE_MASK_NONE
+ bitmap_zero(nodes_addr((__inode)->dirty_nodes), nr_node_ids)
#define cpuset_intersects_dirty_nodes(__inode, __nodemask_ptr) \
- (!(__nodemask_ptr) || nodes_intersects((__inode)->dirty_nodes, \
- *(__nodemask_ptr)))
+ (!(__nodemask_ptr) || bitmap_intersects( \
+ nodes_addr((__inode)->dirty_nodes), \
+ nodes_addr(*(__nodemask_ptr)), \
+ nr_node_ids))
#else /* !CONFIG_CPUSETS */
Index: linux-2.6.20-rc5/include/linux/nodemask.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/nodemask.h 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/include/linux/nodemask.h 2007-01-15 22:35:07.712750734 -0600
@@ -353,6 +353,13 @@ extern nodemask_t node_possible_map;
#define first_online_node first_node(node_online_map)
#define next_online_node(nid) next_node((nid), node_online_map)
extern int nr_node_ids;
+
+#if MAX_NUMNODES > BITS_PER_LONG
+extern int unused_numa_nodemask_bytes;
+#else
+#define unused_numa_nodemask_bytes 0
+#endif
+
#else
#define num_online_nodes() 1
#define num_possible_nodes() 1
@@ -361,6 +368,7 @@ extern int nr_node_ids;
#define first_online_node 0
#define next_online_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1
+#define unused_numa_nodemask_bytes 0
#endif
#define any_online_node(mask) \
Index: linux-2.6.20-rc5/mm/page_alloc.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/page_alloc.c 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/mm/page_alloc.c 2007-01-15 22:35:07.735213662 -0600
@@ -663,6 +663,10 @@ static int rmqueue_bulk(struct zone *zon
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly;
EXPORT_SYMBOL(nr_node_ids);
+#if MAX_NUMNODES > BITS_PER_LONG
+int unused_numa_nodemask_bytes __read_mostly;
+EXPORT_SYMBOL(unused_numa_nodemask_bytes);
+#endif
/*
* Figure out the number of possible node ids.
@@ -675,6 +679,10 @@ static void __init setup_nr_node_ids(voi
for_each_node_mask(node, node_possible_map)
highest = node;
nr_node_ids = highest + 1;
+#if MAX_NUMNODES > BITS_PER_LONG
+ unused_numa_nodemask_bytes = BITS_TO_LONGS(MAX_NUMNODES)
+ - BITS_TO_LONGS(nr_node_ids);
+#endif
}
#else
static void __init setup_nr_node_ids(void) {}
Index: linux-2.6.20-rc5/fs/nfs/inode.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/nfs/inode.c 2007-01-15 22:33:55.000000000 -0600
+++ linux-2.6.20-rc5/fs/nfs/inode.c 2007-01-15 22:35:07.755723291 -0600
@@ -1136,7 +1136,8 @@ static void init_once(void * foo, struct
static int __init nfs_init_inodecache(void)
{
nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
- sizeof(struct nfs_inode),
+ sizeof(struct nfs_inode)
+ - unused_numa_nodemask_bytes,
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
init_once, NULL);
Consider unreclaimable pages during dirty limit calculation
Tracking unreclaimable pages helps us to calculate the dirty ratio
the right way. If a large number of unreclaimable pages are allocated
(through the slab or through huge pages) then write throttling will
no longer work since the limit cannot be reached anymore.
So we simply subtract the number of unreclaimable pages from the pages
considered for writeout threshold calculation.
Other code that allocates significant amounts of memory for device
drivers etc could also be modified to take advantage of this functionality.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc5/include/linux/mmzone.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/mmzone.h 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/include/linux/mmzone.h 2007-01-15 21:37:37.579950696 -0600
@@ -53,6 +53,7 @@ enum zone_stat_item {
NR_FILE_PAGES,
NR_SLAB_RECLAIMABLE,
NR_SLAB_UNRECLAIMABLE,
+ NR_UNRECLAIMABLE,
NR_PAGETABLE, /* used for pagetables */
NR_FILE_DIRTY,
NR_WRITEBACK,
Index: linux-2.6.20-rc5/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/proc/proc_misc.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/fs/proc/proc_misc.c 2007-01-15 21:37:37.641479580 -0600
@@ -174,6 +174,7 @@ static int meminfo_read_proc(char *page,
"Slab: %8lu kB\n"
"SReclaimable: %8lu kB\n"
"SUnreclaim: %8lu kB\n"
+ "Unreclaimabl: %8lu kB\n"
"PageTables: %8lu kB\n"
"NFS_Unstable: %8lu kB\n"
"Bounce: %8lu kB\n"
@@ -205,6 +206,7 @@ static int meminfo_read_proc(char *page,
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+ K(global_page_state(NR_UNRECLAIMABLE)),
K(global_page_state(NR_PAGETABLE)),
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
Index: linux-2.6.20-rc5/mm/hugetlb.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/hugetlb.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/mm/hugetlb.c 2007-01-15 21:37:37.664919155 -0600
@@ -115,6 +115,8 @@ static int alloc_fresh_huge_page(void)
nr_huge_pages_node[page_to_nid(page)]++;
spin_unlock(&hugetlb_lock);
put_page(page); /* free it into the hugepage allocator */
+ mod_zone_page_state(page_zone(page), NR_UNRECLAIMABLE,
+ HPAGE_SIZE / PAGE_SIZE);
return 1;
}
return 0;
@@ -183,6 +185,8 @@ static void update_and_free_page(struct
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1<< PG_writeback);
}
+ mod_zone_page_state(page_zone(page), NR_UNRECLAIMABLE,
+ - (HPAGE_SIZE / PAGE_SIZE));
page[1].lru.next = NULL;
set_page_refcounted(page);
__free_pages(page, HUGETLB_PAGE_ORDER);
Index: linux-2.6.20-rc5/mm/vmstat.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/vmstat.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/mm/vmstat.c 2007-01-15 21:37:37.686405431 -0600
@@ -459,6 +459,7 @@ static const char * const vmstat_text[]
"nr_file_pages",
"nr_slab_reclaimable",
"nr_slab_unreclaimable",
+ "nr_unreclaimable",
"nr_page_table_pages",
"nr_dirty",
"nr_writeback",
Index: linux-2.6.20-rc5/mm/page-writeback.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/page-writeback.c 2007-01-15 21:37:33.302228293 -0600
+++ linux-2.6.20-rc5/mm/page-writeback.c 2007-01-15 21:37:37.697148570 -0600
@@ -165,7 +165,9 @@ get_dirty_limits(struct dirty_limits *dl
dl->nr_writeback +=
node_page_state(node, NR_WRITEBACK);
available_memory +=
- NODE_DATA(node)->node_present_pages;
+ NODE_DATA(node)->node_present_pages
+ - node_page_state(node, NR_UNRECLAIMABLE)
+ - node_page_state(node, NR_SLAB_UNRECLAIMABLE);
#ifdef CONFIG_HIGHMEM
high_memory += NODE_DATA(node)
->node_zones[ZONE_HIGHMEM]->present_pages;
@@ -180,7 +182,9 @@ get_dirty_limits(struct dirty_limits *dl
dl->nr_dirty = global_page_state(NR_FILE_DIRTY);
dl->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
dl->nr_writeback = global_page_state(NR_WRITEBACK);
- available_memory = vm_total_pages;
+ available_memory = vm_total_pages
+ - global_page_state(NR_UNRECLAIMABLE)
+ - global_page_state(NR_SLAB_UNRECLAIMABLE);
high_memory = totalhigh_pages;
nr_mapped = global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES);
Index: linux-2.6.20-rc5/drivers/base/node.c
===================================================================
--- linux-2.6.20-rc5.orig/drivers/base/node.c 2007-01-12 12:54:26.000000000 -0600
+++ linux-2.6.20-rc5/drivers/base/node.c 2007-01-15 21:37:37.759654103 -0600
@@ -70,7 +70,8 @@ static ssize_t node_read_meminfo(struct
"Node %d Bounce: %8lu kB\n"
"Node %d Slab: %8lu kB\n"
"Node %d SReclaimable: %8lu kB\n"
- "Node %d SUnreclaim: %8lu kB\n",
+ "Node %d SUnreclaim: %8lu kB\n"
+ "Node %d Unreclaimabl: %8lu kB\n",
nid, K(i.totalram),
nid, K(i.freeram),
nid, K(i.totalram - i.freeram),
@@ -93,7 +94,8 @@ static ssize_t node_read_meminfo(struct
nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
- nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
+ nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+ nid, K(node_page_state(nid, NR_UNRECLAIMABLE)));
n += hugetlb_report_node_meminfo(nid, buf + n);
return n;
}
Make page writeback obey cpuset constraints
Currently dirty throttling does not work properly in a cpuset.
If f.e a cpuset contains only 1/10th of available memory then all of the
memory of a cpuset can be dirtied without any writes being triggered. If we
are writing to a device that is mounted via NFS then the write operation
may be terminated with OOM since NFS is not allowed to allocate more pages
for writeout.
If all of the cpusets memory is dirty then only 10% of total memory is dirty.
The background writeback threshold is usually set at 10% and the synchrononous
threshold at 40%. So we are still below the global limits while the dirty
ratio in the cpuset is 100%!
This patch makes dirty writeout cpuset aware. When determining the
dirty limits in get_dirty_limits() we calculate values based on the
nodes that are reachable from the current process (that has been
dirtying the page). Then we can trigger writeout based on the
dirty ratio of the memory in the cpuset.
We trigger writeout in a a cpuset specific way. We go through the dirty
inodes and search for inodes that have dirty pages on the nodes of the
active cpuset. If an inode fulfills that requirement then we begin writeout
of the dirty pages of that inode.
Adding up all the counters for each node in a cpuset may seem to be quite
an expensive operation (in particular for large cpusets with hundreds of
nodes) compared to just accessing the global counters if we do not have
a cpuset. However, please remember that I only recently introduced
the global counters. Before 2.6.18 we did add up per processor
counters for each processor on each invocation of get_dirty_limits().
We now add per node information which I think is equal or less effort
since there are less nodes than processors.
Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc5/include/linux/writeback.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/writeback.h 2007-01-15 21:34:43.000000000 -0600
+++ linux-2.6.20-rc5/include/linux/writeback.h 2007-01-15 21:37:05.209897874 -0600
@@ -59,11 +59,12 @@ struct writeback_control {
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
+ nodemask_t *nodes; /* Set of nodes of interest */
};
/*
* fs/fs-writeback.c
- */
+ */
void writeback_inodes(struct writeback_control *wbc);
void wake_up_inode(struct inode *inode);
int inode_wait(void *);
Index: linux-2.6.20-rc5/mm/page-writeback.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/page-writeback.c 2007-01-15 21:34:43.000000000 -0600
+++ linux-2.6.20-rc5/mm/page-writeback.c 2007-01-15 21:35:28.013794159 -0600
@@ -103,6 +103,14 @@ EXPORT_SYMBOL(laptop_mode);
static void background_writeout(unsigned long _min_pages, nodemask_t *nodes);
+struct dirty_limits {
+ long thresh_background;
+ long thresh_dirty;
+ unsigned long nr_dirty;
+ unsigned long nr_unstable;
+ unsigned long nr_writeback;
+};
+
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
@@ -120,31 +128,74 @@ static void background_writeout(unsigned
* We make sure that the background writeout level is below the adjusted
* clamping level.
*/
-static void
-get_dirty_limits(long *pbackground, long *pdirty,
- struct address_space *mapping)
+static int
+get_dirty_limits(struct dirty_limits *dl, struct address_space *mapping,
+ nodemask_t *nodes)
{
int background_ratio; /* Percentages */
int dirty_ratio;
int unmapped_ratio;
long background;
long dirty;
- unsigned long available_memory = vm_total_pages;
+ unsigned long available_memory;
+ unsigned long high_memory;
+ unsigned long nr_mapped;
struct task_struct *tsk;
+ int is_subset = 0;
+#ifdef CONFIG_CPUSETS
+ /*
+ * Calculate the limits relative to the current cpuset if necessary.
+ */
+ if (unlikely(nodes &&
+ !nodes_subset(node_online_map, *nodes))) {
+ int node;
+
+ is_subset = 1;
+ memset(dl, 0, sizeof(struct dirty_limits));
+ available_memory = 0;
+ high_memory = 0;
+ nr_mapped = 0;
+ for_each_node_mask(node, *nodes) {
+ if (!node_online(node))
+ continue;
+ dl->nr_dirty += node_page_state(node, NR_FILE_DIRTY);
+ dl->nr_unstable +=
+ node_page_state(node, NR_UNSTABLE_NFS);
+ dl->nr_writeback +=
+ node_page_state(node, NR_WRITEBACK);
+ available_memory +=
+ NODE_DATA(node)->node_present_pages;
+#ifdef CONFIG_HIGHMEM
+ high_memory += NODE_DATA(node)
+ ->node_zones[ZONE_HIGHMEM]->present_pages;
+#endif
+ nr_mapped += node_page_state(node, NR_FILE_MAPPED) +
+ node_page_state(node, NR_ANON_PAGES);
+ }
+ } else
+#endif
+ {
+ /* Global limits */
+ dl->nr_dirty = global_page_state(NR_FILE_DIRTY);
+ dl->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ dl->nr_writeback = global_page_state(NR_WRITEBACK);
+ available_memory = vm_total_pages;
+ high_memory = totalhigh_pages;
+ nr_mapped = global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES);
+ }
#ifdef CONFIG_HIGHMEM
/*
* If this mapping can only allocate from low memory,
* we exclude high memory from our count.
*/
if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
- available_memory -= totalhigh_pages;
+ available_memory -= high_memory;
#endif
- unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
- global_page_state(NR_ANON_PAGES)) * 100) /
- vm_total_pages;
+ unmapped_ratio = 100 - (nr_mapped * 100) / available_memory;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
@@ -164,8 +215,9 @@ get_dirty_limits(long *pbackground, long
background += background / 4;
dirty += dirty / 4;
}
- *pbackground = background;
- *pdirty = dirty;
+ dl->thresh_background = background;
+ dl->thresh_dirty = dirty;
+ return is_subset;
}
/*
@@ -178,8 +230,7 @@ get_dirty_limits(long *pbackground, long
static void balance_dirty_pages(struct address_space *mapping)
{
long nr_reclaimable;
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
@@ -194,11 +245,12 @@ static void balance_dirty_pages(struct a
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
- dirty_thresh)
+ if (get_dirty_limits(&dl, mapping,
+ &cpuset_current_mems_allowed))
+ wbc.nodes = &cpuset_current_mems_allowed;
+ nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+ if (nr_reclaimable + dl.nr_writeback <=
+ dl.thresh_dirty)
break;
if (!dirty_exceeded)
@@ -212,13 +264,10 @@ static void balance_dirty_pages(struct a
*/
if (nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable +
- global_page_state(NR_WRITEBACK)
- <= dirty_thresh)
+ get_dirty_limits(&dl, mapping,
+ &cpuset_current_mems_allowed);
+ nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+ if (nr_reclaimable + dl.nr_writeback <= dl.thresh_dirty)
break;
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
@@ -227,8 +276,8 @@ static void balance_dirty_pages(struct a
congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + global_page_state(NR_WRITEBACK)
- <= dirty_thresh && dirty_exceeded)
+ if (nr_reclaimable + dl.nr_writeback
+ <= dl.thresh_dirty && dirty_exceeded)
dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -243,8 +292,9 @@ static void balance_dirty_pages(struct a
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
- pdflush_operation(background_writeout, 0, NULL);
+ (!laptop_mode && (nr_reclaimable > dl.thresh_background)))
+ pdflush_operation(background_writeout, 0,
+ &cpuset_current_mems_allowed);
}
void set_page_dirty_balance(struct page *page)
@@ -301,21 +351,19 @@ EXPORT_SYMBOL(balance_dirty_pages_rateli
void throttle_vm_writeout(void)
{
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ get_dirty_limits(&dl, NULL, &node_online_map);
/*
* Boost the allowable dirty threshold a bit for page
* allocators so they don't get DoS'ed by heavy writers
*/
- dirty_thresh += dirty_thresh / 10; /* wheeee... */
+ dl.thresh_dirty += dl.thresh_dirty / 10; /* wheeee... */
- if (global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK) <= dirty_thresh)
- break;
+ if (dl.nr_unstable + dl.nr_writeback <= dl.thresh_dirty)
+ break;
congestion_wait(WRITE, HZ/10);
}
}
@@ -325,7 +373,7 @@ void throttle_vm_writeout(void)
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
-static void background_writeout(unsigned long _min_pages, nodemask_t *unused)
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
@@ -338,12 +386,11 @@ static void background_writeout(unsigned
};
for ( ; ; ) {
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
+ if (get_dirty_limits(&dl, NULL, nodes))
+ wbc.nodes = nodes;
+ if (dl.nr_dirty + dl.nr_unstable < dl.thresh_background
&& min_pages <= 0)
break;
wbc.encountered_congestion = 0;
Index: linux-2.6.20-rc5/fs/fs-writeback.c
===================================================================
--- linux-2.6.20-rc5.orig/fs/fs-writeback.c 2007-01-15 21:34:25.000000000 -0600
+++ linux-2.6.20-rc5/fs/fs-writeback.c 2007-01-15 21:35:28.028443893 -0600
@@ -352,6 +352,12 @@ sync_sb_inodes(struct super_block *sb, s
continue; /* blockdev has wrong queue */
}
+ if (!cpuset_intersects_dirty_nodes(inode, wbc->nodes)) {
+ /* No pages on the nodes under writeback */
+ list_move(&inode->i_list, &sb->s_dirty);
+ continue;
+ }
+
/* Was this inode dirtied after sync_sb_inodes was called? */
if (time_after(inode->dirtied_when, start))
break;
Throttle VM writeout in a cpuset aware way
This bases the vm throttling from the reclaim path on the dirty ratio
of the cpuset. Note that a cpuset is only effective if shrink_zone is called
from direct reclaim.
kswapd has a cpuset context that includes the whole machine and will
therefore not throttle unless global limits are reached.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.20-rc5/include/linux/writeback.h
===================================================================
--- linux-2.6.20-rc5.orig/include/linux/writeback.h 2007-01-15 21:37:05.209897874 -0600
+++ linux-2.6.20-rc5/include/linux/writeback.h 2007-01-15 21:37:33.283671963 -0600
@@ -85,7 +85,7 @@ static inline void wait_on_inode(struct
int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
void laptop_io_completion(void);
void laptop_sync_completion(void);
-void throttle_vm_writeout(void);
+void throttle_vm_writeout(nodemask_t *);
/* These are exported to sysctl. */
extern int dirty_background_ratio;
Index: linux-2.6.20-rc5/mm/page-writeback.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/page-writeback.c 2007-01-15 21:35:28.013794159 -0600
+++ linux-2.6.20-rc5/mm/page-writeback.c 2007-01-15 21:37:33.302228293 -0600
@@ -349,12 +349,12 @@ void balance_dirty_pages_ratelimited_nr(
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
-void throttle_vm_writeout(void)
+void throttle_vm_writeout(nodemask_t *nodes)
{
struct dirty_limits dl;
for ( ; ; ) {
- get_dirty_limits(&dl, NULL, &node_online_map);
+ get_dirty_limits(&dl, NULL, nodes);
/*
* Boost the allowable dirty threshold a bit for page
Index: linux-2.6.20-rc5/mm/vmscan.c
===================================================================
--- linux-2.6.20-rc5.orig/mm/vmscan.c 2007-01-15 21:37:26.605346439 -0600
+++ linux-2.6.20-rc5/mm/vmscan.c 2007-01-15 21:37:33.316878027 -0600
@@ -949,7 +949,7 @@ static unsigned long shrink_zone(int pri
}
}
- throttle_vm_writeout();
+ throttle_vm_writeout(&cpuset_current_mems_allowed);
atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;
On Mon, 2007-01-15 at 21:47 -0800, Christoph Lameter wrote:
> Currently cpusets are not able to do proper writeback since
> dirty ratio calculations and writeback are all done for the system
> as a whole. This may result in a large percentage of a cpuset
> to become dirty without writeout being triggered. Under NFS
> this can lead to OOM conditions.
>
> Writeback will occur during the LRU scans. But such writeout
> is not effective since we write page by page and not in inode page
> order (regular writeback).
>
> In order to fix the problem we first of all introduce a method to
> establish a map of nodes that contain dirty pages for each
> inode mapping.
>
> Secondly we modify the dirty limit calculation to be based
> on the acctive cpuset.
>
> If we are in a cpuset then we select only inodes for writeback
> that have pages on the nodes of the cpuset.
>
> After we have the cpuset throttling in place we can then make
> further fixups:
>
> A. We can do inode based writeout from direct reclaim
> avoiding single page writes to the filesystem.
>
> B. We add a new counter NR_UNRECLAIMABLE that is subtracted
> from the available pages in a node. This allows us to
> accurately calculate the dirty ratio even if large portions
> of the node have been allocated for huge pages or for
> slab pages.
What about mlock'ed pages?
> There are a couple of points where some better ideas could be used:
>
> 1. The nodemask expands the inode structure significantly if the
> architecture allows a high number of nodes. This is only an issue
> for IA64. For that platform we expand the inode structure by 128 byte
> (to support 1024 nodes). The last patch attempts to address the issue
> by using the knowledge about the maximum possible number of nodes
> determined on bootup to shrink the nodemask.
Not the prettiest indeed, no ideas though.
> 2. The calculation of the per cpuset limits can require looping
> over a number of nodes which may bring the performance of get_dirty_limits
> near pre 2.6.18 performance (before the introduction of the ZVC counters)
> (only for cpuset based limit calculation). There is no way of keeping these
> counters per cpuset since cpusets may overlap.
Well, you gain functionality, you loose some runtime, sad but probably
worth it.
Otherwise it all looks good.
Acked-by: Peter Zijlstra <[email protected]>
Christoph wrote:
> Currently cpusets are not able to do proper writeback since
> dirty ratio calculations and writeback are all done for the system
> as a whole.
Thanks for tackling this - it is sorely needed.
I'm afraid my review will be mostly cosmetic; I'm not competent
to comment on the really interesting stuff.
> If we are in a cpuset then we select only inodes for writeback
> that have pages on the nodes of the cpuset.
Sorry - you tripped over a subtle distinction that happens to be on my
list of things to notice.
When cpusets are configured, -all- tasks are in a cpuset. And
(correctly so, I trust) this patch doesn't look into the tasks cpuset
to see what nodes it allows. Rather it looks to the mems_allowed field
in the task struct, which is equal to or (when set_mempolicy is used) a
subset of that tasks cpusets allowed nodes.
Perhaps the following phrasing would be more accurate:
If CPUSETs are configured, then we select only the inodes for
writeback that have dirty pages on that tasks mems_allowed nodes.
> Secondly we modify the dirty limit calculation to be based
> on the acctive cpuset.
As above, perhaps the following would be more accurate:
Secondly we modify the dirty limit calculation to be based
on the current tasks mems_allowed nodes.
> 1. The nodemask expands the inode structure significantly if the
> architecture allows a high number of nodes. This is only an issue
> for IA64.
Should that logic be disabled if HOTPLUG is configured on? Or is
nr_node_ids a valid upper limit on what could be plugged in, even on a
mythical HOTPLUG system?
> 2. The calculation of the per cpuset limits can require looping
> over a number of nodes which may bring the performance of get_dirty_limits
> near pre 2.6.18 performance
Could we cache these limits? Perhaps they only need to be recalculated if
a tasks mems_allowed changes?
Separate question - what happens if a tasks mems_allowed changes while it is
dirtying pages? We could easily end up with dirty pages on nodes that are
no longer allowed to the task. Is there anyway that such a miscalculation
could cause us to do harmful things?
In patch 2/8:
> The dirty map is cleared when the inode is cleared. There is no
> synchronization (except for atomic nature of node_set) for the dirty_map. The
> only problem that could be done is that we do not write out an inode if a
> node bit is not set.
Does this mean that a dirty page could be left 'forever' in memory, unwritten,
exposing us to risk of data corruption on disk, from some write done weeks ago,
but unwritten, in the event of say a power loss?
Also in patch 2/8:
> +static inline void cpuset_update_dirty_nodes(struct inode *i,
> + struct page *page) {}
Is an incomplete 'struct inode;' declaration needed here in cpuset.h,
to avoid a warning if compiling with CPUSETS not configured?
In patch 4/8:
> We now add per node information which I think is equal or less effort
> since there are less nodes than processors.
Not so on Paul Menage's fake NUMA nodes - he can have say 64 fake nodes on
a system with 2 or 4 CPUs and one real node. But I guess that's ok ...
In patch 4/8:
> +#ifdef CONFIG_CPUSETS
> + /*
> + * Calculate the limits relative to the current cpuset if necessary.
> + */
> + if (unlikely(nodes &&
> + !nodes_subset(node_online_map, *nodes))) {
> + int node;
> +
> + is_subset = 1;
> + ...
> +#ifdef CONFIG_HIGHMEM
> + high_memory += NODE_DATA(node)
> + ->node_zones[ZONE_HIGHMEM]->present_pages;
> +#endif
> + nr_mapped += node_page_state(node, NR_FILE_MAPPED) +
> + node_page_state(node, NR_ANON_PAGES);
> + }
> + } else
> +#endif
> + {
I'm wishing there was a clearer way to write the above code. Nested
ifdef's and an ifdef block ending in an open 'else' and perhaps the first
#ifdef CONFIG_CPUSETS ever, outside of fs/proc/base.c ...
However I have no clue if such a clearer way exists. Sorry.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401
On Tue, 16 Jan 2007, Paul Jackson wrote:
> > 1. The nodemask expands the inode structure significantly if the
> > architecture allows a high number of nodes. This is only an issue
> > for IA64.
>
> Should that logic be disabled if HOTPLUG is configured on? Or is
> nr_node_ids a valid upper limit on what could be plugged in, even on a
> mythical HOTPLUG system?
nr_node_ids is a valid upper limit on what could be plugged in. We could
modify nodemasks to only use nr_node_ids bits and the kernel would still
be functioning correctly.
> > 2. The calculation of the per cpuset limits can require looping
> > over a number of nodes which may bring the performance of get_dirty_limits
> > near pre 2.6.18 performance
>
> Could we cache these limits? Perhaps they only need to be recalculated if
> a tasks mems_allowed changes?
No they change dynamically. In particular writeout reduces the number of
dirty / unstable pages.
> Separate question - what happens if a tasks mems_allowed changes while it is
> dirtying pages? We could easily end up with dirty pages on nodes that are
> no longer allowed to the task. Is there anyway that such a miscalculation
> could cause us to do harmful things?
The dirty_map on an inode is independent of a cpuset. The cpuset only
comes into effect when we decide to do writeout and are scanning for files
with pages on the nodes of interest.
> In patch 2/8:
> > The dirty map is cleared when the inode is cleared. There is no
> > synchronization (except for atomic nature of node_set) for the dirty_map. The
> > only problem that could be done is that we do not write out an inode if a
> > node bit is not set.
>
> Does this mean that a dirty page could be left 'forever' in memory, unwritten,
> exposing us to risk of data corruption on disk, from some write done weeks ago,
> but unwritten, in the event of say a power loss?
No it will age and be written out anyways. Note that there are usually
multiple dirty pages which reduces the chance of the race. These are node
bits that help to decide when to start writeout of all dirty pages of an
inode regardless of where the other pages are.
> Also in patch 2/8:
> > +static inline void cpuset_update_dirty_nodes(struct inode *i,
> > + struct page *page) {}
>
> Is an incomplete 'struct inode;' declaration needed here in cpuset.h,
> to avoid a warning if compiling with CPUSETS not configured?
Correct.
>
> In patch 4/8:
> > We now add per node information which I think is equal or less effort
> > since there are less nodes than processors.
>
> Not so on Paul Menage's fake NUMA nodes - he can have say 64 fake nodes on
> a system with 2 or 4 CPUs and one real node. But I guess that's ok ...
True but then its fake.
> In patch 4/8:
> > +#ifdef CONFIG_CPUSETS
> > + /*
> > + * Calculate the limits relative to the current cpuset if necessary.
> > + */
> > + if (unlikely(nodes &&
> > + !nodes_subset(node_online_map, *nodes))) {
> > + int node;
> > +
> > + is_subset = 1;
> > + ...
> > +#ifdef CONFIG_HIGHMEM
> > + high_memory += NODE_DATA(node)
> > + ->node_zones[ZONE_HIGHMEM]->present_pages;
> > +#endif
> > + nr_mapped += node_page_state(node, NR_FILE_MAPPED) +
> > + node_page_state(node, NR_ANON_PAGES);
> > + }
> > + } else
> > +#endif
> > + {
>
> I'm wishing there was a clearer way to write the above code. Nested
> ifdef's and an ifdef block ending in an open 'else' and perhaps the first
> #ifdef CONFIG_CPUSETS ever, outside of fs/proc/base.c ...
I have tried to replicate the structure for global dirty_limits
calculation which has the same ifdef.
On 1/15/07, Christoph Lameter <[email protected]> wrote:
>
> This solution may be a bit hokey. I tried other approaches but this
> one seemed to be the simplest with the least complications. Maybe someone
> else can come up with a better solution?
How about a 64-bit field in struct inode that's used as a bitmask if
there are no more than 64 nodes, and a pointer to a bitmask if there
are more than 64 nodes. The filesystems wouldn't need to be involved
then, as the bitmap allocation could be done in the generic code.
Paul
On Tue, 16 Jan 2007, Paul Menage wrote:
> On 1/15/07, Christoph Lameter <[email protected]> wrote:
> >
> > This solution may be a bit hokey. I tried other approaches but this
> > one seemed to be the simplest with the least complications. Maybe someone
> > else can come up with a better solution?
>
> How about a 64-bit field in struct inode that's used as a bitmask if
> there are no more than 64 nodes, and a pointer to a bitmask if there
> are more than 64 nodes. The filesystems wouldn't need to be involved
> then, as the bitmap allocation could be done in the generic code.
How would we decide if there are more than 64 nodes? Runtime or compile
time?
If done at compile time then we will end up with a pointer to an unsigned
long for a system with <= 64 nodes. If we allocate the nodemask via
kmalloc then we will always end up with a mininum allocation size of 64
bytes. Then there is the slab management overhead which will increase
overhead closer to the 128 byte wastage that we have now. So its likely
not worth it. There are some additional complications since there is a
need to free and allocate the nodemask during inode initialization and
disposal.
If the decision to allocate a node mask is made at run time then it gets
even more complex. And we still have the same troubling issues with the
64 byte mininum allocation by the slab.
On 1/16/07, Christoph Lameter <[email protected]> wrote:
> On Tue, 16 Jan 2007, Paul Menage wrote:
>
> > On 1/15/07, Christoph Lameter <[email protected]> wrote:
> > >
> > > This solution may be a bit hokey. I tried other approaches but this
> > > one seemed to be the simplest with the least complications. Maybe someone
> > > else can come up with a better solution?
> >
> > How about a 64-bit field in struct inode that's used as a bitmask if
> > there are no more than 64 nodes, and a pointer to a bitmask if there
> > are more than 64 nodes. The filesystems wouldn't need to be involved
> > then, as the bitmap allocation could be done in the generic code.
>
> How would we decide if there are more than 64 nodes? Runtime or compile
> time?
I was thinking runtime, unless MAX_NUMNODES is less than 64 in which
case you can make the decision at compile time.
>
> If done at compile time then we will end up with a pointer to an unsigned
> long for a system with <= 64 nodes. If we allocate the nodemask via
> kmalloc then we will always end up with a mininum allocation size of 64
> bytes.
Can't we get less overhead with a slab cache with appropriate-sized objects?
Paul
On Tue, 16 Jan 2007, Peter Zijlstra wrote:
> > B. We add a new counter NR_UNRECLAIMABLE that is subtracted
> > from the available pages in a node. This allows us to
> > accurately calculate the dirty ratio even if large portions
> > of the node have been allocated for huge pages or for
> > slab pages.
>
> What about mlock'ed pages?
mlocked pages can be dirty and written back right? So for the
dirty ratio calculation they do not play a role. We may need a
separate counter for mlocked pages if they are to be considered
for other decisions in the VM.
> Otherwise it all looks good.
>
> Acked-by: Peter Zijlstra <[email protected]>
Thanks.
On Tue, 16 Jan 2007, Paul Menage wrote:
> I was thinking runtime, unless MAX_NUMNODES is less than 64 in which
> case you can make the decision at compile time.
>
> >
> > If done at compile time then we will end up with a pointer to an unsigned
> > long for a system with <= 64 nodes. If we allocate the nodemask via
> > kmalloc then we will always end up with a mininum allocation size of 64
> > bytes.
>
> Can't we get less overhead with a slab cache with appropriate-sized objects?
Ok but then we are going to have quite small objects. Plus we will have
additional slab overhead per node.
> On Mon, 15 Jan 2007 21:47:43 -0800 (PST) Christoph Lameter <[email protected]> wrote:
>
> Currently cpusets are not able to do proper writeback since
> dirty ratio calculations and writeback are all done for the system
> as a whole.
We _do_ do proper writeback. But it's less efficient than it might be, and
there's an NFS problem.
> This may result in a large percentage of a cpuset
> to become dirty without writeout being triggered. Under NFS
> this can lead to OOM conditions.
OK, a big question: is this patchset a performance improvement or a
correctness fix? Given the above, and the lack of benchmark results I'm
assuming it's for correctness.
- Why does NFS go oom? Because it allocates potentially-unbounded
numbers of requests in the writeback path?
It was able to go oom on non-numa machines before dirty-page-tracking
went in. So a general problem has now become specific to some NUMA
setups.
We have earlier discussed fixing NFS to not do that. Make it allocate
a fixed number of requests and to then block. Just like
get_request_wait(). This is one reason why block_congestion_wait() and
friends got renamed to congestion_wait(): it's on the path to getting NFS
better aligned with how block devices are handling this.
- There's no reason which I can see why NFS _has_ to go oom. It could
just fail the memory allocation for the request and then wait for the
stuff which it _has_ submitted to complete. We do that for block
devices, backed by mempools.
- Why does NFS go oom if there's free memory in other nodes? I assume
that's what's happening, because things apparently work OK if you ask
pdflush to do exactly the thing which the direct-reclaim process was
attempting to do: allocate NFS requests and do writeback.
So an obvious, equivalent and vastly simpler "fix" would be to teach
the NFS client to go off-cpuset when trying to allocate these requests.
I suspect that if we do some or all of the above, NFS gets better and the
bug which motivated this patchset goes away.
But that being said, yes, allowing a zone to go 100% dirty like this is
bad, and it's be nice to be able to fix it.
(But is it really bad? What actual problems will it cause once NFS is fixed?)
Assuming that it is bad, yes, we'll obviously need the extra per-zone
dirty-memory accounting.
I don't understand why the proposed patches are cpuset-aware at all. This
is a per-zone problem, and a per-zone fix would seem to be appropriate, and
more general. For example, i386 machines can presumably get into trouble
if all of ZONE_DMA or ZONE_NORMAL get dirty. A good implementation would
address that problem as well. So I think it should all be per-zone?
Do we really need those per-inode cpumasks? When page reclaim encounters a
dirty page on the zone LRU, we automatically know that page->mapping->host
has at least one dirty page in this zone, yes? We could immediately ask
pdflush to write out some pages from that inode. We would need to take a
ref on the inode (while the page is locked, to avoid racing with inode
reclaim) and pass that inode off to pdflush (actually pass a list of such
inodes off to pdflush, keep appending to it).
Extra refinements would include
- telling pdflush the file offset of the page so it can do writearound
- getting pdflush to deactivate any pages which it writes out, so that
rotate_reclaimable_page() has a good chance of moving them to the tail of
the inactive list for immediate reclaim.
But all of this is, I think, unneeded if NFS is fixed. It's hopefully a
performance optimisation to permit writeout in a less seeky fashion.
Unless there's some other problem with excessively dirty zones.
> Secondly we modify the dirty limit calculation to be based
> on the acctive cpuset.
The global dirty limit definitely seems to be a problem
in several cases, but my feeling is that the cpuset is the wrong unit
to keep track of it. Most likely it should be more fine grained.
> If we are in a cpuset then we select only inodes for writeback
> that have pages on the nodes of the cpuset.
Is there any indication this change helps on smaller systems
or is it purely a large system optimization?
> B. We add a new counter NR_UNRECLAIMABLE that is subtracted
> from the available pages in a node. This allows us to
> accurately calculate the dirty ratio even if large portions
> of the node have been allocated for huge pages or for
> slab pages.
That sounds like a useful change by itself.
-Andi
Subject: nfs: fix congestion control
The current NFS client congestion logic is severely broken, it marks the
backing device congested during each nfs_writepages() call and implements
its own waitqueue.
Replace this by a more regular congestion implementation that puts a cap
on the number of active writeback pages and uses the bdi congestion waitqueue.
NFSv[34] commit pages are allowed to go unchecked as long as we are under
the dirty page limit and not in direct reclaim.
A buxom young lass from Neale's Flat,
Bore triplets, named Matt, Pat and Tat.
"Oh Lord," she protested,
"'Tis somewhat congested ...
"You've given me no tit for Tat."
Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/inode.c | 1
fs/nfs/pagelist.c | 8 +
fs/nfs/write.c | 257 +++++++++++++++++++++++++++++++++++---------
include/linux/backing-dev.h | 1
include/linux/nfs_fs_sb.h | 2
include/linux/nfs_page.h | 2
include/linux/writeback.h | 1
mm/backing-dev.c | 16 ++
mm/page-writeback.c | 6 +
9 files changed, 240 insertions(+), 54 deletions(-)
Index: linux-2.6-git/fs/nfs/write.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/write.c 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/fs/nfs/write.c 2007-01-12 09:31:41.000000000 +0100
@@ -89,8 +89,6 @@ static struct kmem_cache *nfs_wdata_cach
static mempool_t *nfs_wdata_mempool;
static mempool_t *nfs_commit_mempool;
-static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
-
struct nfs_write_data *nfs_commit_alloc(void)
{
struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
@@ -245,6 +243,91 @@ static int wb_priority(struct writeback_
}
/*
+ * NFS congestion control
+ *
+ * Congestion is composed of two parts, writeback and commit pages.
+ * Writeback pages are active, that is, all that is required to get out of
+ * writeback state is sit around and wait. Commit pages on the other hand
+ * are not and they need a little nudge to go away.
+ *
+ * Normally we want to maximise the number of commit pages for it allows the
+ * server to make best use of unstable storage. However when we are running
+ * low on memory we do need to reduce the commit pages.
+ *
+ * Hence writeback pages are managed in the conventional way, but for commit
+ * pages we poll on every write.
+ *
+ * The threshold is picked so that it allows for 16M of in flight data
+ * given current transfer speeds that seems reasonable.
+ */
+
+#define NFS_CONGESTION_SIZE 16
+#define NFS_CONGESTION_ON_THRESH (NFS_CONGESTION_SIZE << (20 - PAGE_SHIFT))
+#define NFS_CONGESTION_OFF_THRESH \
+ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+
+/*
+ * Include the commit pages into the congestion logic when there is either
+ * pressure from the total dirty limit or when we're in direct reclaim.
+ */
+static inline int commit_pressure(struct inode *inode)
+{
+ return dirty_pages_exceeded(inode->i_mapping) ||
+ (current->flags & PF_MEMALLOC);
+}
+
+static void nfs_congestion_on(struct inode *inode)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ if (atomic_read(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(&nfss->backing_dev_info, WRITE);
+}
+
+static void nfs_congestion_off(struct inode *inode)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ if (atomic_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+ clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+ congestion_end(WRITE);
+ }
+}
+
+static inline void nfs_set_page_writeback(struct page *page)
+{
+ if (!test_set_page_writeback(page)) {
+ struct inode *inode = page->mapping->host;
+ atomic_inc(&NFS_SERVER(inode)->writeback);
+ nfs_congestion_on(inode);
+ }
+}
+
+static inline void nfs_end_page_writeback(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ end_page_writeback(page);
+ atomic_dec(&NFS_SERVER(inode)->writeback);
+ nfs_congestion_off(inode);
+}
+
+static inline void nfs_set_page_commit(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ atomic_inc(&NFS_SERVER(inode)->commit);
+ nfs_congestion_on(inode);
+}
+
+static inline void nfs_end_page_commit(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ dec_zone_page_state(page, NR_UNSTABLE_NFS);
+ atomic_dec(&NFS_SERVER(inode)->commit);
+ nfs_congestion_off(inode);
+}
+
+/*
* Find an associated nfs write request, and prepare to flush it out
* Returns 1 if there was no write request, or if the request was
* already tagged by nfs_set_page_dirty.Returns 0 if the request
@@ -281,7 +364,7 @@ static int nfs_page_mark_flush(struct pa
spin_unlock(req_lock);
if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
nfs_mark_request_dirty(req);
- set_page_writeback(page);
+ nfs_set_page_writeback(page);
}
ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
nfs_unlock_request(req);
@@ -336,13 +419,8 @@ int nfs_writepage(struct page *page, str
return err;
}
-/*
- * Note: causes nfs_update_request() to block on the assumption
- * that the writeback is generated due to memory pressure.
- */
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
struct inode *inode = mapping->host;
int err;
@@ -351,11 +429,6 @@ int nfs_writepages(struct address_space
err = generic_writepages(mapping, wbc);
if (err)
return err;
- while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
- if (wbc->nonblocking)
- return 0;
- nfs_wait_on_write_congestion(mapping, 0);
- }
err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
if (err < 0)
goto out;
@@ -369,9 +442,6 @@ int nfs_writepages(struct address_space
if (err > 0)
err = 0;
out:
- clear_bit(BDI_write_congested, &bdi->state);
- wake_up_all(&nfs_write_congestion);
- congestion_end(WRITE);
return err;
}
@@ -401,7 +471,7 @@ static int nfs_inode_add_request(struct
}
/*
- * Insert a write request into an inode
+ * Remove a write request from an inode
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
@@ -473,7 +543,7 @@ nfs_mark_request_commit(struct nfs_page
nfs_list_add_request(req, &nfsi->commit);
nfsi->ncommit++;
spin_unlock(&nfsi->req_lock);
- inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ nfs_set_page_commit(req->wb_page);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
#endif
@@ -544,7 +614,7 @@ static void nfs_cancel_commit_list(struc
while(!list_empty(head)) {
req = nfs_list_entry(head->next);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ nfs_end_page_commit(req->wb_page);
nfs_list_remove_request(req);
nfs_inode_remove_request(req);
nfs_unlock_request(req);
@@ -563,13 +633,14 @@ static void nfs_cancel_commit_list(struc
* The requests are *not* checked to ensure that they form a contiguous set.
*/
static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+nfs_scan_commit(struct inode *inode, struct list_head *dst,
+ unsigned long idx_start, unsigned int npages, unsigned int maxpages)
{
struct nfs_inode *nfsi = NFS_I(inode);
int res = 0;
if (nfsi->ncommit != 0) {
- res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
+ res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages, maxpages);
nfsi->ncommit -= res;
if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -577,48 +648,93 @@ nfs_scan_commit(struct inode *inode, str
return res;
}
#else
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+ unsigned long idx_start, unsigned int npages, unsigned int maxpages)
{
return 0;
}
#endif
+static int nfs_do_commit(struct super_block *sb, int npages);
+
+static int nfs_commit_congestion(struct inode *inode)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ int thresh, excess_commit;
+ int ret = 0;
+
+ if (!commit_pressure(inode))
+ goto out;
+
+ thresh = atomic_read(&nfss->writeback);
+ if (thresh < 128)
+ thresh = 128;
+ if (thresh > NFS_CONGESTION_OFF_THRESH)
+ thresh = NFS_CONGESTION_OFF_THRESH;
+
+ excess_commit = atomic_read(&nfss->commit) - thresh;
+
+ /*
+ * limit the flush to stable storage - this should
+ * avoid flushing most of the unstable pages when there
+ * is not much real pressure but they just tripped the
+ * dirty page boundary.
+ */
+ thresh /= 2;
+ if (excess_commit > thresh)
+ excess_commit = thresh;
+
+ while (excess_commit > 0 && commit_pressure(inode)) {
+ int npages = min(16, excess_commit);
+ npages = nfs_do_commit(inode->i_sb, npages);
+ if (npages <= 0)
+ break;
+ excess_commit -= npages;
+ ret = 1;
+ cond_resched();
+ }
+out:
+ return ret;
+}
+
static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
{
+ struct inode *inode = mapping->host;
struct backing_dev_info *bdi = mapping->backing_dev_info;
- DEFINE_WAIT(wait);
int ret = 0;
might_sleep();
- if (!bdi_write_congested(bdi))
+ if (!bdi_write_congested(bdi)) {
+ nfs_commit_congestion(inode);
return 0;
+ }
- nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
+ nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
- if (intr) {
- struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
- sigset_t oldset;
-
- rpc_clnt_sigmask(clnt, &oldset);
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
- if (bdi_write_congested(bdi)) {
- if (signalled())
- ret = -ERESTARTSYS;
- else
- schedule();
+ do {
+ if (nfs_commit_congestion(inode) &&
+ !bdi_write_congested(bdi))
+ break;
+
+ if (intr) {
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ sigset_t oldset;
+
+ rpc_clnt_sigmask(clnt, &oldset);
+ ret = congestion_wait_interruptible(WRITE, HZ/10);
+ rpc_clnt_sigunmask(clnt, &oldset);
+ if (ret == -ERESTARTSYS)
+ return ret;
+ ret = 0;
+ } else {
+ congestion_wait(WRITE, HZ/10);
}
- rpc_clnt_sigunmask(clnt, &oldset);
- } else {
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
- if (bdi_write_congested(bdi))
- schedule();
- }
- finish_wait(&nfs_write_congestion, &wait);
+ } while (bdi_write_congested(bdi));
+
return ret;
}
-
/*
* Try to update any existing write request, or create one if there is none.
* In order to match, the request's credentials must match those of
@@ -779,7 +895,7 @@ int nfs_updatepage(struct file *file, st
static void nfs_writepage_release(struct nfs_page *req)
{
- end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req->wb_page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (!PageError(req->wb_page)) {
@@ -1095,12 +1211,12 @@ static void nfs_writeback_done_full(stru
ClearPageUptodate(page);
SetPageError(page);
req->wb_context->error = task->tk_status;
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
nfs_inode_remove_request(req);
dprintk(", error = %d\n", task->tk_status);
goto next;
}
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
@@ -1277,7 +1393,7 @@ nfs_commit_list(struct inode *inode, str
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_mark_request_commit(req);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ nfs_end_page_commit(req->wb_page);
nfs_clear_page_writeback(req);
}
return -ENOMEM;
@@ -1301,7 +1417,7 @@ static void nfs_commit_done(struct rpc_t
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ nfs_end_page_commit(req->wb_page);
dprintk("NFS: commit (%s/%Ld %d@%Ld)",
req->wb_context->dentry->d_inode->i_sb->s_id,
@@ -1367,7 +1483,7 @@ int nfs_commit_inode(struct inode *inode
int res;
spin_lock(&nfsi->req_lock);
- res = nfs_scan_commit(inode, &head, 0, 0);
+ res = nfs_scan_commit(inode, &head, 0, 0, 0);
spin_unlock(&nfsi->req_lock);
if (res) {
int error = nfs_commit_list(inode, &head, how);
@@ -1378,6 +1494,43 @@ int nfs_commit_inode(struct inode *inode
}
#endif
+static int nfs_do_commit(struct super_block *sb, int npages)
+{
+ struct inode *inode;
+ int ret = 0;
+
+ down_read(&sb->s_umount);
+ spin_lock(&inode_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ struct nfs_inode *nsfi = NFS_I(inode);
+ LIST_HEAD(head);
+ int res;
+
+ if (inode->i_state & (I_FREEING|I_WILL_FREE))
+ continue;
+
+ spin_lock(&nsfi->req_lock);
+ res = nfs_scan_commit(inode, &head, 0, 0, npages);
+ spin_unlock(&nsfi->req_lock);
+ ret += res;
+ npages -= res;
+ if (res) {
+ res = nfs_commit_list(inode, &head, FLUSH_HIGHPRI);
+ if (res < 0) {
+ ret = res;
+ break;
+ }
+ }
+
+ if (npages <= 0)
+ break;
+ }
+ spin_unlock(&inode_lock);
+ up_read(&sb->s_umount);
+
+ return ret;
+}
+
long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
{
struct inode *inode = mapping->host;
@@ -1424,7 +1577,7 @@ long nfs_sync_mapping_wait(struct addres
continue;
if (nocommit)
break;
- pages = nfs_scan_commit(inode, &head, idx_start, npages);
+ pages = nfs_scan_commit(inode, &head, idx_start, npages, 0);
if (pages == 0) {
if (wbc->pages_skipped != 0)
continue;
@@ -1437,7 +1590,7 @@ long nfs_sync_mapping_wait(struct addres
spin_lock(&nfsi->req_lock);
continue;
}
- pages += nfs_scan_commit(inode, &head, 0, 0);
+ pages += nfs_scan_commit(inode, &head, 0, 0, 0);
spin_unlock(&nfsi->req_lock);
ret = nfs_commit_list(inode, &head, how);
spin_lock(&nfsi->req_lock);
Index: linux-2.6-git/mm/backing-dev.c
===================================================================
--- linux-2.6-git.orig/mm/backing-dev.c 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/mm/backing-dev.c 2007-01-12 08:53:26.000000000 +0100
@@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeou
}
EXPORT_SYMBOL(congestion_wait);
+long congestion_wait_interruptible(int rw, long timeout)
+{
+ long ret;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ ret = -ERESTARTSYS;
+ else
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait_interruptible);
+
/**
* congestion_end - wake up sleepers on a congested backing_dev_info
* @rw: READ or WRITE
Index: linux-2.6-git/include/linux/nfs_fs_sb.h
===================================================================
--- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-12 08:53:26.000000000 +0100
@@ -82,6 +82,8 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nfs_iostats * io_stats; /* I/O statistics */
struct backing_dev_info backing_dev_info;
+ atomic_t writeback; /* number of writeback pages */
+ atomic_t commit; /* number of commit pages */
int flags; /* various flags */
unsigned int caps; /* server capabilities */
unsigned int rsize; /* read size */
Index: linux-2.6-git/include/linux/backing-dev.h
===================================================================
--- linux-2.6-git.orig/include/linux/backing-dev.h 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/include/linux/backing-dev.h 2007-01-12 08:53:26.000000000 +0100
@@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struc
void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
void set_bdi_congested(struct backing_dev_info *bdi, int rw);
long congestion_wait(int rw, long timeout);
+long congestion_wait_interruptible(int rw, long timeout);
void congestion_end(int rw);
#define bdi_cap_writeback_dirty(bdi) \
Index: linux-2.6-git/include/linux/writeback.h
===================================================================
--- linux-2.6-git.orig/include/linux/writeback.h 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/include/linux/writeback.h 2007-01-12 08:53:26.000000000 +0100
@@ -100,6 +100,7 @@ int dirty_writeback_centisecs_handler(st
void __user *, size_t *, loff_t *);
void page_writeback_init(void);
+int dirty_pages_exceeded(struct address_space *mapping);
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied);
Index: linux-2.6-git/mm/page-writeback.c
===================================================================
--- linux-2.6-git.orig/mm/page-writeback.c 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/mm/page-writeback.c 2007-01-12 08:53:26.000000000 +0100
@@ -167,6 +167,12 @@ get_dirty_limits(long *pbackground, long
*pdirty = dirty;
}
+int dirty_pages_exceeded(struct address_space *mapping)
+{
+ return dirty_exceeded;
+}
+EXPORT_SYMBOL_GPL(dirty_pages_exceeded);
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
Index: linux-2.6-git/fs/nfs/pagelist.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/pagelist.c 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/fs/nfs/pagelist.c 2007-01-12 08:53:26.000000000 +0100
@@ -356,7 +356,7 @@ out:
*/
int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
struct list_head *dst, unsigned long idx_start,
- unsigned int npages)
+ unsigned int npages, unsigned int maxpages)
{
struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
struct nfs_page *req;
@@ -370,6 +370,9 @@ int nfs_scan_list(struct nfs_inode *nfsi
else
idx_end = idx_start + npages - 1;
+ if (maxpages == 0)
+ maxpages = ~0;
+
for (;;) {
found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
(void **)&pgvec[0], idx_start,
@@ -387,7 +390,10 @@ int nfs_scan_list(struct nfs_inode *nfsi
nfs_list_remove_request(req);
nfs_list_add_request(req, dst);
res++;
+ maxpages--;
}
+ if (!maxpages)
+ goto out;
}
}
Index: linux-2.6-git/include/linux/nfs_page.h
===================================================================
--- linux-2.6-git.orig/include/linux/nfs_page.h 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/include/linux/nfs_page.h 2007-01-12 08:53:26.000000000 +0100
@@ -66,7 +66,7 @@ extern long nfs_scan_dirty(struct addres
struct writeback_control *wbc,
struct list_head *dst);
extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, struct list_head *dst,
- unsigned long idx_start, unsigned int npages);
+ unsigned long idx_start, unsigned int npages, unsigned int maxpages);
extern int nfs_coalesce_requests(struct list_head *, struct list_head *,
unsigned int);
extern int nfs_wait_on_request(struct nfs_page *);
Index: linux-2.6-git/fs/inode.c
===================================================================
--- linux-2.6-git.orig/fs/inode.c 2007-01-12 08:03:47.000000000 +0100
+++ linux-2.6-git/fs/inode.c 2007-01-12 08:53:26.000000000 +0100
@@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
* the i_state of an inode while it is in use..
*/
DEFINE_SPINLOCK(inode_lock);
+EXPORT_SYMBOL_GPL(inode_lock);
/*
* iprune_mutex provides exclusion between the kswapd or try_to_free_pages
On Tue, 16 Jan 2007, Andrew Morton wrote:
> > On Mon, 15 Jan 2007 21:47:43 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> >
> > Currently cpusets are not able to do proper writeback since
> > dirty ratio calculations and writeback are all done for the system
> > as a whole.
>
> We _do_ do proper writeback. But it's less efficient than it might be, and
> there's an NFS problem.
Well yes we write back during LRU scans when a potentially high percentage
of the memory in cpuset is dirty.
> > This may result in a large percentage of a cpuset
> > to become dirty without writeout being triggered. Under NFS
> > this can lead to OOM conditions.
>
> OK, a big question: is this patchset a performance improvement or a
> correctness fix? Given the above, and the lack of benchmark results I'm
> assuming it's for correctness.
It is a correctness fix both for NFS OOM and doing proper cpuset writeout.
> - Why does NFS go oom? Because it allocates potentially-unbounded
> numbers of requests in the writeback path?
>
> It was able to go oom on non-numa machines before dirty-page-tracking
> went in. So a general problem has now become specific to some NUMA
> setups.
Right. The issue is that large portions of memory become dirty /
writeback since no writeback occurs because dirty limits are not checked
for a cpuset. Then NFS attempt to writeout when doing LRU scans but is
unable to allocate memory.
> So an obvious, equivalent and vastly simpler "fix" would be to teach
> the NFS client to go off-cpuset when trying to allocate these requests.
Yes we can fix these allocations by allowing processes to allocate from
other nodes. But then the container function of cpusets is no longer
there.
> (But is it really bad? What actual problems will it cause once NFS is fixed?)
NFS is okay as far as I can tell. dirty throttling works fine in non
cpuset environments because we throttle if 40% of memory becomes dirty or
under writeback.
> I don't understand why the proposed patches are cpuset-aware at all. This
> is a per-zone problem, and a per-zone fix would seem to be appropriate, and
> more general. For example, i386 machines can presumably get into trouble
> if all of ZONE_DMA or ZONE_NORMAL get dirty. A good implementation would
> address that problem as well. So I think it should all be per-zone?
No. A zone can be completely dirty as long as we are allowed to allocate
from other zones.
> Do we really need those per-inode cpumasks? When page reclaim encounters a
> dirty page on the zone LRU, we automatically know that page->mapping->host
> has at least one dirty page in this zone, yes? We could immediately ask
Yes, but when we enter reclaim most of the pages of a zone may already be
dirty/writeback so we fail. Also when we enter reclaim we may not have
the proper process / cpuset context. There is no use to throttle kswapd.
We need to throttle the process that is dirtying memory.
> But all of this is, I think, unneeded if NFS is fixed. It's hopefully a
> performance optimisation to permit writeout in a less seeky fashion.
> Unless there's some other problem with excessively dirty zones.
The patchset improves performance because the filesystem can do sequential
writeouts. So yes in some ways this is a performance improvement. But this
is only because this patch makes dirty throttling for cpusets work in the
same way as for non NUMA system.
On Wed, 17 Jan 2007, Andi Kleen wrote:
> > Secondly we modify the dirty limit calculation to be based
> > on the acctive cpuset.
>
> The global dirty limit definitely seems to be a problem
> in several cases, but my feeling is that the cpuset is the wrong unit
> to keep track of it. Most likely it should be more fine grained.
We already have zone reclaim that can take care of smaller units but why
would we start writeback if only one zone is full of dirty pages and there
are lots of other zones (nodes) that are free?
> > If we are in a cpuset then we select only inodes for writeback
> > that have pages on the nodes of the cpuset.
>
> Is there any indication this change helps on smaller systems
> or is it purely a large system optimization?
The bigger the system the larger the problem because the ratio of dirty
pages is calculated is currently based on the percentage of dirty pages
in the system as a whole. The less percentage of a system a cpuset
contains the less effective the dirty_ratio and background_dirty_ratio
become.
> > B. We add a new counter NR_UNRECLAIMABLE that is subtracted
> > from the available pages in a node. This allows us to
> > accurately calculate the dirty ratio even if large portions
> > of the node have been allocated for huge pages or for
> > slab pages.
>
> That sounds like a useful change by itself.
I can separate that one out.
On Tue, 2007-01-16 at 23:08 +0100, Peter Zijlstra wrote:
> Subject: nfs: fix congestion control
>
> The current NFS client congestion logic is severely broken, it marks the
> backing device congested during each nfs_writepages() call and implements
> its own waitqueue.
>
> Replace this by a more regular congestion implementation that puts a cap
> on the number of active writeback pages and uses the bdi congestion waitqueue.
>
> NFSv[34] commit pages are allowed to go unchecked as long as we are under
> the dirty page limit and not in direct reclaim.
>
> A buxom young lass from Neale's Flat,
> Bore triplets, named Matt, Pat and Tat.
> "Oh Lord," she protested,
> "'Tis somewhat congested ...
> "You've given me no tit for Tat."
What on earth is the point of adding congestion control to COMMIT?
Strongly NACKed.
Why 16MB of on-the-wire data? Why not 32, or 128, or ...
Solaris already allows you to send 2MB of write data in a single RPC
request, and the RPC engine has for some time allowed you to tune the
number of simultaneous RPC requests you have on the wire: Chuck has
already shown that read/write performance is greatly improved by upping
that value to 64 or more in the case of RPC over TCP. Why are we then
suddenly telling people that they are limited to 8 simultaneous writes?
Trond
> On Tue, 16 Jan 2007 14:15:56 -0800 (PST) Christoph Lameter <[email protected]> wrote:
>
> ...
>
> > > This may result in a large percentage of a cpuset
> > > to become dirty without writeout being triggered. Under NFS
> > > this can lead to OOM conditions.
> >
> > OK, a big question: is this patchset a performance improvement or a
> > correctness fix? Given the above, and the lack of benchmark results I'm
> > assuming it's for correctness.
>
> It is a correctness fix both for NFS OOM and doing proper cpuset writeout.
It's a workaround for a still-unfixed NFS problem.
> > - Why does NFS go oom? Because it allocates potentially-unbounded
> > numbers of requests in the writeback path?
> >
> > It was able to go oom on non-numa machines before dirty-page-tracking
> > went in. So a general problem has now become specific to some NUMA
> > setups.
>
>
> Right. The issue is that large portions of memory become dirty /
> writeback since no writeback occurs because dirty limits are not checked
> for a cpuset. Then NFS attempt to writeout when doing LRU scans but is
> unable to allocate memory.
>
> > So an obvious, equivalent and vastly simpler "fix" would be to teach
> > the NFS client to go off-cpuset when trying to allocate these requests.
>
> Yes we can fix these allocations by allowing processes to allocate from
> other nodes. But then the container function of cpusets is no longer
> there.
But that's what your patch already does!
It asks pdflush to write the pages instead of the direct-reclaim caller.
The only reason pdflush doesn't go oom is that pdflush lives outside the
direct-reclaim caller's cpuset and is hence able to obtain those nfs
requests from off-cpuset zones.
> > (But is it really bad? What actual problems will it cause once NFS is fixed?)
>
> NFS is okay as far as I can tell. dirty throttling works fine in non
> cpuset environments because we throttle if 40% of memory becomes dirty or
> under writeback.
Repeat: NFS shouldn't go oom. It should fail the allocation, recover, wait
for existing IO to complete. Back that up with a mempool for NFS requests
and the problem is solved, I think?
> > I don't understand why the proposed patches are cpuset-aware at all. This
> > is a per-zone problem, and a per-zone fix would seem to be appropriate, and
> > more general. For example, i386 machines can presumably get into trouble
> > if all of ZONE_DMA or ZONE_NORMAL get dirty. A good implementation would
> > address that problem as well. So I think it should all be per-zone?
>
> No. A zone can be completely dirty as long as we are allowed to allocate
> from other zones.
But we also can get into trouble if a *zone* is all-dirty. Any solution to
the cpuset problem should solve that problem too, no?
> > Do we really need those per-inode cpumasks? When page reclaim encounters a
> > dirty page on the zone LRU, we automatically know that page->mapping->host
> > has at least one dirty page in this zone, yes? We could immediately ask
>
> Yes, but when we enter reclaim most of the pages of a zone may already be
> dirty/writeback so we fail.
No. If the dirty limits become per-zone then no zone will ever have >40%
dirty.
The obvious fix here is: when a zone hits 40% dirty, perform dirty-memory
reduction in that zone, throttling the dirtying process. I suspect this
would work very badly in common situations with, say, typical i386 boxes.
On Tue, Jan 16, 2007 at 01:53:25PM -0800, Andrew Morton wrote:
> > On Mon, 15 Jan 2007 21:47:43 -0800 (PST) Christoph Lameter
> > <[email protected]> wrote:
> >
> > Currently cpusets are not able to do proper writeback since dirty ratio
> > calculations and writeback are all done for the system as a whole.
>
> We _do_ do proper writeback. But it's less efficient than it might be, and
> there's an NFS problem.
>
> > This may result in a large percentage of a cpuset to become dirty without
> > writeout being triggered. Under NFS this can lead to OOM conditions.
>
> OK, a big question: is this patchset a performance improvement or a
> correctness fix? Given the above, and the lack of benchmark results I'm
> assuming it's for correctness.
Given that we've already got a 25-30% buffered write performance
degradation between 2.6.18 and 2.6.20-rc4 for simple sequential
write I/O to multiple filesystems concurrently, I'd really like
to see some serious I/O performance regression testing on this
change before it goes anywhere.
Cheers,
Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group
On Tue, 16 Jan 2007, Andrew Morton wrote:
> It's a workaround for a still-unfixed NFS problem.
No its doing proper throttling. Without this patchset there will *no*
writeback and throttling at all. F.e. lets say we have 20 nodes of 1G each
and a cpuset that only spans one node.
Then a process runniung in that cpuset can dirty all of memory and still
continue running without writeback continuing. background dirty ratio
is at 10% and the dirty ratio at 40%. Neither of those boundaries can ever
be reached because the process will only ever be able to dirty memory on
one node which is 5%. There will be no throttling, no background
writeback, no blocking for dirty pages.
At some point we run into reclaim (possibly we have ~99% of of the cpuset
dirty) and then we trigger writeout. Okay so if the filesystem / block
device is robust enough and does not require memory allocations then we
likely will survive that and do slow writeback page by page from the LRU.
writback is completely hosed for that situation. This patch restores
expected behavior in a cpuset (which is a form of system partition that
should mirror the system as a whole). At 10% dirty we should start
background writeback and at 40% we should block. If that is done then even
fragile combinations of filesystem/block devices will work as they do
without cpusets.
> > Yes we can fix these allocations by allowing processes to allocate from
> > other nodes. But then the container function of cpusets is no longer
> > there.
> But that's what your patch already does!
The patchset does not allow processes to allocate from other nodes than
the current cpuset. There is no change as to the source of memory
allocations.
> > NFS is okay as far as I can tell. dirty throttling works fine in non
> > cpuset environments because we throttle if 40% of memory becomes dirty or
> > under writeback.
>
> Repeat: NFS shouldn't go oom. It should fail the allocation, recover, wait
> for existing IO to complete. Back that up with a mempool for NFS requests
> and the problem is solved, I think?
AFAIK any filesyste/block device can go oom with the current broken
writeback it just does a few allocations. Its a matter of hitting the
sweet spots.
> But we also can get into trouble if a *zone* is all-dirty. Any solution to
> the cpuset problem should solve that problem too, no?
Nope. Why would a dirty zone pose a problem? The proble exist if you
cannot allocate more memory. If a cpuset contains a single node which is a
single zone then this patchset will also address that issue.
If we have multiple zones then other zones may still provide memory to
continue (same as in UP).
> > Yes, but when we enter reclaim most of the pages of a zone may already be
> > dirty/writeback so we fail.
>
> No. If the dirty limits become per-zone then no zone will ever have >40%
> dirty.
I am still confused as to why you would want per zone dirty limits?
Lets say we have a cpuset with 4 nodes (thus 4 zones) and we are running
on the first node. Then we copy a large file to disk. Node local
allocation means that we allocate from the first node. After we reach 40%
of the node then we throttle? This is going to be a significant
performance degradation since we can no longer use the memory of other
nodes to buffer writeout.
> The obvious fix here is: when a zone hits 40% dirty, perform dirty-memory
> reduction in that zone, throttling the dirtying process. I suspect this
> would work very badly in common situations with, say, typical i386 boxes.
Absolute crap. You can prototype that broken behavior with zone reclaim by
the way. Just switch on writeback during zone reclaim and watch how memory
on a cpuset is unused and how the system becomes slow as molasses.
> On Tue, 16 Jan 2007 16:16:30 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> On Tue, 16 Jan 2007, Andrew Morton wrote:
>
> > It's a workaround for a still-unfixed NFS problem.
>
> No its doing proper throttling. Without this patchset there will *no*
> writeback and throttling at all. F.e. lets say we have 20 nodes of 1G each
> and a cpuset that only spans one node.
>
> Then a process runniung in that cpuset can dirty all of memory and still
> continue running without writeback continuing. background dirty ratio
> is at 10% and the dirty ratio at 40%. Neither of those boundaries can ever
> be reached because the process will only ever be able to dirty memory on
> one node which is 5%. There will be no throttling, no background
> writeback, no blocking for dirty pages.
>
> At some point we run into reclaim (possibly we have ~99% of of the cpuset
> dirty) and then we trigger writeout. Okay so if the filesystem / block
> device is robust enough and does not require memory allocations then we
> likely will survive that and do slow writeback page by page from the LRU.
>
> writback is completely hosed for that situation. This patch restores
> expected behavior in a cpuset (which is a form of system partition that
> should mirror the system as a whole). At 10% dirty we should start
> background writeback and at 40% we should block. If that is done then even
> fragile combinations of filesystem/block devices will work as they do
> without cpusets.
Nope. You've completely omitted the little fact that we'll do writeback in
the offending zone off the LRU. Slower, maybe. But it should work and the
system should recover. If it's not doing that (it isn't) then we should
fix it rather than avoiding it (by punting writeback over to pdflush).
Once that's fixed, if we determine that there are remaining and significant
performance issues then we can take a look at that.
>
> > > Yes we can fix these allocations by allowing processes to allocate from
> > > other nodes. But then the container function of cpusets is no longer
> > > there.
> > But that's what your patch already does!
>
> The patchset does not allow processes to allocate from other nodes than
> the current cpuset.
Yes it does. It asks pdflush to perform writeback of the offending zone(s)
rather than (or as well as) doing it directly. The only reason pdflush can
sucessfuly do that is because pdflush can allocate its requests from other
zones.
>
> AFAIK any filesyste/block device can go oom with the current broken
> writeback it just does a few allocations. Its a matter of hitting the
> sweet spots.
That shouldn't be possible, in theory. Block IO is supposed to succeed if
*all memory in the machine is dirty*: the old
dirty-everything-with-MAP_SHARED-then-exit problem. Lots of testing went
into that and it works. It also failed on NFS although I thought that got
"fixed" a year or so ago. Apparently not.
> > But we also can get into trouble if a *zone* is all-dirty. Any solution to
> > the cpuset problem should solve that problem too, no?
>
> Nope. Why would a dirty zone pose a problem? The proble exist if you
> cannot allocate more memory.
Well one example would be a GFP_KERNEL allocation on a highmem machine in
whcih all of ZONE_NORMAL is dirty.
> If a cpuset contains a single node which is a
> single zone then this patchset will also address that issue.
>
> If we have multiple zones then other zones may still provide memory to
> continue (same as in UP).
Not if all the eligible zones are all-dirty.
> > > Yes, but when we enter reclaim most of the pages of a zone may already be
> > > dirty/writeback so we fail.
> >
> > No. If the dirty limits become per-zone then no zone will ever have >40%
> > dirty.
>
> I am still confused as to why you would want per zone dirty limits?
The need for that has yet to be demonstrated. There _might_ be a problem,
but we need test cases and analyses to demonstrate that need.
Right now, what we have is an NFS bug. How about we fix it, then
reevaluate the situation?
A good starting point would be to show us one of these oom-killer traces.
> Lets say we have a cpuset with 4 nodes (thus 4 zones) and we are running
> on the first node. Then we copy a large file to disk. Node local
> allocation means that we allocate from the first node. After we reach 40%
> of the node then we throttle? This is going to be a significant
> performance degradation since we can no longer use the memory of other
> nodes to buffer writeout.
That was what I was referring to.
On Tue, 16 Jan 2007, Andrew Morton wrote:
> Nope. You've completely omitted the little fact that we'll do writeback in
> the offending zone off the LRU. Slower, maybe. But it should work and the
> system should recover. If it's not doing that (it isn't) then we should
> fix it rather than avoiding it (by punting writeback over to pdflush).
pdflush is not running *at* all nor is dirty throttling working. That is
correct behavior? We could do background writeback but we choose not to do
so? Instead we wait until we hit reclaim and then block (well it seems
that we do not block the blocking there also fails since we again check
global ratios)?
> > The patchset does not allow processes to allocate from other nodes than
> > the current cpuset.
>
> Yes it does. It asks pdflush to perform writeback of the offending zone(s)
> rather than (or as well as) doing it directly. The only reason pdflush can
> sucessfuly do that is because pdflush can allocate its requests from other
> zones.
Ok pdflush is able to do that. Still the application is not able to
extend its memory beyond the cpuset. What about writeback throttling?
There it all breaks down. The cpuset is effective and we are unable to
allocate any more memory.
The reason this works is because not all of memory is dirty. Thus reclaim
will be able to free up memory or there is enough memory free.
> > AFAIK any filesyste/block device can go oom with the current broken
> > writeback it just does a few allocations. Its a matter of hitting the
> > sweet spots.
>
> That shouldn't be possible, in theory. Block IO is supposed to succeed if
> *all memory in the machine is dirty*: the old
> dirty-everything-with-MAP_SHARED-then-exit problem. Lots of testing went
> into that and it works. It also failed on NFS although I thought that got
> "fixed" a year or so ago. Apparently not.
Humm... Really?
> > Nope. Why would a dirty zone pose a problem? The proble exist if you
> > cannot allocate more memory.
>
> Well one example would be a GFP_KERNEL allocation on a highmem machine in
> whcih all of ZONE_NORMAL is dirty.
That is a restricted allocation which will lead to reclaim.
> > If we have multiple zones then other zones may still provide memory to
> > continue (same as in UP).
>
> Not if all the eligible zones are all-dirty.
They are all dirty if we do not throttle the dirty pages.
> Right now, what we have is an NFS bug. How about we fix it, then
> reevaluate the situation?
The "NFS bug" only exists when using a cpuset. If you run NFS without
cpusets then the throttling will kick in and everything is fine.
> A good starting point would be to show us one of these oom-killer traces.
No traces. Since the process is killed within a cpuset we only get
messages like:
Nov 28 16:19:52 ic4 kernel: Out of Memory: Kill process 679783 (ncks) score 0 and children.
Nov 28 16:19:52 ic4 kernel: No available memory in cpuset: Killed process 679783 (ncks).
Nov 28 16:27:58 ic4 kernel: oom-killer: gfp_mask=0x200d2, order=0
Probably need to rerun these with some patches.
> > Lets say we have a cpuset with 4 nodes (thus 4 zones) and we are running
> > on the first node. Then we copy a large file to disk. Node local
> > allocation means that we allocate from the first node. After we reach 40%
> > of the node then we throttle? This is going to be a significant
> > performance degradation since we can no longer use the memory of other
> > nodes to buffer writeout.
>
> That was what I was referring to.
Note that this was describing the behavior you wanted not the way things
work. It is desired behavior not to use all the memory resources of the
cpuset and slow down the system?
On Tuesday 16 January 2007 16:47, Christoph Lameter wrote:
> I think having the ability to determine the maximum amount of nodes in
> a system at runtime is useful but then we should name this entry
> correspondingly and also only calculate the value once on bootup.
Are you sure this is even possible in general on systems with node
hotplug? The firmware might not pass a maximum limit.
At least CPU hotplug definitely has this issue and I don't see nodes
to be very different.
-Andi
>
> On Tue, 16 Jan 2007 17:30:26 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> > Nope. You've completely omitted the little fact that we'll do writeback in
> > the offending zone off the LRU. Slower, maybe. But it should work and the
> > system should recover. If it's not doing that (it isn't) then we should
> > fix it rather than avoiding it (by punting writeback over to pdflush).
>
> pdflush is not running *at* all nor is dirty throttling working. That is
> correct behavior? We could do background writeback but we choose not to do
> so? Instead we wait until we hit reclaim and then block (well it seems
> that we do not block the blocking there also fails since we again check
> global ratios)?
I agree that it is a worthy objective to be able to constrain a cpuset's
dirty memory levels. But as a performance optimisation and NOT as a
correctness fix.
Consider: non-exclusive cpuset A consists of mems 0-15, non-exclusive
cpuset B consists of mems 0-3. A task running in cpuset A can freely dirty
all of cpuset B's memory. A task running in cpuset B gets oomkilled.
Consider: a 32-node machine has nodes 0-3 full of dirty memory. I create a
cpuset containing nodes 0-2 and start using it. I get oomkilled.
There may be other scenarios.
IOW, we have a correctness problem, and we have a probable,
not-yet-demonstrated-and-quantified performance problem. Fixing the latter
(in the proposed fashion) will *not* fix the former.
So what I suggest we do is to fix the NFS bug, then move on to considering
the performance problems.
On reflection, I agree that your proposed changes are sensible-looking for
addressing the probable, not-yet-demonstrated-and-quantified performance
problem. The per-inode (should be per-address_space, maybe it is?) node
map is unfortunate. Need to think about that a bit more. For a start, it
should be dynamically allocated (from a new, purpose-created slab cache):
most in-core inodes don't have any dirty pages and don't need this
additional storage.
Also, I worry about the worst-case performance of that linear search across
the inodes.
But this is unrelated to the NFS bug ;)
On Wed, 17 Jan 2007, Andi Kleen wrote:
> On Tuesday 16 January 2007 16:47, Christoph Lameter wrote:
>
> > I think having the ability to determine the maximum amount of nodes in
> > a system at runtime is useful but then we should name this entry
> > correspondingly and also only calculate the value once on bootup.
>
> Are you sure this is even possible in general on systems with node
> hotplug? The firmware might not pass a maximum limit.
In that case the node possible map must include all nodes right?
On Tue, 16 Jan 2007, Andrew Morton wrote:
> Consider: non-exclusive cpuset A consists of mems 0-15, non-exclusive
> cpuset B consists of mems 0-3. A task running in cpuset A can freely dirty
> all of cpuset B's memory. A task running in cpuset B gets oomkilled.
>
> Consider: a 32-node machine has nodes 0-3 full of dirty memory. I create a
> cpuset containing nodes 0-2 and start using it. I get oomkilled.
>
> There may be other scenarios.
Yes this is the result of the hierachical nature of cpusets which already
causes issues with the scheduler. It is rather typical that cpusets are
used to partition the memory and cpus. Overlappig cpusets seem to have
mainly an administrative function. Paul?
> So what I suggest we do is to fix the NFS bug, then move on to considering
> the performance problems.
The NFS "bug" has been there for ages and no one cares since write
throttling works effectively. Since NFS can go via any network technology
(f.e. infiniband) we have many potential issues at that point that depend
on the underlying network technology. As far as I can recall we decided
that these stacking issues are inherently problematic and basically
unsolvable.
> On reflection, I agree that your proposed changes are sensible-looking for
> addressing the probable, not-yet-demonstrated-and-quantified performance
> problem. The per-inode (should be per-address_space, maybe it is?) node
The address space is part of the inode. Some of my development versions at
the dirty_map in the address space. However, the end of the inode was a
convenient place for a runtime sizes nodemask.
> map is unfortunate. Need to think about that a bit more. For a start, it
> should be dynamically allocated (from a new, purpose-created slab cache):
> most in-core inodes don't have any dirty pages and don't need this
> additional storage.
We also considered such an approach. However. it creates the problem
of performing a slab allocation while dirtying pages. At that point we do
not have an allocation context, nor can we block.
> But this is unrelated to the NFS bug ;)
Looks more like a design issue (given its layering on top of the
networking layer) and not a bug. The "bug" surfaces when writeback is not
done properly. I wonder what happens if other filesystems are pushed to
the border of the dirty abyss. .... The mmap tracking
fixes that were done in 2.6.19 were done because of similar symptoms
because the systems dirty tracking was off. This is fundamentally the
same issue showing up in a cpuset. So we should be able to produce the
hangs (looks ... yes another customer reported issue on this one is that
reclaim is continually running and we basically livelock the system) that
we saw for the mmap dirty tracking issues in addition to the NFS problems
seen so far.
Memory allocation is required in most filesystem flush paths. If we cannot
allocate memory then we cannot clean pages and thus we continue trying ->
Livelock. I still see this as a fundamental correctness issue in the
kernel.
On Tue, 2007-01-16 at 17:27 -0500, Trond Myklebust wrote:
> On Tue, 2007-01-16 at 23:08 +0100, Peter Zijlstra wrote:
> > Subject: nfs: fix congestion control
> >
> > The current NFS client congestion logic is severely broken, it marks the
> > backing device congested during each nfs_writepages() call and implements
> > its own waitqueue.
> >
> > Replace this by a more regular congestion implementation that puts a cap
> > on the number of active writeback pages and uses the bdi congestion waitqueue.
> >
> > NFSv[34] commit pages are allowed to go unchecked as long as we are under
> > the dirty page limit and not in direct reclaim.
>
> What on earth is the point of adding congestion control to COMMIT?
> Strongly NACKed.
They are dirty pages, how are we getting rid of them when we reached the
dirty limit?
> Why 16MB of on-the-wire data? Why not 32, or 128, or ...
Andrew always promotes a fixed number for congestion control, I pulled
one from a dark place. I have no problem with a more dynamic solution.
> Solaris already allows you to send 2MB of write data in a single RPC
> request, and the RPC engine has for some time allowed you to tune the
> number of simultaneous RPC requests you have on the wire: Chuck has
> already shown that read/write performance is greatly improved by upping
> that value to 64 or more in the case of RPC over TCP. Why are we then
> suddenly telling people that they are limited to 8 simultaneous writes?
min(max RPC size * max concurrent RPC reqs, dirty threshold) then?
> Yes this is the result of the hierachical nature of cpusets which already
> causes issues with the scheduler. It is rather typical that cpusets are
> used to partition the memory and cpus. Overlappig cpusets seem to have
> mainly an administrative function. Paul?
The heavy weight tasks, which are expected to be applying serious memory
pressure (whether for data pages or dirty file pages), are usually in
non-overlapping cpusets, or sharing the same cpuset, but not partially
overlapping with, or a proper superset of, some other cpuset holding an
active job.
The higher level cpusets, such as the top cpuset, or the one deeded over
to the Batch Scheduler, are proper supersets of many other cpusets. We
avoid putting anything heavy weight in those cpusets.
Sometimes of course a task turns out to be unexpectedly heavy weight.
But in that case, we're mostly interested in function (system keeps
running), not performance.
That is, if someone setup what Andrew described, with a job in a large
cpuset sucking up all available memory from one in a smaller, contained
cpuset, I don't think I'm tuning for optimum performance anymore.
Rather I'm just trying to keep the system running and keep unrelated
jobs unaffected while we dig our way out of the hole. If the smaller
job OOM's, that's tough nuggies. They asked for it. Jobs in
-unrelated- (non-overlapping) cpusets should ride out the storm with
little or no impact on their performance.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401
> On Tue, 16 Jan 2007 19:40:17 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> On Tue, 16 Jan 2007, Andrew Morton wrote:
>
> > Consider: non-exclusive cpuset A consists of mems 0-15, non-exclusive
> > cpuset B consists of mems 0-3. A task running in cpuset A can freely dirty
> > all of cpuset B's memory. A task running in cpuset B gets oomkilled.
> >
> > Consider: a 32-node machine has nodes 0-3 full of dirty memory. I create a
> > cpuset containing nodes 0-2 and start using it. I get oomkilled.
> >
> > There may be other scenarios.
>
> Yes this is the result of the hierachical nature of cpusets which already
> causes issues with the scheduler. It is rather typical that cpusets are
> used to partition the memory and cpus. Overlappig cpusets seem to have
> mainly an administrative function. Paul?
The typical usage scenarios don't matter a lot: the examples I gave show
that the core problem remains unsolved. People can still hit the bug.
> > So what I suggest we do is to fix the NFS bug, then move on to considering
> > the performance problems.
>
> The NFS "bug" has been there for ages and no one cares since write
> throttling works effectively. Since NFS can go via any network technology
> (f.e. infiniband) we have many potential issues at that point that depend
> on the underlying network technology. As far as I can recall we decided
> that these stacking issues are inherently problematic and basically
> unsolvable.
The problem you refer to arises from the inability of the net driver to
allocate memory for an outbound ack. Such allocations aren't constrained to
a cpuset.
I expect that we can solve the NFS oom problem along the same lines as
block devices. Certainly it's dumb of us to oom-kill a process rather than
going off-cpuset for a small and short-lived allocation. It's also dumb of
us to allocate a basically unbounded number of nfs requests rather than
waiting for some of the ones which we _have_ allocated to complete.
> > On reflection, I agree that your proposed changes are sensible-looking for
> > addressing the probable, not-yet-demonstrated-and-quantified performance
> > problem. The per-inode (should be per-address_space, maybe it is?) node
>
> The address space is part of the inode.
Physically, yes. Logically, it is not. The address_space controls the
data-plane part of a file and is the appropriate place in which to store
this nodemask.
> Some of my development versions at
> the dirty_map in the address space. However, the end of the inode was a
> convenient place for a runtime sizes nodemask.
>
> > map is unfortunate. Need to think about that a bit more. For a start, it
> > should be dynamically allocated (from a new, purpose-created slab cache):
> > most in-core inodes don't have any dirty pages and don't need this
> > additional storage.
>
> We also considered such an approach. However. it creates the problem
> of performing a slab allocation while dirtying pages. At that point we do
> not have an allocation context, nor can we block.
Yes, it must be an atomic allocation. If it fails, we don't care. Chances
are it'll succeed when the next page in this address_space gets dirtied.
Plus we don't waste piles of memory on read-only files.
> > But this is unrelated to the NFS bug ;)
>
> Looks more like a design issue (given its layering on top of the
> networking layer) and not a bug. The "bug" surfaces when writeback is not
> done properly. I wonder what happens if other filesystems are pushed to
> the border of the dirty abyss. .... The mmap tracking
> fixes that were done in 2.6.19 were done because of similar symptoms
> because the systems dirty tracking was off. This is fundamentally the
> same issue showing up in a cpuset. So we should be able to produce the
> hangs (looks ... yes another customer reported issue on this one is that
> reclaim is continually running and we basically livelock the system) that
> we saw for the mmap dirty tracking issues in addition to the NFS problems
> seen so far.
>
> Memory allocation is required in most filesystem flush paths. If we cannot
> allocate memory then we cannot clean pages and thus we continue trying ->
> Livelock. I still see this as a fundamental correctness issue in the
> kernel.
I'll believe all that once someone has got down and tried to fix NFS, and
has failed ;)
On Tuesday 16 January 2007 16:48, Christoph Lameter wrote:
> Direct reclaim: cpuset aware writeout
>
> During direct reclaim we traverse down a zonelist and are carefully
> checking each zone if its a member of the active cpuset. But then we call
> pdflush without enforcing the same restrictions. In a larger system this
> may have the effect of a massive amount of pages being dirtied and then
> either
Is there a reason this can't be just done by node, ignoring the cpusets?
-Andi
On Wednesday 17 January 2007 14:14, Christoph Lameter wrote:
> On Wed, 17 Jan 2007, Andi Kleen wrote:
> > On Tuesday 16 January 2007 16:47, Christoph Lameter wrote:
> > > I think having the ability to determine the maximum amount of nodes in
> > > a system at runtime is useful but then we should name this entry
> > > correspondingly and also only calculate the value once on bootup.
> >
> > Are you sure this is even possible in general on systems with node
> > hotplug? The firmware might not pass a maximum limit.
>
> In that case the node possible map must include all nodes right?
Yes.
-Andi
Andi wrote:
> Is there a reason this can't be just done by node, ignoring the cpusets?
This suggestion doesn't make a whole lot of sense to me.
We're looking to see if a task has dirtied most of the
pages in the nodes it is allowed to use. If it has, then
we want to start pushing pages to the disk harder, and
slowing down the tasks writes.
What would it mean to do this per-node? And why would
that be better?
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401
On Wed, 17 Jan 2007, Andi Kleen wrote:
> On Tuesday 16 January 2007 16:48, Christoph Lameter wrote:
> > Direct reclaim: cpuset aware writeout
> >
> > During direct reclaim we traverse down a zonelist and are carefully
> > checking each zone if its a member of the active cpuset. But then we call
> > pdflush without enforcing the same restrictions. In a larger system this
> > may have the effect of a massive amount of pages being dirtied and then
> > either
>
> Is there a reason this can't be just done by node, ignoring the cpusets?
We want to writeout dirty pages that help our situation. Those are located
on the nodes of the cpuset.
On Wed, 17 Jan 2007, Andi Kleen wrote:
> > > Are you sure this is even possible in general on systems with node
> > > hotplug? The firmware might not pass a maximum limit.
> >
> > In that case the node possible map must include all nodes right?
>
> Yes.
Then we are fine.
On Wednesday 17 January 2007 15:20, Paul Jackson wrote:
> Andi wrote:
> > Is there a reason this can't be just done by node, ignoring the cpusets?
>
> This suggestion doesn't make a whole lot of sense to me.
>
> We're looking to see if a task has dirtied most of the
> pages in the nodes it is allowed to use. If it has, then
> we want to start pushing pages to the disk harder, and
> slowing down the tasks writes.
>
> What would it mean to do this per-node? And why would
> that be better?
With a per node dirty limit you would get essentially the
same effect and it would have the advantage of helping
people who don't configure any cpusets but run on a NUMA
system.
-Andi
> With a per node dirty limit ...
What would this mean?
Lets say we have a simple machine with 4 nodes, cpusets disabled.
Lets say all tasks are allowed to use all nodes, no set_mempolicy
either.
If a task happens to fill up 80% of one node with dirty pages, but
we have no dirty pages yet on other nodes, and we have a dirty ratio
of 40%, then do we throttle that task's writes?
I am surprised you are asking for this, Andi. I would have thought
that on no-cpuset systems, the system wide throttling served your
needs fine. If not, then I can only guess that is because NUMA
mempolicy constraints on allowed nodes are causing the same dirty page
problems as cpuset constrained systems -- is that your concern?
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401
On Wednesday 17 January 2007 15:36, Paul Jackson wrote:
> > With a per node dirty limit ...
>
> What would this mean?
>
> Lets say we have a simple machine with 4 nodes, cpusets disabled.
There can be always NUMA policy without cpusets for once.
> Lets say all tasks are allowed to use all nodes, no set_mempolicy
> either.
Ok.
> If a task happens to fill up 80% of one node with dirty pages, but
> we have no dirty pages yet on other nodes, and we have a dirty ratio
> of 40%, then do we throttle that task's writes?
Yes we should actually. Every node should be able to supply
memory (unless extreme circumstances like mlock) and that much dirty
memory on a node will make that hard.
> I am surprised you are asking for this, Andi. I would have thought
> that on no-cpuset systems, the system wide throttling served your
> needs fine.
No actually people are fairly unhappy when one node is filled with
file data and then they don't get local memory from it anymore.
I get regular complaints about that for Opteron.
Dirty limit wouldn't be a full solution, but a good step.
> If not, then I can only guess that is because NUMA
> mempolicy constraints on allowed nodes are causing the same dirty page
> problems as cpuset constrained systems -- is that your concern?
That is another concern. I haven't checked recently, but it used
to be fairly simple to put a system to its knees by oversubscribing
a single node with a strict memory policy. Fixing that would be good.
-Andi
On Wed, 2007-01-17 at 03:41 +0100, Peter Zijlstra wrote:
> On Tue, 2007-01-16 at 17:27 -0500, Trond Myklebust wrote:
> > On Tue, 2007-01-16 at 23:08 +0100, Peter Zijlstra wrote:
> > > Subject: nfs: fix congestion control
> > >
> > > The current NFS client congestion logic is severely broken, it marks the
> > > backing device congested during each nfs_writepages() call and implements
> > > its own waitqueue.
> > >
> > > Replace this by a more regular congestion implementation that puts a cap
> > > on the number of active writeback pages and uses the bdi congestion waitqueue.
> > >
> > > NFSv[34] commit pages are allowed to go unchecked as long as we are under
> > > the dirty page limit and not in direct reclaim.
>
> >
> > What on earth is the point of adding congestion control to COMMIT?
> > Strongly NACKed.
>
> They are dirty pages, how are we getting rid of them when we reached the
> dirty limit?
They are certainly _not_ dirty pages. They are pages that have been
written to the server but are not yet guaranteed to have hit the disk
(they were only written to the server's page cache). We don't care if
they are paged in or swapped out on the local client.
\All the COMMIT does, is to ask the server to write the data from its
page cache onto disk. Once that has been done, we can release the pages.
If the commit fails, then we iterate through the whole writepage()
process again. The commit itself does, however, not even look at the
page data.
> > Why 16MB of on-the-wire data? Why not 32, or 128, or ...
>
> Andrew always promotes a fixed number for congestion control, I pulled
> one from a dark place. I have no problem with a more dynamic solution.
>
> > Solaris already allows you to send 2MB of write data in a single RPC
> > request, and the RPC engine has for some time allowed you to tune the
> > number of simultaneous RPC requests you have on the wire: Chuck has
> > already shown that read/write performance is greatly improved by upping
> > that value to 64 or more in the case of RPC over TCP. Why are we then
> > suddenly telling people that they are limited to 8 simultaneous writes?
>
> min(max RPC size * max concurrent RPC reqs, dirty threshold) then?
That would be far preferable. For instance, it allows those who have
long latency fat pipes to actually use the bandwidth optimally when
writing out the data.
Trond
On Wed, 17 Jan 2007, Andi Kleen wrote:
> No actually people are fairly unhappy when one node is filled with
> file data and then they don't get local memory from it anymore.
> I get regular complaints about that for Opteron.
Switch on zone_reclaim and it will take care of it. You can even switch it
to write mode in order to get rid of dirty pages. However, be aware of the
significantly reduced performance since you cannot go off node without
writeback anymore.
> That is another concern. I haven't checked recently, but it used
> to be fairly simple to put a system to its knees by oversubscribing
> a single node with a strict memory policy. Fixing that would be good.
zone_reclaim has dealt with most of those issues.
On Tue, 16 Jan 2007, Andrew Morton wrote:
> > Yes this is the result of the hierachical nature of cpusets which already
> > causes issues with the scheduler. It is rather typical that cpusets are
> > used to partition the memory and cpus. Overlappig cpusets seem to have
> > mainly an administrative function. Paul?
>
> The typical usage scenarios don't matter a lot: the examples I gave show
> that the core problem remains unsolved. People can still hit the bug.
I agree the overlap issue is a problem and I hope it can be addressed
somehow for the rare cases in which such nesting takes place.
One easy solution may be to check the dirty ratio before engaging in
reclaim. If the dirty ratio is sufficiently high then trigger writeout via
pdflush (we already wakeup pdflush while scanning and you already noted
that pdflush writeout is not occurring within the context of the current
cpuset) and pass over any dirty pages during LRU scans until some pages
have been cleaned up.
This means we allow allocation of additional kernel memory outside of the
cpuset while triggering writeout of inodes that have pages on the nodes
of the cpuset. The memory directly used by the application is still
limited. Just the temporary information needed for writeback is allocated
outside.
Well sounds somehow still like a hack. Any other ideas out there?
> On Tue, 16 Jan 2007 22:27:36 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> On Tue, 16 Jan 2007, Andrew Morton wrote:
>
> > > Yes this is the result of the hierachical nature of cpusets which already
> > > causes issues with the scheduler. It is rather typical that cpusets are
> > > used to partition the memory and cpus. Overlappig cpusets seem to have
> > > mainly an administrative function. Paul?
> >
> > The typical usage scenarios don't matter a lot: the examples I gave show
> > that the core problem remains unsolved. People can still hit the bug.
>
> I agree the overlap issue is a problem and I hope it can be addressed
> somehow for the rare cases in which such nesting takes place.
>
> One easy solution may be to check the dirty ratio before engaging in
> reclaim. If the dirty ratio is sufficiently high then trigger writeout via
> pdflush (we already wakeup pdflush while scanning and you already noted
> that pdflush writeout is not occurring within the context of the current
> cpuset) and pass over any dirty pages during LRU scans until some pages
> have been cleaned up.
>
> This means we allow allocation of additional kernel memory outside of the
> cpuset while triggering writeout of inodes that have pages on the nodes
> of the cpuset. The memory directly used by the application is still
> limited. Just the temporary information needed for writeback is allocated
> outside.
Gad. None of that should be necessary.
> Well sounds somehow still like a hack. Any other ideas out there?
Do what blockdevs do: limit the number of in-flight requests (Peter's
recent patch seems to be doing that for us) (perhaps only when PF_MEMALLOC
is in effect, to keep Trond happy) and implement a mempool for the NFS
request critical store. Additionally:
- we might need to twiddle the NFS gfp_flags so it doesn't call the
oom-killer on failure: just return NULL.
- consider going off-cpuset for critical allocations. It's better than
going oom. A suitable implementation might be to ignore the caller's
cpuset if PF_MEMALLOC. Maybe put a WARN_ON_ONCE in there: we prefer that
it not happen and we want to know when it does.
btw, regarding the per-address_space node mask: I think we should free it
when the inode is clean (!mapping_tagged(PAGECACHE_TAG_DIRTY)). Chances
are, the inode will be dirty for 30 seconds and in-core for hours. We
might as well steal its nodemask storage and give it to the next file which
gets written to. A suitable place to do all this is in
__mark_inode_dirty(I_DIRTY_PAGES), using inode_lock to protect
address_space.dirty_page_nodemask.
Andrew wrote:
> - consider going off-cpuset for critical allocations.
We do ... in mm/page_alloc.c:
* This is the last chance, in general, before the goto nopage.
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
We also allow GFP_KERNEL requests to escape the current cpuset, to the nearest
enclosing mem_exclusive cpuset, which is typically a big cpuset covering most
of the system.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401
On Wed, 2007-01-17 at 01:15 -0500, Trond Myklebust wrote:
> On Wed, 2007-01-17 at 03:41 +0100, Peter Zijlstra wrote:
> > On Tue, 2007-01-16 at 17:27 -0500, Trond Myklebust wrote:
> > > On Tue, 2007-01-16 at 23:08 +0100, Peter Zijlstra wrote:
> > > > Subject: nfs: fix congestion control
> > > >
> > > > The current NFS client congestion logic is severely broken, it marks the
> > > > backing device congested during each nfs_writepages() call and implements
> > > > its own waitqueue.
> > > >
> > > > Replace this by a more regular congestion implementation that puts a cap
> > > > on the number of active writeback pages and uses the bdi congestion waitqueue.
> > > >
> > > > NFSv[34] commit pages are allowed to go unchecked as long as we are under
> > > > the dirty page limit and not in direct reclaim.
> >
> > >
> > > What on earth is the point of adding congestion control to COMMIT?
> > > Strongly NACKed.
> >
> > They are dirty pages, how are we getting rid of them when we reached the
> > dirty limit?
>
> They are certainly _not_ dirty pages. They are pages that have been
> written to the server but are not yet guaranteed to have hit the disk
> (they were only written to the server's page cache). We don't care if
> they are paged in or swapped out on the local client.
>
> \All the COMMIT does, is to ask the server to write the data from its
> page cache onto disk. Once that has been done, we can release the pages.
> If the commit fails, then we iterate through the whole writepage()
> process again. The commit itself does, however, not even look at the
> page data.
Thou art correct from an NFS point of view, however for the VM they are
(still) just dirty pages and we need shed them.
You talk of swapping them out, they are filecache pages not swapcache
pages. The writepage() process needs to complete and that entails
committing them.
> > > Why 16MB of on-the-wire data? Why not 32, or 128, or ...
> >
> > Andrew always promotes a fixed number for congestion control, I pulled
> > one from a dark place. I have no problem with a more dynamic solution.
> >
> > > Solaris already allows you to send 2MB of write data in a single RPC
> > > request, and the RPC engine has for some time allowed you to tune the
> > > number of simultaneous RPC requests you have on the wire: Chuck has
> > > already shown that read/write performance is greatly improved by upping
> > > that value to 64 or more in the case of RPC over TCP. Why are we then
> > > suddenly telling people that they are limited to 8 simultaneous writes?
> >
> > min(max RPC size * max concurrent RPC reqs, dirty threshold) then?
>
> That would be far preferable. For instance, it allows those who have
> long latency fat pipes to actually use the bandwidth optimally when
> writing out the data.
I will make it so then ;-)
I found max concurrent RPCs to be xprt_{tcp,udp}_slot_table_entries(?),
where might I find the max RPC size?
(Any pointer to just RTFM are equally well received)
> On Wed, 17 Jan 2007 00:01:58 -0800 Paul Jackson <[email protected]> wrote:
> Andrew wrote:
> > - consider going off-cpuset for critical allocations.
>
> We do ... in mm/page_alloc.c:
>
> * This is the last chance, in general, before the goto nopage.
> * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
> * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> */
> page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
>
> We also allow GFP_KERNEL requests to escape the current cpuset, to the nearest
> enclosing mem_exclusive cpuset, which is typically a big cpuset covering most
> of the system.
hrm. So how come NFS is getting oom-killings?
The oom-killer normally spews lots of useful stuff, including backtrace. For some
reason that's not coming out for Christoph. Log facility level, perhaps?
On Wed, 2007-01-17 at 09:49 +0100, Peter Zijlstra wrote:
> > They are certainly _not_ dirty pages. They are pages that have been
> > written to the server but are not yet guaranteed to have hit the disk
> > (they were only written to the server's page cache). We don't care if
> > they are paged in or swapped out on the local client.
> >
> > \All the COMMIT does, is to ask the server to write the data from its
> > page cache onto disk. Once that has been done, we can release the pages.
> > If the commit fails, then we iterate through the whole writepage()
> > process again. The commit itself does, however, not even look at the
> > page data.
>
> Thou art correct from an NFS point of view, however for the VM they are
> (still) just dirty pages and we need shed them.
>
> You talk of swapping them out, they are filecache pages not swapcache
> pages. The writepage() process needs to complete and that entails
> committing them.
My point is that we can and should collect as many of the little buggers
as we can and treat them with ONE commit call. We don't look at the
data, we don't lock the pages, we don't care what the VM is doing with
them. Throttling is not only unnecessary, it is actually a bad idea
since it slows up the rate at which we can free up the pages.
Trond
On Wed, 2007-01-17 at 08:50 -0500, Trond Myklebust wrote:
> On Wed, 2007-01-17 at 09:49 +0100, Peter Zijlstra wrote:
> > > They are certainly _not_ dirty pages. They are pages that have been
> > > written to the server but are not yet guaranteed to have hit the disk
> > > (they were only written to the server's page cache). We don't care if
> > > they are paged in or swapped out on the local client.
> > >
> > > \All the COMMIT does, is to ask the server to write the data from its
> > > page cache onto disk. Once that has been done, we can release the pages.
> > > If the commit fails, then we iterate through the whole writepage()
> > > process again. The commit itself does, however, not even look at the
> > > page data.
> >
> > Thou art correct from an NFS point of view, however for the VM they are
> > (still) just dirty pages and we need shed them.
> >
> > You talk of swapping them out, they are filecache pages not swapcache
> > pages. The writepage() process needs to complete and that entails
> > committing them.
>
> My point is that we can and should collect as many of the little buggers
> as we can and treat them with ONE commit call. We don't look at the
> data, we don't lock the pages, we don't care what the VM is doing with
> them. Throttling is not only unnecessary, it is actually a bad idea
> since it slows up the rate at which we can free up the pages.
Ah, OK.
I was thinking that since the server needs to actually sync the page a
commit might be quite expensive (timewise), hence I didn't want to flush
too much, and interleave them with writing out some real pages to
utilise bandwidth.
But if you think I should just bulk commit I can do that.
On Wed, 2007-01-17 at 15:29 +0100, Peter Zijlstra wrote:
> I was thinking that since the server needs to actually sync the page a
> commit might be quite expensive (timewise), hence I didn't want to flush
> too much, and interleave them with writing out some real pages to
> utilise bandwidth.
Most servers just call fsync()/fdatasync() whenever we send a COMMIT, in
which case the extra round trips are just adding unnecessary latency.
On Tue, 16 Jan 2007, Andrew Morton wrote:
> Do what blockdevs do: limit the number of in-flight requests (Peter's
> recent patch seems to be doing that for us) (perhaps only when PF_MEMALLOC
> is in effect, to keep Trond happy) and implement a mempool for the NFS
> request critical store. Additionally:
>
> - we might need to twiddle the NFS gfp_flags so it doesn't call the
> oom-killer on failure: just return NULL.
>
> - consider going off-cpuset for critical allocations. It's better than
> going oom. A suitable implementation might be to ignore the caller's
> cpuset if PF_MEMALLOC. Maybe put a WARN_ON_ONCE in there: we prefer that
> it not happen and we want to know when it does.
Given the intermediate layers (network, additional gizmos (ip over xxx)
and the network cards) that will not be easy.
> btw, regarding the per-address_space node mask: I think we should free it
> when the inode is clean (!mapping_tagged(PAGECACHE_TAG_DIRTY)). Chances
> are, the inode will be dirty for 30 seconds and in-core for hours. We
> might as well steal its nodemask storage and give it to the next file which
> gets written to. A suitable place to do all this is in
> __mark_inode_dirty(I_DIRTY_PAGES), using inode_lock to protect
> address_space.dirty_page_nodemask.
The inode lock is not taken when the page is dirtied. The tree_lock
is already taken when the mapping is dirtied and so I used that to
avoid races adding and removing pointers to nodemasks from the address
space.
On Tue, 16 Jan 2007, Peter Zijlstra wrote:
> The current NFS client congestion logic is severely broken, it marks the
> backing device congested during each nfs_writepages() call and implements
> its own waitqueue.
This is the magic bullet that Andrew is looking for to fix the NFS issues?
> Index: linux-2.6-git/include/linux/nfs_fs_sb.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-12 08:03:47.000000000 +0100
> +++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-12 08:53:26.000000000 +0100
> @@ -82,6 +82,8 @@ struct nfs_server {
> struct rpc_clnt * client_acl; /* ACL RPC client handle */
> struct nfs_iostats * io_stats; /* I/O statistics */
> struct backing_dev_info backing_dev_info;
> + atomic_t writeback; /* number of writeback pages */
> + atomic_t commit; /* number of commit pages */
> int flags; /* various flags */
I think writeback is frequently incremented? Would it be possible to avoid
a single global instance of an atomic_t here? In a busy NFS system
with lots of processors writing via NFS this may cause a hot cacheline
that limits write speed.
Would it be possible to use NR_WRITEBACK? If not then maybe add another
ZVC counter named NFS_NFS_WRITEBACK?
> Index: linux-2.6-git/mm/page-writeback.c
> ===================================================================
> --- linux-2.6-git.orig/mm/page-writeback.c 2007-01-12 08:03:47.000000000 +0100
> +++ linux-2.6-git/mm/page-writeback.c 2007-01-12 08:53:26.000000000 +0100
> @@ -167,6 +167,12 @@ get_dirty_limits(long *pbackground, long
> *pdirty = dirty;
> }
>
> +int dirty_pages_exceeded(struct address_space *mapping)
> +{
> + return dirty_exceeded;
> +}
> +EXPORT_SYMBOL_GPL(dirty_pages_exceeded);
> +
Export the variable instead of adding a new function? Why does it take an
address space parameter that is not used?
> Index: linux-2.6-git/fs/inode.c
> ===================================================================
> --- linux-2.6-git.orig/fs/inode.c 2007-01-12 08:03:47.000000000 +0100
> +++ linux-2.6-git/fs/inode.c 2007-01-12 08:53:26.000000000 +0100
> @@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
> * the i_state of an inode while it is in use..
> */
> DEFINE_SPINLOCK(inode_lock);
> +EXPORT_SYMBOL_GPL(inode_lock);
Hmmm... Commits to all NFS servers will be globally serialized via the
inode_lock?
On Wed, 2007-01-17 at 12:05 -0800, Christoph Lameter wrote:
> On Tue, 16 Jan 2007, Peter Zijlstra wrote:
>
> > The current NFS client congestion logic is severely broken, it marks the
> > backing device congested during each nfs_writepages() call and implements
> > its own waitqueue.
>
> This is the magic bullet that Andrew is looking for to fix the NFS issues?
Dunno if its magical, but it does solve a few issues I ran into.
> > Index: linux-2.6-git/include/linux/nfs_fs_sb.h
> > ===================================================================
> > --- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-12 08:03:47.000000000 +0100
> > +++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-12 08:53:26.000000000 +0100
> > @@ -82,6 +82,8 @@ struct nfs_server {
> > struct rpc_clnt * client_acl; /* ACL RPC client handle */
> > struct nfs_iostats * io_stats; /* I/O statistics */
> > struct backing_dev_info backing_dev_info;
> > + atomic_t writeback; /* number of writeback pages */
> > + atomic_t commit; /* number of commit pages */
> > int flags; /* various flags */
>
> I think writeback is frequently incremented? Would it be possible to avoid
> a single global instance of an atomic_t here? In a busy NFS system
> with lots of processors writing via NFS this may cause a hot cacheline
> that limits write speed.
This would be per NFS mount, pretty global indeed. But not different
that other backing_dev_info's. request_queue::nr_requests suffers a
similar fate.
> Would it be possible to use NR_WRITEBACK? If not then maybe add another
> ZVC counter named NFS_NFS_WRITEBACK?
Its a per backing_dev_info thing. So using the zone counters will not
work.
> > Index: linux-2.6-git/mm/page-writeback.c
> > ===================================================================
> > --- linux-2.6-git.orig/mm/page-writeback.c 2007-01-12 08:03:47.000000000 +0100
> > +++ linux-2.6-git/mm/page-writeback.c 2007-01-12 08:53:26.000000000 +0100
> > @@ -167,6 +167,12 @@ get_dirty_limits(long *pbackground, long
> > *pdirty = dirty;
> > }
> >
> > +int dirty_pages_exceeded(struct address_space *mapping)
> > +{
> > + return dirty_exceeded;
> > +}
> > +EXPORT_SYMBOL_GPL(dirty_pages_exceeded);
> > +
>
> Export the variable instead of adding a new function? Why does it take an
> address space parameter that is not used?
>
Yeah, that function used to be larger.
>
> > Index: linux-2.6-git/fs/inode.c
> > ===================================================================
> > --- linux-2.6-git.orig/fs/inode.c 2007-01-12 08:03:47.000000000 +0100
> > +++ linux-2.6-git/fs/inode.c 2007-01-12 08:53:26.000000000 +0100
> > @@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
> > * the i_state of an inode while it is in use..
> > */
> > DEFINE_SPINLOCK(inode_lock);
> > +EXPORT_SYMBOL_GPL(inode_lock);
>
> Hmmm... Commits to all NFS servers will be globally serialized via the
> inode_lock?
Hmm, right, thats not good indeed, I can pull the call to
nfs_commit_list() out of that loop.
On Wed, 2007-01-17 at 22:52 +0100, Peter Zijlstra wrote:
> >
> > > Index: linux-2.6-git/fs/inode.c
> > > ===================================================================
> > > --- linux-2.6-git.orig/fs/inode.c 2007-01-12 08:03:47.000000000 +0100
> > > +++ linux-2.6-git/fs/inode.c 2007-01-12 08:53:26.000000000 +0100
> > > @@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
> > > * the i_state of an inode while it is in use..
> > > */
> > > DEFINE_SPINLOCK(inode_lock);
> > > +EXPORT_SYMBOL_GPL(inode_lock);
> >
> > Hmmm... Commits to all NFS servers will be globally serialized via the
> > inode_lock?
>
> Hmm, right, thats not good indeed, I can pull the call to
> nfs_commit_list() out of that loop.
There is no reason to modify any of the commit stuff. Please just drop
that code.
Trond
> On Wed, 17 Jan 2007 11:43:42 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> On Tue, 16 Jan 2007, Andrew Morton wrote:
>
> > Do what blockdevs do: limit the number of in-flight requests (Peter's
> > recent patch seems to be doing that for us) (perhaps only when PF_MEMALLOC
> > is in effect, to keep Trond happy) and implement a mempool for the NFS
> > request critical store. Additionally:
> >
> > - we might need to twiddle the NFS gfp_flags so it doesn't call the
> > oom-killer on failure: just return NULL.
> >
> > - consider going off-cpuset for critical allocations. It's better than
> > going oom. A suitable implementation might be to ignore the caller's
> > cpuset if PF_MEMALLOC. Maybe put a WARN_ON_ONCE in there: we prefer that
> > it not happen and we want to know when it does.
>
> Given the intermediate layers (network, additional gizmos (ip over xxx)
> and the network cards) that will not be easy.
Paul has observed that it's already done. But it seems to not be working.
> > btw, regarding the per-address_space node mask: I think we should free it
> > when the inode is clean (!mapping_tagged(PAGECACHE_TAG_DIRTY)). Chances
> > are, the inode will be dirty for 30 seconds and in-core for hours. We
> > might as well steal its nodemask storage and give it to the next file which
> > gets written to. A suitable place to do all this is in
> > __mark_inode_dirty(I_DIRTY_PAGES), using inode_lock to protect
> > address_space.dirty_page_nodemask.
>
> The inode lock is not taken when the page is dirtied.
The inode_lock is taken when the address_space's first page is dirtied. It is
also taken when the address_space's last dirty page is cleaned. So the place
where the inode is added to and removed from sb->s_dirty is, I think, exactly
the place where we want to attach and detach address_space.dirty_page_nodemask.
> The tree_lock
> is already taken when the mapping is dirtied and so I used that to
> avoid races adding and removing pointers to nodemasks from the address
> space.
> --- linux-2.6-git.orig/fs/inode.c 2007-01-12 08:03:47.000000000 +0100
> +++ linux-2.6-git/fs/inode.c 2007-01-12 08:53:26.000000000 +0100
> @@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
> * the i_state of an inode while it is in use..
> */
> DEFINE_SPINLOCK(inode_lock);
> +EXPORT_SYMBOL_GPL(inode_lock);
Btw, big "no fucking way" here. There is no chance we're going to export
this, even _GPL
On Wed, 17 Jan 2007, Andrew Morton wrote:
> > The inode lock is not taken when the page is dirtied.
>
> The inode_lock is taken when the address_space's first page is dirtied. It is
> also taken when the address_space's last dirty page is cleaned. So the place
> where the inode is added to and removed from sb->s_dirty is, I think, exactly
> the place where we want to attach and detach address_space.dirty_page_nodemask.
The problem there is that we do a GFP_ATOMIC allocation (no allocation
context) that may fail when the first page is dirtied. We must therefore
be able to subsequently allocate the nodemask_t in set_page_dirty().
Otherwise the first failure will mean that there will never be a dirty
map for the inode/mapping.
> On Wed, 17 Jan 2007 17:10:25 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> On Wed, 17 Jan 2007, Andrew Morton wrote:
>
> > > The inode lock is not taken when the page is dirtied.
> >
> > The inode_lock is taken when the address_space's first page is dirtied. It is
> > also taken when the address_space's last dirty page is cleaned. So the place
> > where the inode is added to and removed from sb->s_dirty is, I think, exactly
> > the place where we want to attach and detach address_space.dirty_page_nodemask.
>
> The problem there is that we do a GFP_ATOMIC allocation (no allocation
> context) that may fail when the first page is dirtied. We must therefore
> be able to subsequently allocate the nodemask_t in set_page_dirty().
> Otherwise the first failure will mean that there will never be a dirty
> map for the inode/mapping.
True. But it's pretty simple to change __mark_inode_dirty() to fix this.
On Wed, 17 Jan 2007, Andrew Morton wrote:
> > The problem there is that we do a GFP_ATOMIC allocation (no allocation
> > context) that may fail when the first page is dirtied. We must therefore
> > be able to subsequently allocate the nodemask_t in set_page_dirty().
> > Otherwise the first failure will mean that there will never be a dirty
> > map for the inode/mapping.
>
> True. But it's pretty simple to change __mark_inode_dirty() to fix this.
Ok I tried it but this wont work unless I also pass the page struct pointer to
__mark_inode_dirty() since the dirty_node pointer could be freed
when the inode_lock is droppped. So I cannot dereference the
dirty_nodes pointer outside of __mark_inode_dirty.
If I expand __mark_inode_dirty then all variations of mark_inode_dirty()
need to be changed and we need to pass a page struct everywhere. This
result in extensive changes.
I think I need to stick with the tree_lock. This also makes more sense
since we modify dirty information in the address_space structure and the
radix tree is already protected by that lock.
On Wed, 2007-01-17 at 16:54 -0500, Trond Myklebust wrote:
> On Wed, 2007-01-17 at 22:52 +0100, Peter Zijlstra wrote:
>
> > >
> > > > Index: linux-2.6-git/fs/inode.c
> > > > ===================================================================
> > > > --- linux-2.6-git.orig/fs/inode.c 2007-01-12 08:03:47.000000000 +0100
> > > > +++ linux-2.6-git/fs/inode.c 2007-01-12 08:53:26.000000000 +0100
> > > > @@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
> > > > * the i_state of an inode while it is in use..
> > > > */
> > > > DEFINE_SPINLOCK(inode_lock);
> > > > +EXPORT_SYMBOL_GPL(inode_lock);
> > >
> > > Hmmm... Commits to all NFS servers will be globally serialized via the
> > > inode_lock?
> >
> > Hmm, right, thats not good indeed, I can pull the call to
> > nfs_commit_list() out of that loop.
>
> There is no reason to modify any of the commit stuff. Please just drop
> that code.
I though you agreed to flushing commit pages when hitting the dirty page
limit.
Or would you rather see a notifier chain from congestion_wait() calling
into the various NFS mounts?
Christoph Lameter writes:
> Consider unreclaimable pages during dirty limit calculation
>
> Tracking unreclaimable pages helps us to calculate the dirty ratio
> the right way. If a large number of unreclaimable pages are allocated
> (through the slab or through huge pages) then write throttling will
> no longer work since the limit cannot be reached anymore.
>
> So we simply subtract the number of unreclaimable pages from the pages
> considered for writeout threshold calculation.
>
> Other code that allocates significant amounts of memory for device
> drivers etc could also be modified to take advantage of this functionality.
I think that simpler solution of this problem is to use only potentially
reclaimable pages (that is, active, inactive, and free pages) to
calculate writeout threshold. This way there is no need to maintain
counters for unreclaimable pages. Below is a patch implementing this
idea, it got some testing.
Nikita.
Fix write throttling to calculate its thresholds from amount of memory that
can be consumed by file system and swap caches, rather than from the total
amount of physical memory. This avoids situations (among other things) when
memory consumed by kernel slab allocator prevents write throttling from ever
happening.
Signed-off-by: Nikita Danilov <[email protected]>
mm/page-writeback.c | 33 ++++++++++++++++++++++++---------
1 files changed, 24 insertions(+), 9 deletions(-)
Index: git-linux/mm/page-writeback.c
===================================================================
--- git-linux.orig/mm/page-writeback.c
+++ git-linux/mm/page-writeback.c
@@ -101,6 +101,18 @@ EXPORT_SYMBOL(laptop_mode);
static void background_writeout(unsigned long _min_pages);
+/* Maximal number of pages that can be consumed by pageable caches. */
+static unsigned long total_pageable_pages(void)
+{
+ unsigned long active;
+ unsigned long inactive;
+ unsigned long free;
+
+ get_zone_counts(&active, &inactive, &free);
+ /* +1 to never return 0. */
+ return active + inactive + free + 1;
+}
+
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
@@ -127,22 +139,31 @@ get_dirty_limits(long *pbackground, long
int unmapped_ratio;
long background;
long dirty;
- unsigned long available_memory = vm_total_pages;
+ unsigned long total_pages;
+ unsigned long available_memory;
struct task_struct *tsk;
+ available_memory = total_pages = total_pageable_pages();
+
#ifdef CONFIG_HIGHMEM
/*
* If this mapping can only allocate from low memory,
* we exclude high memory from our count.
*/
- if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
+ if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM)) {
+ if (available_memory > totalhigh_pages)
available_memory -= totalhigh_pages;
+ else
+ available_memory = 1;
+ }
#endif
unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
- vm_total_pages;
+ total_pages;
+ if (unmapped_ratio < 0)
+ unmapped_ratio = 0;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
@@ -513,7 +534,7 @@ void laptop_sync_completion(void)
void writeback_set_ratelimit(void)
{
- ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+ ratelimit_pages = total_pageable_pages() / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
@@ -542,7 +563,7 @@ void __init page_writeback_init(void)
long buffer_pages = nr_free_buffer_pages();
long correction;
- correction = (100 * 4 * buffer_pages) / vm_total_pages;
+ correction = (100 * 4 * buffer_pages) / total_pageable_pages();
if (correction < 100) {
dirty_background_ratio *= correction;
On Thu, 2007-01-18 at 14:27 +0100, Peter Zijlstra wrote:
> On Wed, 2007-01-17 at 16:54 -0500, Trond Myklebust wrote:
> > On Wed, 2007-01-17 at 22:52 +0100, Peter Zijlstra wrote:
> >
> > > >
> > > > > Index: linux-2.6-git/fs/inode.c
> > > > > ===================================================================
> > > > > --- linux-2.6-git.orig/fs/inode.c 2007-01-12 08:03:47.000000000 +0100
> > > > > +++ linux-2.6-git/fs/inode.c 2007-01-12 08:53:26.000000000 +0100
> > > > > @@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
> > > > > * the i_state of an inode while it is in use..
> > > > > */
> > > > > DEFINE_SPINLOCK(inode_lock);
> > > > > +EXPORT_SYMBOL_GPL(inode_lock);
> > > >
> > > > Hmmm... Commits to all NFS servers will be globally serialized via the
> > > > inode_lock?
> > >
> > > Hmm, right, thats not good indeed, I can pull the call to
> > > nfs_commit_list() out of that loop.
> >
> > There is no reason to modify any of the commit stuff. Please just drop
> > that code.
>
> I though you agreed to flushing commit pages when hitting the dirty page
> limit.
After the dirty page has been written to unstable storage, it marks the
inode using I_DIRTY_DATASYNC, which should then ensure that the VFS
calls write_inode() on the next pass through __sync_single_inode.
> Or would you rather see a notifier chain from congestion_wait() calling
> into the various NFS mounts?
I'd rather like to see fs/fs-writeback.c do this correctly (assuming
that it is incorrect now).
Trond
On Thu, 18 Jan 2007, Nikita Danilov wrote:
> I think that simpler solution of this problem is to use only potentially
> reclaimable pages (that is, active, inactive, and free pages) to
> calculate writeout threshold. This way there is no need to maintain
> counters for unreclaimable pages. Below is a patch implementing this
> idea, it got some testing.
Hmmm... the problem is that it is expensive to calculate these numbers on
larger systems. In order to calculate active and inactive pages we
have to first go through all the zones of the system. In a NUMA system
there could be many zones.
> +/* Maximal number of pages that can be consumed by pageable caches. */
> +static unsigned long total_pageable_pages(void)
> +{
> + unsigned long active;
> + unsigned long inactive;
> + unsigned long free;
> +
> + get_zone_counts(&active, &inactive, &free);
> + /* +1 to never return 0. */
> + return active + inactive + free + 1;
> +}
An expensive function. And we need to call it whenever we calculate dirty
limits.
Maybe could create ZVC counters that allow an inexpensive determination of
these numbers? Then we first need to make sure that the counters are not
assumed to be accurate at all times.
On Thu, 2007-01-18 at 10:49 -0500, Trond Myklebust wrote:
> After the dirty page has been written to unstable storage, it marks the
> inode using I_DIRTY_DATASYNC, which should then ensure that the VFS
> calls write_inode() on the next pass through __sync_single_inode.
> I'd rather like to see fs/fs-writeback.c do this correctly (assuming
> that it is incorrect now).
balance_dirty_pages()
wbc.sync_mode = WB_SYNC_NONE;
writeback_inodes()
sync_sb_inodes()
__writeback_single_inode()
__sync_single_inode()
write_inode()
nfs_write_inode()
Ah, yes, I see. That ought to work.
/me goes verify he didn't mess it up himself...
On Fri, 2007-01-19 at 10:34 +0100, Peter Zijlstra wrote:
> On Thu, 2007-01-18 at 10:49 -0500, Trond Myklebust wrote:
>
> > After the dirty page has been written to unstable storage, it marks the
> > inode using I_DIRTY_DATASYNC, which should then ensure that the VFS
> > calls write_inode() on the next pass through __sync_single_inode.
>
> > I'd rather like to see fs/fs-writeback.c do this correctly (assuming
> > that it is incorrect now).
>
> balance_dirty_pages()
> wbc.sync_mode = WB_SYNC_NONE;
> writeback_inodes()
> sync_sb_inodes()
> __writeback_single_inode()
> __sync_single_inode()
> write_inode()
> nfs_write_inode()
>
> Ah, yes, I see. That ought to work.
>
> /me goes verify he didn't mess it up himself...
And of course I did. This seems to work.
The problem I had was that throttle_vm_writeout() got stuck on commit
pages. But that can be solved with a much much simpler and better
solution. Force all swap traffic to be |FLUSH_STABLE, unstable swap
space doesn't make sense anyway :-)
So with that out of the way I now have this
---
Subject: nfs: fix congestion control
The current NFS client congestion logic is severly broken, it marks the backing
device congested during each nfs_writepages() call but doesn't mirror this in
nfs_writepage() which makes for deadlocks. Also it implements its own waitqueue.
Replace this by a more regular congestion implementation that puts a cap on the
number of active writeback pages and uses the bdi congestion waitqueue.
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Trond Myklebust <[email protected]>
---
fs/nfs/write.c | 113 ++++++++++++++++++++++++++++----------------
include/linux/backing-dev.h | 1
include/linux/nfs_fs_sb.h | 1
mm/backing-dev.c | 16 ++++++
4 files changed, 90 insertions(+), 41 deletions(-)
Index: linux-2.6-git/fs/nfs/write.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/write.c 2007-01-19 13:57:49.000000000 +0100
+++ linux-2.6-git/fs/nfs/write.c 2007-01-19 14:03:30.000000000 +0100
@@ -89,8 +89,6 @@ static struct kmem_cache *nfs_wdata_cach
static mempool_t *nfs_wdata_mempool;
static mempool_t *nfs_commit_mempool;
-static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
-
struct nfs_write_data *nfs_commit_alloc(void)
{
struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
@@ -245,6 +243,39 @@ static int wb_priority(struct writeback_
}
/*
+ * NFS congestion control
+ */
+
+static unsigned long nfs_congestion_pages;
+
+#define NFS_CONGESTION_ON_THRESH nfs_congestion_pages
+#define NFS_CONGESTION_OFF_THRESH \
+ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+
+static inline void nfs_set_page_writeback(struct page *page)
+{
+ if (!test_set_page_writeback(page)) {
+ struct inode *inode = page->mapping->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ if (atomic_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(&nfss->backing_dev_info, WRITE);
+ }
+}
+
+static inline void nfs_end_page_writeback(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ end_page_writeback(page);
+ if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+ clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+ congestion_end(WRITE);
+ }
+}
+
+/*
* Find an associated nfs write request, and prepare to flush it out
* Returns 1 if there was no write request, or if the request was
* already tagged by nfs_set_page_dirty.Returns 0 if the request
@@ -281,7 +312,7 @@ static int nfs_page_mark_flush(struct pa
spin_unlock(req_lock);
if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
nfs_mark_request_dirty(req);
- set_page_writeback(page);
+ nfs_set_page_writeback(page);
}
ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
nfs_unlock_request(req);
@@ -336,13 +367,8 @@ int nfs_writepage(struct page *page, str
return err;
}
-/*
- * Note: causes nfs_update_request() to block on the assumption
- * that the writeback is generated due to memory pressure.
- */
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
struct inode *inode = mapping->host;
int err;
@@ -351,11 +377,6 @@ int nfs_writepages(struct address_space
err = generic_writepages(mapping, wbc);
if (err)
return err;
- while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
- if (wbc->nonblocking)
- return 0;
- nfs_wait_on_write_congestion(mapping, 0);
- }
err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
if (err < 0)
goto out;
@@ -369,9 +390,6 @@ int nfs_writepages(struct address_space
if (err > 0)
err = 0;
out:
- clear_bit(BDI_write_congested, &bdi->state);
- wake_up_all(&nfs_write_congestion);
- congestion_end(WRITE);
return err;
}
@@ -401,7 +419,7 @@ static int nfs_inode_add_request(struct
}
/*
- * Insert a write request into an inode
+ * Remove a write request from an inode
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
@@ -585,8 +603,8 @@ static inline int nfs_scan_commit(struct
static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
{
+ struct inode *inode = mapping->host;
struct backing_dev_info *bdi = mapping->backing_dev_info;
- DEFINE_WAIT(wait);
int ret = 0;
might_sleep();
@@ -594,31 +612,27 @@ static int nfs_wait_on_write_congestion(
if (!bdi_write_congested(bdi))
return 0;
- nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
+ nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
- if (intr) {
- struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
- sigset_t oldset;
-
- rpc_clnt_sigmask(clnt, &oldset);
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
- if (bdi_write_congested(bdi)) {
- if (signalled())
- ret = -ERESTARTSYS;
- else
- schedule();
+ do {
+ if (intr) {
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ sigset_t oldset;
+
+ rpc_clnt_sigmask(clnt, &oldset);
+ ret = congestion_wait_interruptible(WRITE, HZ/10);
+ rpc_clnt_sigunmask(clnt, &oldset);
+ if (ret == -ERESTARTSYS)
+ return ret;
+ ret = 0;
+ } else {
+ congestion_wait(WRITE, HZ/10);
}
- rpc_clnt_sigunmask(clnt, &oldset);
- } else {
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
- if (bdi_write_congested(bdi))
- schedule();
- }
- finish_wait(&nfs_write_congestion, &wait);
+ } while (bdi_write_congested(bdi));
+
return ret;
}
-
/*
* Try to update any existing write request, or create one if there is none.
* In order to match, the request's credentials must match those of
@@ -779,7 +793,7 @@ int nfs_updatepage(struct file *file, st
static void nfs_writepage_release(struct nfs_page *req)
{
- end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req->wb_page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (!PageError(req->wb_page)) {
@@ -1095,12 +1109,12 @@ static void nfs_writeback_done_full(stru
ClearPageUptodate(page);
SetPageError(page);
req->wb_context->error = task->tk_status;
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
nfs_inode_remove_request(req);
dprintk(", error = %d\n", task->tk_status);
goto next;
}
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
@@ -1565,6 +1579,23 @@ int __init nfs_init_writepagecache(void)
if (nfs_commit_mempool == NULL)
return -ENOMEM;
+ /*
+ * NFS congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ */
+ nfs_congestion_size = 32*int_sqrt(totalram_pages);
+
return 0;
}
Index: linux-2.6-git/mm/backing-dev.c
===================================================================
--- linux-2.6-git.orig/mm/backing-dev.c 2007-01-19 13:57:49.000000000 +0100
+++ linux-2.6-git/mm/backing-dev.c 2007-01-19 13:57:52.000000000 +0100
@@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeou
}
EXPORT_SYMBOL(congestion_wait);
+long congestion_wait_interruptible(int rw, long timeout)
+{
+ long ret;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ ret = -ERESTARTSYS;
+ else
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait_interruptible);
+
/**
* congestion_end - wake up sleepers on a congested backing_dev_info
* @rw: READ or WRITE
Index: linux-2.6-git/include/linux/nfs_fs_sb.h
===================================================================
--- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-19 13:57:49.000000000 +0100
+++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-19 13:57:52.000000000 +0100
@@ -82,6 +82,7 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nfs_iostats * io_stats; /* I/O statistics */
struct backing_dev_info backing_dev_info;
+ atomic_t writeback; /* number of writeback pages */
int flags; /* various flags */
unsigned int caps; /* server capabilities */
unsigned int rsize; /* read size */
Index: linux-2.6-git/include/linux/backing-dev.h
===================================================================
--- linux-2.6-git.orig/include/linux/backing-dev.h 2007-01-19 13:57:49.000000000 +0100
+++ linux-2.6-git/include/linux/backing-dev.h 2007-01-19 13:57:52.000000000 +0100
@@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struc
void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
void set_bdi_congested(struct backing_dev_info *bdi, int rw);
long congestion_wait(int rw, long timeout);
+long congestion_wait_interruptible(int rw, long timeout);
void congestion_end(int rw);
#define bdi_cap_writeback_dirty(bdi) \
On Fri, 2007-01-19 at 14:07 +0100, Peter Zijlstra wrote:
> On Fri, 2007-01-19 at 10:34 +0100, Peter Zijlstra wrote:
> > On Thu, 2007-01-18 at 10:49 -0500, Trond Myklebust wrote:
> >
> > > After the dirty page has been written to unstable storage, it marks the
> > > inode using I_DIRTY_DATASYNC, which should then ensure that the VFS
> > > calls write_inode() on the next pass through __sync_single_inode.
> >
> > > I'd rather like to see fs/fs-writeback.c do this correctly (assuming
> > > that it is incorrect now).
> >
> > balance_dirty_pages()
> > wbc.sync_mode = WB_SYNC_NONE;
> > writeback_inodes()
> > sync_sb_inodes()
> > __writeback_single_inode()
> > __sync_single_inode()
> > write_inode()
> > nfs_write_inode()
> >
> > Ah, yes, I see. That ought to work.
> >
> > /me goes verify he didn't mess it up himself...
>
> And of course I did. This seems to work.
>
> The problem I had was that throttle_vm_writeout() got stuck on commit
> pages. But that can be solved with a much much simpler and better
> solution. Force all swap traffic to be |FLUSH_STABLE, unstable swap
> space doesn't make sense anyway :-)
Yep. Since we don't care too much about metadata updates on the swap
file (the file size should never change), we could perhaps make the
swapper write out the pages with DATA_SYNC rather than the full
FILE_SYNC.
> So with that out of the way I now have this
Looks much better. Just one obvious buglet...
> ---
>
> Subject: nfs: fix congestion control
>
> The current NFS client congestion logic is severly broken, it marks the backing
> device congested during each nfs_writepages() call but doesn't mirror this in
> nfs_writepage() which makes for deadlocks. Also it implements its own waitqueue.
>
> Replace this by a more regular congestion implementation that puts a cap on the
> number of active writeback pages and uses the bdi congestion waitqueue.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Cc: Trond Myklebust <[email protected]>
> ---
> fs/nfs/write.c | 113 ++++++++++++++++++++++++++++----------------
> include/linux/backing-dev.h | 1
> include/linux/nfs_fs_sb.h | 1
> mm/backing-dev.c | 16 ++++++
> 4 files changed, 90 insertions(+), 41 deletions(-)
>
> Index: linux-2.6-git/fs/nfs/write.c
> ===================================================================
> --- linux-2.6-git.orig/fs/nfs/write.c 2007-01-19 13:57:49.000000000 +0100
> +++ linux-2.6-git/fs/nfs/write.c 2007-01-19 14:03:30.000000000 +0100
> @@ -89,8 +89,6 @@ static struct kmem_cache *nfs_wdata_cach
> static mempool_t *nfs_wdata_mempool;
> static mempool_t *nfs_commit_mempool;
>
> -static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
> -
> struct nfs_write_data *nfs_commit_alloc(void)
> {
> struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
> @@ -245,6 +243,39 @@ static int wb_priority(struct writeback_
> }
>
> /*
> + * NFS congestion control
> + */
> +
> +static unsigned long nfs_congestion_pages;
> +
> +#define NFS_CONGESTION_ON_THRESH nfs_congestion_pages
> +#define NFS_CONGESTION_OFF_THRESH \
> + (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
> +
> +static inline void nfs_set_page_writeback(struct page *page)
> +{
> + if (!test_set_page_writeback(page)) {
> + struct inode *inode = page->mapping->host;
> + struct nfs_server *nfss = NFS_SERVER(inode);
> +
> + if (atomic_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
> + set_bdi_congested(&nfss->backing_dev_info, WRITE);
> + }
> +}
> +
> +static inline void nfs_end_page_writeback(struct page *page)
> +{
> + struct inode *inode = page->mapping->host;
> + struct nfs_server *nfss = NFS_SERVER(inode);
> +
> + end_page_writeback(page);
> + if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
> + clear_bdi_congested(&nfss->backing_dev_info, WRITE);
> + congestion_end(WRITE);
> + }
> +}
> +
> +/*
> * Find an associated nfs write request, and prepare to flush it out
> * Returns 1 if there was no write request, or if the request was
> * already tagged by nfs_set_page_dirty.Returns 0 if the request
> @@ -281,7 +312,7 @@ static int nfs_page_mark_flush(struct pa
> spin_unlock(req_lock);
> if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
> nfs_mark_request_dirty(req);
> - set_page_writeback(page);
> + nfs_set_page_writeback(page);
> }
> ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
> nfs_unlock_request(req);
> @@ -336,13 +367,8 @@ int nfs_writepage(struct page *page, str
> return err;
> }
>
> -/*
> - * Note: causes nfs_update_request() to block on the assumption
> - * that the writeback is generated due to memory pressure.
> - */
> int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
> {
> - struct backing_dev_info *bdi = mapping->backing_dev_info;
> struct inode *inode = mapping->host;
> int err;
>
> @@ -351,11 +377,6 @@ int nfs_writepages(struct address_space
> err = generic_writepages(mapping, wbc);
> if (err)
> return err;
> - while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
> - if (wbc->nonblocking)
> - return 0;
> - nfs_wait_on_write_congestion(mapping, 0);
> - }
> err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
> if (err < 0)
> goto out;
> @@ -369,9 +390,6 @@ int nfs_writepages(struct address_space
> if (err > 0)
> err = 0;
> out:
> - clear_bit(BDI_write_congested, &bdi->state);
> - wake_up_all(&nfs_write_congestion);
> - congestion_end(WRITE);
> return err;
> }
>
> @@ -401,7 +419,7 @@ static int nfs_inode_add_request(struct
> }
>
> /*
> - * Insert a write request into an inode
> + * Remove a write request from an inode
> */
> static void nfs_inode_remove_request(struct nfs_page *req)
> {
> @@ -585,8 +603,8 @@ static inline int nfs_scan_commit(struct
>
> static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
> {
> + struct inode *inode = mapping->host;
> struct backing_dev_info *bdi = mapping->backing_dev_info;
> - DEFINE_WAIT(wait);
> int ret = 0;
>
> might_sleep();
> @@ -594,31 +612,27 @@ static int nfs_wait_on_write_congestion(
> if (!bdi_write_congested(bdi))
> return 0;
>
> - nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
> + nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
>
> - if (intr) {
> - struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
> - sigset_t oldset;
> -
> - rpc_clnt_sigmask(clnt, &oldset);
> - prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
> - if (bdi_write_congested(bdi)) {
> - if (signalled())
> - ret = -ERESTARTSYS;
> - else
> - schedule();
> + do {
> + if (intr) {
> + struct rpc_clnt *clnt = NFS_CLIENT(inode);
> + sigset_t oldset;
> +
> + rpc_clnt_sigmask(clnt, &oldset);
> + ret = congestion_wait_interruptible(WRITE, HZ/10);
> + rpc_clnt_sigunmask(clnt, &oldset);
> + if (ret == -ERESTARTSYS)
> + return ret;
> + ret = 0;
> + } else {
> + congestion_wait(WRITE, HZ/10);
> }
> - rpc_clnt_sigunmask(clnt, &oldset);
> - } else {
> - prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
> - if (bdi_write_congested(bdi))
> - schedule();
> - }
> - finish_wait(&nfs_write_congestion, &wait);
> + } while (bdi_write_congested(bdi));
> +
> return ret;
> }
>
> -
> /*
> * Try to update any existing write request, or create one if there is none.
> * In order to match, the request's credentials must match those of
> @@ -779,7 +793,7 @@ int nfs_updatepage(struct file *file, st
>
> static void nfs_writepage_release(struct nfs_page *req)
> {
> - end_page_writeback(req->wb_page);
> + nfs_end_page_writeback(req->wb_page);
>
> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
> if (!PageError(req->wb_page)) {
> @@ -1095,12 +1109,12 @@ static void nfs_writeback_done_full(stru
> ClearPageUptodate(page);
> SetPageError(page);
> req->wb_context->error = task->tk_status;
> - end_page_writeback(page);
> + nfs_end_page_writeback(page);
> nfs_inode_remove_request(req);
> dprintk(", error = %d\n", task->tk_status);
> goto next;
> }
> - end_page_writeback(page);
> + nfs_end_page_writeback(page);
>
> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
> if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
> @@ -1565,6 +1579,23 @@ int __init nfs_init_writepagecache(void)
> if (nfs_commit_mempool == NULL)
> return -ENOMEM;
>
> + /*
> + * NFS congestion size, scale with available memory.
> + *
> + * 64MB: 8192k
> + * 128MB: 11585k
> + * 256MB: 16384k
> + * 512MB: 23170k
> + * 1GB: 32768k
> + * 2GB: 46340k
> + * 4GB: 65536k
> + * 8GB: 92681k
> + * 16GB: 131072k
> + *
> + * This allows larger machines to have larger/more transfers.
> + */
> + nfs_congestion_size = 32*int_sqrt(totalram_pages);
> +
^^^^^^^^^^^^^^^^^^^ nfs_congestion_pages?
> return 0;
> }
>
> Index: linux-2.6-git/mm/backing-dev.c
> ===================================================================
> --- linux-2.6-git.orig/mm/backing-dev.c 2007-01-19 13:57:49.000000000 +0100
> +++ linux-2.6-git/mm/backing-dev.c 2007-01-19 13:57:52.000000000 +0100
> @@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeou
> }
> EXPORT_SYMBOL(congestion_wait);
>
> +long congestion_wait_interruptible(int rw, long timeout)
> +{
> + long ret;
> + DEFINE_WAIT(wait);
> + wait_queue_head_t *wqh = &congestion_wqh[rw];
> +
> + prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
> + if (signal_pending(current))
> + ret = -ERESTARTSYS;
> + else
> + ret = io_schedule_timeout(timeout);
> + finish_wait(wqh, &wait);
> + return ret;
> +}
> +EXPORT_SYMBOL(congestion_wait_interruptible);
> +
> /**
> * congestion_end - wake up sleepers on a congested backing_dev_info
> * @rw: READ or WRITE
> Index: linux-2.6-git/include/linux/nfs_fs_sb.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-19 13:57:49.000000000 +0100
> +++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-19 13:57:52.000000000 +0100
> @@ -82,6 +82,7 @@ struct nfs_server {
> struct rpc_clnt * client_acl; /* ACL RPC client handle */
> struct nfs_iostats * io_stats; /* I/O statistics */
> struct backing_dev_info backing_dev_info;
> + atomic_t writeback; /* number of writeback pages */
> int flags; /* various flags */
> unsigned int caps; /* server capabilities */
> unsigned int rsize; /* read size */
> Index: linux-2.6-git/include/linux/backing-dev.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/backing-dev.h 2007-01-19 13:57:49.000000000 +0100
> +++ linux-2.6-git/include/linux/backing-dev.h 2007-01-19 13:57:52.000000000 +0100
> @@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struc
> void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
> void set_bdi_congested(struct backing_dev_info *bdi, int rw);
> long congestion_wait(int rw, long timeout);
> +long congestion_wait_interruptible(int rw, long timeout);
> void congestion_end(int rw);
>
> #define bdi_cap_writeback_dirty(bdi) \
>
>
On Fri, 19 Jan 2007, Peter Zijlstra wrote:
> + /*
> + * NFS congestion size, scale with available memory.
> + *
Well this all depends on the memory available to the running process.
If the process is just allowed to allocate from a subset of memory
(cpusets) then this may need to be lower.
> + * 64MB: 8192k
> + * 128MB: 11585k
> + * 256MB: 16384k
> + * 512MB: 23170k
> + * 1GB: 32768k
> + * 2GB: 46340k
> + * 4GB: 65536k
> + * 8GB: 92681k
> + * 16GB: 131072k
Hmmm... lets say we have the worst case of an 8TB IA64 system with 1k
nodes of 8G each. On Ia64 the number of pages is 8TB/16KB pagesize = 512
million pages. Thus nfs_congestion_size is 724064 pages which is
11.1Gbytes?
If we now restrict a cpuset to a single node then have a
nfs_congestion_size of 11.1G vs an available memory on a node of 8G.
On Fri, 2007-01-19 at 11:51 -0500, Trond Myklebust wrote:
> > So with that out of the way I now have this
>
> Looks much better. Just one obvious buglet...
> > @@ -1565,6 +1579,23 @@ int __init nfs_init_writepagecache(void)
> > if (nfs_commit_mempool == NULL)
> > return -ENOMEM;
> >
> > + /*
> > + * NFS congestion size, scale with available memory.
> > + *
> > + * 64MB: 8192k
> > + * 128MB: 11585k
> > + * 256MB: 16384k
> > + * 512MB: 23170k
> > + * 1GB: 32768k
> > + * 2GB: 46340k
> > + * 4GB: 65536k
> > + * 8GB: 92681k
> > + * 16GB: 131072k
> > + *
> > + * This allows larger machines to have larger/more transfers.
> > + */
> > + nfs_congestion_size = 32*int_sqrt(totalram_pages);
> > +
> ^^^^^^^^^^^^^^^^^^^ nfs_congestion_pages?
Ah, yeah, forgot to refresh the patch one last time before sending
out :-(.
On Fri, 2007-01-19 at 09:20 -0800, Christoph Lameter wrote:
> On Fri, 19 Jan 2007, Peter Zijlstra wrote:
>
> > + /*
> > + * NFS congestion size, scale with available memory.
> > + *
>
> Well this all depends on the memory available to the running process.
> If the process is just allowed to allocate from a subset of memory
> (cpusets) then this may need to be lower.
>
> > + * 64MB: 8192k
> > + * 128MB: 11585k
> > + * 256MB: 16384k
> > + * 512MB: 23170k
> > + * 1GB: 32768k
> > + * 2GB: 46340k
> > + * 4GB: 65536k
> > + * 8GB: 92681k
> > + * 16GB: 131072k
>
> Hmmm... lets say we have the worst case of an 8TB IA64 system with 1k
> nodes of 8G each.
Eeuh, right. Glad to have you around to remind how puny my boxens
are :-)
> On Ia64 the number of pages is 8TB/16KB pagesize = 512
> million pages. Thus nfs_congestion_size is 724064 pages which is
> 11.1Gbytes?
>
> If we now restrict a cpuset to a single node then have a
> nfs_congestion_size of 11.1G vs an available memory on a node of 8G.
Right, perhaps cap this to a max of 256M. That would allow 128 2M RPC
transfers, much more would not be needed I guess. Trond?
On Fri, 19 Jan 2007, Peter Zijlstra wrote:
> Eeuh, right. Glad to have you around to remind how puny my boxens
> are :-)
Sorry about that but it was unavoidable if we want to get to reasonable
limits that will work in all situations.
On Fri, 2007-01-19 at 18:57 +0100, Peter Zijlstra wrote:
> On Fri, 2007-01-19 at 09:20 -0800, Christoph Lameter wrote:
> > On Fri, 19 Jan 2007, Peter Zijlstra wrote:
> >
> > > + /*
> > > + * NFS congestion size, scale with available memory.
> > > + *
> >
> > Well this all depends on the memory available to the running process.
> > If the process is just allowed to allocate from a subset of memory
> > (cpusets) then this may need to be lower.
> >
> > > + * 64MB: 8192k
> > > + * 128MB: 11585k
> > > + * 256MB: 16384k
> > > + * 512MB: 23170k
> > > + * 1GB: 32768k
> > > + * 2GB: 46340k
> > > + * 4GB: 65536k
> > > + * 8GB: 92681k
> > > + * 16GB: 131072k
> >
> > Hmmm... lets say we have the worst case of an 8TB IA64 system with 1k
> > nodes of 8G each.
>
> Eeuh, right. Glad to have you around to remind how puny my boxens
> are :-)
>
> > On Ia64 the number of pages is 8TB/16KB pagesize = 512
> > million pages. Thus nfs_congestion_size is 724064 pages which is
> > 11.1Gbytes?
> >
> > If we now restrict a cpuset to a single node then have a
> > nfs_congestion_size of 11.1G vs an available memory on a node of 8G.
>
> Right, perhaps cap this to a max of 256M. That would allow 128 2M RPC
> transfers, much more would not be needed I guess. Trond?
That would be good as a default, but I've been thinking that we could
perhaps also add a sysctl in /proc/sys/fs/nfs in order to make it a
tunable?
Cheers,
Trond
On Fri, 19 Jan 2007, Trond Myklebust wrote:
> That would be good as a default, but I've been thinking that we could
> perhaps also add a sysctl in /proc/sys/fs/nfs in order to make it a
> tunable?
Good idea.
Subject: nfs: fix congestion control
The current NFS client congestion logic is severly broken, it marks the backing
device congested during each nfs_writepages() call but doesn't mirror this in
nfs_writepage() which makes for deadlocks. Also it implements its own waitqueue.
Replace this by a more regular congestion implementation that puts a cap on the
number of active writeback pages and uses the bdi congestion waitqueue.
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Trond Myklebust <[email protected]>
---
fs/nfs/super.c | 4 -
fs/nfs/sysctl.c | 8 +++
fs/nfs/write.c | 116 ++++++++++++++++++++++++++++----------------
include/linux/backing-dev.h | 1
include/linux/nfs_fs.h | 1
include/linux/nfs_fs_sb.h | 1
mm/backing-dev.c | 16 ++++++
7 files changed, 104 insertions(+), 43 deletions(-)
Index: linux-2.6-git/fs/nfs/write.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/write.c 2007-01-20 07:20:10.000000000 +0100
+++ linux-2.6-git/fs/nfs/write.c 2007-01-20 07:20:12.000000000 +0100
@@ -89,8 +89,6 @@ static struct kmem_cache *nfs_wdata_cach
static mempool_t *nfs_wdata_mempool;
static mempool_t *nfs_commit_mempool;
-static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
-
struct nfs_write_data *nfs_commit_alloc(void)
{
struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
@@ -245,6 +243,39 @@ static int wb_priority(struct writeback_
}
/*
+ * NFS congestion control
+ */
+
+int nfs_congestion_kb;
+
+#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
+#define NFS_CONGESTION_OFF_THRESH \
+ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+
+static inline void nfs_set_page_writeback(struct page *page)
+{
+ if (!test_set_page_writeback(page)) {
+ struct inode *inode = page->mapping->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ if (atomic_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(&nfss->backing_dev_info, WRITE);
+ }
+}
+
+static inline void nfs_end_page_writeback(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ end_page_writeback(page);
+ if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+ clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+ congestion_end(WRITE);
+ }
+}
+
+/*
* Find an associated nfs write request, and prepare to flush it out
* Returns 1 if there was no write request, or if the request was
* already tagged by nfs_set_page_dirty.Returns 0 if the request
@@ -281,7 +312,7 @@ static int nfs_page_mark_flush(struct pa
spin_unlock(req_lock);
if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
nfs_mark_request_dirty(req);
- set_page_writeback(page);
+ nfs_set_page_writeback(page);
}
ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
nfs_unlock_request(req);
@@ -336,13 +367,8 @@ int nfs_writepage(struct page *page, str
return err;
}
-/*
- * Note: causes nfs_update_request() to block on the assumption
- * that the writeback is generated due to memory pressure.
- */
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
struct inode *inode = mapping->host;
int err;
@@ -351,11 +377,6 @@ int nfs_writepages(struct address_space
err = generic_writepages(mapping, wbc);
if (err)
return err;
- while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
- if (wbc->nonblocking)
- return 0;
- nfs_wait_on_write_congestion(mapping, 0);
- }
err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
if (err < 0)
goto out;
@@ -369,9 +390,6 @@ int nfs_writepages(struct address_space
if (err > 0)
err = 0;
out:
- clear_bit(BDI_write_congested, &bdi->state);
- wake_up_all(&nfs_write_congestion);
- congestion_end(WRITE);
return err;
}
@@ -401,7 +419,7 @@ static int nfs_inode_add_request(struct
}
/*
- * Insert a write request into an inode
+ * Remove a write request from an inode
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
@@ -585,8 +603,8 @@ static inline int nfs_scan_commit(struct
static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
{
+ struct inode *inode = mapping->host;
struct backing_dev_info *bdi = mapping->backing_dev_info;
- DEFINE_WAIT(wait);
int ret = 0;
might_sleep();
@@ -594,31 +612,27 @@ static int nfs_wait_on_write_congestion(
if (!bdi_write_congested(bdi))
return 0;
- nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
+ nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
- if (intr) {
- struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
- sigset_t oldset;
-
- rpc_clnt_sigmask(clnt, &oldset);
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
- if (bdi_write_congested(bdi)) {
- if (signalled())
- ret = -ERESTARTSYS;
- else
- schedule();
+ do {
+ if (intr) {
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ sigset_t oldset;
+
+ rpc_clnt_sigmask(clnt, &oldset);
+ ret = congestion_wait_interruptible(WRITE, HZ/10);
+ rpc_clnt_sigunmask(clnt, &oldset);
+ if (ret == -ERESTARTSYS)
+ return ret;
+ ret = 0;
+ } else {
+ congestion_wait(WRITE, HZ/10);
}
- rpc_clnt_sigunmask(clnt, &oldset);
- } else {
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
- if (bdi_write_congested(bdi))
- schedule();
- }
- finish_wait(&nfs_write_congestion, &wait);
+ } while (bdi_write_congested(bdi));
+
return ret;
}
-
/*
* Try to update any existing write request, or create one if there is none.
* In order to match, the request's credentials must match those of
@@ -779,7 +793,7 @@ int nfs_updatepage(struct file *file, st
static void nfs_writepage_release(struct nfs_page *req)
{
- end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req->wb_page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (!PageError(req->wb_page)) {
@@ -1095,12 +1109,12 @@ static void nfs_writeback_done_full(stru
ClearPageUptodate(page);
SetPageError(page);
req->wb_context->error = task->tk_status;
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
nfs_inode_remove_request(req);
dprintk(", error = %d\n", task->tk_status);
goto next;
}
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
@@ -1565,6 +1579,26 @@ int __init nfs_init_writepagecache(void)
if (nfs_commit_mempool == NULL)
return -ENOMEM;
+ /*
+ * NFS congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (nfs_congestion_kb > 256*1024)
+ nfs_congestion_kb = 256*1024;
+
return 0;
}
Index: linux-2.6-git/mm/backing-dev.c
===================================================================
--- linux-2.6-git.orig/mm/backing-dev.c 2007-01-20 07:20:10.000000000 +0100
+++ linux-2.6-git/mm/backing-dev.c 2007-01-20 07:20:12.000000000 +0100
@@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeou
}
EXPORT_SYMBOL(congestion_wait);
+long congestion_wait_interruptible(int rw, long timeout)
+{
+ long ret;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ ret = -ERESTARTSYS;
+ else
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait_interruptible);
+
/**
* congestion_end - wake up sleepers on a congested backing_dev_info
* @rw: READ or WRITE
Index: linux-2.6-git/include/linux/nfs_fs_sb.h
===================================================================
--- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-20 07:20:10.000000000 +0100
+++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-20 07:20:12.000000000 +0100
@@ -82,6 +82,7 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nfs_iostats * io_stats; /* I/O statistics */
struct backing_dev_info backing_dev_info;
+ atomic_t writeback; /* number of writeback pages */
int flags; /* various flags */
unsigned int caps; /* server capabilities */
unsigned int rsize; /* read size */
Index: linux-2.6-git/include/linux/backing-dev.h
===================================================================
--- linux-2.6-git.orig/include/linux/backing-dev.h 2007-01-20 07:20:10.000000000 +0100
+++ linux-2.6-git/include/linux/backing-dev.h 2007-01-20 07:20:12.000000000 +0100
@@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struc
void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
void set_bdi_congested(struct backing_dev_info *bdi, int rw);
long congestion_wait(int rw, long timeout);
+long congestion_wait_interruptible(int rw, long timeout);
void congestion_end(int rw);
#define bdi_cap_writeback_dirty(bdi) \
Index: linux-2.6-git/fs/nfs/sysctl.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/sysctl.c 2007-01-20 07:20:10.000000000 +0100
+++ linux-2.6-git/fs/nfs/sysctl.c 2007-01-20 07:20:12.000000000 +0100
@@ -50,6 +50,14 @@ static ctl_table nfs_cb_sysctls[] = {
.proc_handler = &proc_dointvec_jiffies,
.strategy = &sysctl_jiffies,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nfs_congestion_kb",
+ .data = &nfs_congestion_kb,
+ .maxlen = sizeof(nfs_congestion_kb),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
Index: linux-2.6-git/include/linux/nfs_fs.h
===================================================================
--- linux-2.6-git.orig/include/linux/nfs_fs.h 2007-01-20 07:20:10.000000000 +0100
+++ linux-2.6-git/include/linux/nfs_fs.h 2007-01-20 07:20:12.000000000 +0100
@@ -422,6 +422,7 @@ extern void nfs_complete_unlink(struct d
/*
* linux/fs/nfs/write.c
*/
+extern int nfs_congestion_kb;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
extern int nfs_flush_incompatible(struct file *file, struct page *page);
Index: linux-2.6-git/fs/nfs/super.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/super.c 2007-01-20 07:20:23.000000000 +0100
+++ linux-2.6-git/fs/nfs/super.c 2007-01-20 07:20:29.000000000 +0100
@@ -150,10 +150,10 @@ int __init register_nfs_fs(void)
if (ret < 0)
goto error_0;
-#ifdef CONFIG_NFS_V4
ret = nfs_register_sysctl();
if (ret < 0)
goto error_1;
+#ifdef CONFIG_NFS_V4
ret = register_filesystem(&nfs4_fs_type);
if (ret < 0)
goto error_2;
@@ -164,9 +164,9 @@ int __init register_nfs_fs(void)
#ifdef CONFIG_NFS_V4
error_2:
nfs_unregister_sysctl();
+#endif
error_1:
unregister_filesystem(&nfs_fs_type);
-#endif
error_0:
return ret;
}
On Sat, 2007-01-20 at 08:01 +0100, Peter Zijlstra wrote:
> Subject: nfs: fix congestion control
>
> The current NFS client congestion logic is severly broken, it marks the backing
> device congested during each nfs_writepages() call but doesn't mirror this in
> nfs_writepage() which makes for deadlocks. Also it implements its own waitqueue.
>
> Replace this by a more regular congestion implementation that puts a cap on the
> number of active writeback pages and uses the bdi congestion waitqueue.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Cc: Trond Myklebust <[email protected]>
> ---
> fs/nfs/super.c | 4 -
> fs/nfs/sysctl.c | 8 +++
> fs/nfs/write.c | 116 ++++++++++++++++++++++++++++----------------
> include/linux/backing-dev.h | 1
> include/linux/nfs_fs.h | 1
> include/linux/nfs_fs_sb.h | 1
> mm/backing-dev.c | 16 ++++++
> 7 files changed, 104 insertions(+), 43 deletions(-)
>
> Index: linux-2.6-git/fs/nfs/write.c
> ===================================================================
> --- linux-2.6-git.orig/fs/nfs/write.c 2007-01-20 07:20:10.000000000 +0100
> +++ linux-2.6-git/fs/nfs/write.c 2007-01-20 07:20:12.000000000 +0100
> @@ -89,8 +89,6 @@ static struct kmem_cache *nfs_wdata_cach
> static mempool_t *nfs_wdata_mempool;
> static mempool_t *nfs_commit_mempool;
>
> -static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
> -
> struct nfs_write_data *nfs_commit_alloc(void)
> {
> struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
> @@ -245,6 +243,39 @@ static int wb_priority(struct writeback_
> }
>
> /*
> + * NFS congestion control
> + */
> +
> +int nfs_congestion_kb;
> +
> +#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
> +#define NFS_CONGESTION_OFF_THRESH \
> + (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
> +
> +static inline void nfs_set_page_writeback(struct page *page)
> +{
> + if (!test_set_page_writeback(page)) {
> + struct inode *inode = page->mapping->host;
> + struct nfs_server *nfss = NFS_SERVER(inode);
> +
> + if (atomic_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
> + set_bdi_congested(&nfss->backing_dev_info, WRITE);
> + }
> +}
> +
> +static inline void nfs_end_page_writeback(struct page *page)
> +{
> + struct inode *inode = page->mapping->host;
> + struct nfs_server *nfss = NFS_SERVER(inode);
> +
> + end_page_writeback(page);
> + if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
> + clear_bdi_congested(&nfss->backing_dev_info, WRITE);
> + congestion_end(WRITE);
> + }
> +}
> +
> +/*
> * Find an associated nfs write request, and prepare to flush it out
> * Returns 1 if there was no write request, or if the request was
> * already tagged by nfs_set_page_dirty.Returns 0 if the request
> @@ -281,7 +312,7 @@ static int nfs_page_mark_flush(struct pa
> spin_unlock(req_lock);
> if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
> nfs_mark_request_dirty(req);
> - set_page_writeback(page);
> + nfs_set_page_writeback(page);
> }
> ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
> nfs_unlock_request(req);
> @@ -336,13 +367,8 @@ int nfs_writepage(struct page *page, str
> return err;
> }
>
> -/*
> - * Note: causes nfs_update_request() to block on the assumption
> - * that the writeback is generated due to memory pressure.
> - */
> int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
> {
> - struct backing_dev_info *bdi = mapping->backing_dev_info;
> struct inode *inode = mapping->host;
> int err;
>
> @@ -351,11 +377,6 @@ int nfs_writepages(struct address_space
> err = generic_writepages(mapping, wbc);
> if (err)
> return err;
> - while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
> - if (wbc->nonblocking)
> - return 0;
> - nfs_wait_on_write_congestion(mapping, 0);
> - }
> err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
> if (err < 0)
> goto out;
> @@ -369,9 +390,6 @@ int nfs_writepages(struct address_space
> if (err > 0)
> err = 0;
> out:
> - clear_bit(BDI_write_congested, &bdi->state);
> - wake_up_all(&nfs_write_congestion);
> - congestion_end(WRITE);
> return err;
> }
>
> @@ -401,7 +419,7 @@ static int nfs_inode_add_request(struct
> }
>
> /*
> - * Insert a write request into an inode
> + * Remove a write request from an inode
> */
> static void nfs_inode_remove_request(struct nfs_page *req)
> {
> @@ -585,8 +603,8 @@ static inline int nfs_scan_commit(struct
>
> static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
> {
> + struct inode *inode = mapping->host;
> struct backing_dev_info *bdi = mapping->backing_dev_info;
> - DEFINE_WAIT(wait);
> int ret = 0;
>
> might_sleep();
> @@ -594,31 +612,27 @@ static int nfs_wait_on_write_congestion(
> if (!bdi_write_congested(bdi))
> return 0;
>
> - nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
> + nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
>
> - if (intr) {
> - struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
> - sigset_t oldset;
> -
> - rpc_clnt_sigmask(clnt, &oldset);
> - prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
> - if (bdi_write_congested(bdi)) {
> - if (signalled())
> - ret = -ERESTARTSYS;
> - else
> - schedule();
> + do {
> + if (intr) {
> + struct rpc_clnt *clnt = NFS_CLIENT(inode);
> + sigset_t oldset;
> +
> + rpc_clnt_sigmask(clnt, &oldset);
> + ret = congestion_wait_interruptible(WRITE, HZ/10);
> + rpc_clnt_sigunmask(clnt, &oldset);
> + if (ret == -ERESTARTSYS)
> + return ret;
> + ret = 0;
> + } else {
> + congestion_wait(WRITE, HZ/10);
> }
rpc_clnt_sigmask() will always set the correct sigmask, so you don't
need to test for 'intr'. In fact, since we do want to allow people to
SIGKILL the process even if intr is unset, it would be preferable to
drop that test.
> - rpc_clnt_sigunmask(clnt, &oldset);
> - } else {
> - prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
> - if (bdi_write_congested(bdi))
> - schedule();
> - }
> - finish_wait(&nfs_write_congestion, &wait);
> + } while (bdi_write_congested(bdi));
> +
> return ret;
> }
>
> -
> /*
> * Try to update any existing write request, or create one if there is none.
> * In order to match, the request's credentials must match those of
> @@ -779,7 +793,7 @@ int nfs_updatepage(struct file *file, st
>
> static void nfs_writepage_release(struct nfs_page *req)
> {
> - end_page_writeback(req->wb_page);
> + nfs_end_page_writeback(req->wb_page);
>
> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
> if (!PageError(req->wb_page)) {
> @@ -1095,12 +1109,12 @@ static void nfs_writeback_done_full(stru
> ClearPageUptodate(page);
> SetPageError(page);
> req->wb_context->error = task->tk_status;
> - end_page_writeback(page);
> + nfs_end_page_writeback(page);
> nfs_inode_remove_request(req);
> dprintk(", error = %d\n", task->tk_status);
> goto next;
> }
> - end_page_writeback(page);
> + nfs_end_page_writeback(page);
>
> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
> if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
> @@ -1565,6 +1579,26 @@ int __init nfs_init_writepagecache(void)
> if (nfs_commit_mempool == NULL)
> return -ENOMEM;
>
> + /*
> + * NFS congestion size, scale with available memory.
> + *
> + * 64MB: 8192k
> + * 128MB: 11585k
> + * 256MB: 16384k
> + * 512MB: 23170k
> + * 1GB: 32768k
> + * 2GB: 46340k
> + * 4GB: 65536k
> + * 8GB: 92681k
> + * 16GB: 131072k
> + *
> + * This allows larger machines to have larger/more transfers.
> + * Limit the default to 256M
> + */
> + nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
> + if (nfs_congestion_kb > 256*1024)
> + nfs_congestion_kb = 256*1024;
> +
> return 0;
> }
>
> Index: linux-2.6-git/mm/backing-dev.c
> ===================================================================
> --- linux-2.6-git.orig/mm/backing-dev.c 2007-01-20 07:20:10.000000000 +0100
> +++ linux-2.6-git/mm/backing-dev.c 2007-01-20 07:20:12.000000000 +0100
> @@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeou
> }
> EXPORT_SYMBOL(congestion_wait);
>
> +long congestion_wait_interruptible(int rw, long timeout)
> +{
> + long ret;
> + DEFINE_WAIT(wait);
> + wait_queue_head_t *wqh = &congestion_wqh[rw];
> +
> + prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
> + if (signal_pending(current))
> + ret = -ERESTARTSYS;
> + else
> + ret = io_schedule_timeout(timeout);
> + finish_wait(wqh, &wait);
> + return ret;
> +}
> +EXPORT_SYMBOL(congestion_wait_interruptible);
> +
> /**
> * congestion_end - wake up sleepers on a congested backing_dev_info
> * @rw: READ or WRITE
> Index: linux-2.6-git/include/linux/nfs_fs_sb.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-20 07:20:10.000000000 +0100
> +++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-20 07:20:12.000000000 +0100
> @@ -82,6 +82,7 @@ struct nfs_server {
> struct rpc_clnt * client_acl; /* ACL RPC client handle */
> struct nfs_iostats * io_stats; /* I/O statistics */
> struct backing_dev_info backing_dev_info;
> + atomic_t writeback; /* number of writeback pages */
> int flags; /* various flags */
> unsigned int caps; /* server capabilities */
> unsigned int rsize; /* read size */
> Index: linux-2.6-git/include/linux/backing-dev.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/backing-dev.h 2007-01-20 07:20:10.000000000 +0100
> +++ linux-2.6-git/include/linux/backing-dev.h 2007-01-20 07:20:12.000000000 +0100
> @@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struc
> void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
> void set_bdi_congested(struct backing_dev_info *bdi, int rw);
> long congestion_wait(int rw, long timeout);
> +long congestion_wait_interruptible(int rw, long timeout);
> void congestion_end(int rw);
>
> #define bdi_cap_writeback_dirty(bdi) \
> Index: linux-2.6-git/fs/nfs/sysctl.c
> ===================================================================
> --- linux-2.6-git.orig/fs/nfs/sysctl.c 2007-01-20 07:20:10.000000000 +0100
> +++ linux-2.6-git/fs/nfs/sysctl.c 2007-01-20 07:20:12.000000000 +0100
> @@ -50,6 +50,14 @@ static ctl_table nfs_cb_sysctls[] = {
> .proc_handler = &proc_dointvec_jiffies,
> .strategy = &sysctl_jiffies,
> },
> + {
> + .ctl_name = CTL_UNNUMBERED,
> + .procname = "nfs_congestion_kb",
> + .data = &nfs_congestion_kb,
> + .maxlen = sizeof(nfs_congestion_kb),
> + .mode = 0644,
> + .proc_handler = &proc_dointvec,
> + },
> { .ctl_name = 0 }
> };
>
> Index: linux-2.6-git/include/linux/nfs_fs.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/nfs_fs.h 2007-01-20 07:20:10.000000000 +0100
> +++ linux-2.6-git/include/linux/nfs_fs.h 2007-01-20 07:20:12.000000000 +0100
> @@ -422,6 +422,7 @@ extern void nfs_complete_unlink(struct d
> /*
> * linux/fs/nfs/write.c
> */
> +extern int nfs_congestion_kb;
> extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
> extern int nfs_writepages(struct address_space *, struct writeback_control *);
> extern int nfs_flush_incompatible(struct file *file, struct page *page);
> Index: linux-2.6-git/fs/nfs/super.c
> ===================================================================
> --- linux-2.6-git.orig/fs/nfs/super.c 2007-01-20 07:20:23.000000000 +0100
> +++ linux-2.6-git/fs/nfs/super.c 2007-01-20 07:20:29.000000000 +0100
> @@ -150,10 +150,10 @@ int __init register_nfs_fs(void)
> if (ret < 0)
> goto error_0;
>
> -#ifdef CONFIG_NFS_V4
> ret = nfs_register_sysctl();
> if (ret < 0)
> goto error_1;
> +#ifdef CONFIG_NFS_V4
> ret = register_filesystem(&nfs4_fs_type);
> if (ret < 0)
> goto error_2;
> @@ -164,9 +164,9 @@ int __init register_nfs_fs(void)
> #ifdef CONFIG_NFS_V4
> error_2:
> nfs_unregister_sysctl();
> +#endif
> error_1:
> unregister_filesystem(&nfs_fs_type);
> -#endif
> error_0:
> return ret;
> }
The rest looks fine...
Trond
On Sat, 20 Jan 2007, Peter Zijlstra wrote:
> Subject: nfs: fix congestion control
I am not sure if its too valuable since I have limited experience with NFS
but it looks fine to me.
Acked-by: Christoph Lameter <[email protected]>
Hopefully the last version ;-)
---
Subject: nfs: fix congestion control
The current NFS client congestion logic is severly broken, it marks the backing
device congested during each nfs_writepages() call but doesn't mirror this in
nfs_writepage() which makes for deadlocks. Also it implements its own waitqueue.
Replace this by a more regular congestion implementation that puts a cap on the
number of active writeback pages and uses the bdi congestion waitqueue.
Also always use an interruptible wait since it makes sense to be able to
SIGKILL the process even for mounts without 'intr'.
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Trond Myklebust <[email protected]>
---
fs/nfs/super.c | 4 -
fs/nfs/sysctl.c | 8 +++
fs/nfs/write.c | 116 ++++++++++++++++++++++++++++----------------
include/linux/backing-dev.h | 1
include/linux/nfs_fs.h | 1
include/linux/nfs_fs_sb.h | 1
mm/backing-dev.c | 16 ++++++
7 files changed, 103 insertions(+), 44 deletions(-)
Index: linux-2.6-git/fs/nfs/write.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/write.c 2007-01-25 16:07:03.000000000 +0100
+++ linux-2.6-git/fs/nfs/write.c 2007-01-25 16:10:48.000000000 +0100
@@ -52,6 +52,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/writeback.h>
+#include <linux/swap.h>
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
@@ -78,7 +79,7 @@ static struct nfs_page * nfs_update_requ
struct page *,
unsigned int, unsigned int);
static void nfs_mark_request_dirty(struct nfs_page *req);
-static int nfs_wait_on_write_congestion(struct address_space *, int);
+static int nfs_wait_on_write_congestion(struct address_space *);
static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int);
static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
static const struct rpc_call_ops nfs_write_partial_ops;
@@ -89,8 +90,6 @@ static struct kmem_cache *nfs_wdata_cach
static mempool_t *nfs_wdata_mempool;
static mempool_t *nfs_commit_mempool;
-static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
-
struct nfs_write_data *nfs_commit_alloc(void)
{
struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
@@ -245,6 +244,39 @@ static int wb_priority(struct writeback_
}
/*
+ * NFS congestion control
+ */
+
+int nfs_congestion_kb;
+
+#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
+#define NFS_CONGESTION_OFF_THRESH \
+ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+
+static inline void nfs_set_page_writeback(struct page *page)
+{
+ if (!test_set_page_writeback(page)) {
+ struct inode *inode = page->mapping->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ if (atomic_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(&nfss->backing_dev_info, WRITE);
+ }
+}
+
+static inline void nfs_end_page_writeback(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ end_page_writeback(page);
+ if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+ clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+ congestion_end(WRITE);
+ }
+}
+
+/*
* Find an associated nfs write request, and prepare to flush it out
* Returns 1 if there was no write request, or if the request was
* already tagged by nfs_set_page_dirty.Returns 0 if the request
@@ -281,7 +313,7 @@ static int nfs_page_mark_flush(struct pa
spin_unlock(req_lock);
if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
nfs_mark_request_dirty(req);
- set_page_writeback(page);
+ nfs_set_page_writeback(page);
}
ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
nfs_unlock_request(req);
@@ -336,13 +368,8 @@ int nfs_writepage(struct page *page, str
return err;
}
-/*
- * Note: causes nfs_update_request() to block on the assumption
- * that the writeback is generated due to memory pressure.
- */
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
struct inode *inode = mapping->host;
int err;
@@ -351,11 +378,6 @@ int nfs_writepages(struct address_space
err = generic_writepages(mapping, wbc);
if (err)
return err;
- while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
- if (wbc->nonblocking)
- return 0;
- nfs_wait_on_write_congestion(mapping, 0);
- }
err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
if (err < 0)
goto out;
@@ -369,9 +391,6 @@ int nfs_writepages(struct address_space
if (err > 0)
err = 0;
out:
- clear_bit(BDI_write_congested, &bdi->state);
- wake_up_all(&nfs_write_congestion);
- congestion_end(WRITE);
return err;
}
@@ -401,7 +420,7 @@ static int nfs_inode_add_request(struct
}
/*
- * Insert a write request into an inode
+ * Remove a write request from an inode
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
@@ -583,10 +602,10 @@ static inline int nfs_scan_commit(struct
}
#endif
-static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
+static int nfs_wait_on_write_congestion(struct address_space *mapping)
{
+ struct inode *inode = mapping->host;
struct backing_dev_info *bdi = mapping->backing_dev_info;
- DEFINE_WAIT(wait);
int ret = 0;
might_sleep();
@@ -594,31 +613,23 @@ static int nfs_wait_on_write_congestion(
if (!bdi_write_congested(bdi))
return 0;
- nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
+ nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
- if (intr) {
- struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
+ do {
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
sigset_t oldset;
rpc_clnt_sigmask(clnt, &oldset);
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
- if (bdi_write_congested(bdi)) {
- if (signalled())
- ret = -ERESTARTSYS;
- else
- schedule();
- }
+ ret = congestion_wait_interruptible(WRITE, HZ/10);
rpc_clnt_sigunmask(clnt, &oldset);
- } else {
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
- if (bdi_write_congested(bdi))
- schedule();
- }
- finish_wait(&nfs_write_congestion, &wait);
+ if (ret == -ERESTARTSYS)
+ break;
+ ret = 0;
+ } while (bdi_write_congested(bdi));
+
return ret;
}
-
/*
* Try to update any existing write request, or create one if there is none.
* In order to match, the request's credentials must match those of
@@ -629,14 +640,15 @@ static int nfs_wait_on_write_congestion(
static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
struct page *page, unsigned int offset, unsigned int bytes)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *req, *new = NULL;
unsigned long rqend, end;
end = offset + bytes;
- if (nfs_wait_on_write_congestion(page->mapping, NFS_SERVER(inode)->flags & NFS_MOUNT_INTR))
+ if (nfs_wait_on_write_congestion(mapping))
return ERR_PTR(-ERESTARTSYS);
for (;;) {
/* Loop over all inode entries and see if we find
@@ -779,7 +791,7 @@ int nfs_updatepage(struct file *file, st
static void nfs_writepage_release(struct nfs_page *req)
{
- end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req->wb_page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (!PageError(req->wb_page)) {
@@ -1095,12 +1107,12 @@ static void nfs_writeback_done_full(stru
ClearPageUptodate(page);
SetPageError(page);
req->wb_context->error = task->tk_status;
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
nfs_inode_remove_request(req);
dprintk(", error = %d\n", task->tk_status);
goto next;
}
- end_page_writeback(page);
+ nfs_end_page_writeback(page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
@@ -1565,6 +1577,26 @@ int __init nfs_init_writepagecache(void)
if (nfs_commit_mempool == NULL)
return -ENOMEM;
+ /*
+ * NFS congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (nfs_congestion_kb > 256*1024)
+ nfs_congestion_kb = 256*1024;
+
return 0;
}
Index: linux-2.6-git/mm/backing-dev.c
===================================================================
--- linux-2.6-git.orig/mm/backing-dev.c 2007-01-25 16:07:03.000000000 +0100
+++ linux-2.6-git/mm/backing-dev.c 2007-01-25 16:07:12.000000000 +0100
@@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeou
}
EXPORT_SYMBOL(congestion_wait);
+long congestion_wait_interruptible(int rw, long timeout)
+{
+ long ret;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ ret = -ERESTARTSYS;
+ else
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait_interruptible);
+
/**
* congestion_end - wake up sleepers on a congested backing_dev_info
* @rw: READ or WRITE
Index: linux-2.6-git/include/linux/nfs_fs_sb.h
===================================================================
--- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-25 16:07:03.000000000 +0100
+++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-25 16:07:12.000000000 +0100
@@ -82,6 +82,7 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nfs_iostats * io_stats; /* I/O statistics */
struct backing_dev_info backing_dev_info;
+ atomic_t writeback; /* number of writeback pages */
int flags; /* various flags */
unsigned int caps; /* server capabilities */
unsigned int rsize; /* read size */
Index: linux-2.6-git/include/linux/backing-dev.h
===================================================================
--- linux-2.6-git.orig/include/linux/backing-dev.h 2007-01-25 16:07:03.000000000 +0100
+++ linux-2.6-git/include/linux/backing-dev.h 2007-01-25 16:07:12.000000000 +0100
@@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struc
void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
void set_bdi_congested(struct backing_dev_info *bdi, int rw);
long congestion_wait(int rw, long timeout);
+long congestion_wait_interruptible(int rw, long timeout);
void congestion_end(int rw);
#define bdi_cap_writeback_dirty(bdi) \
Index: linux-2.6-git/fs/nfs/sysctl.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/sysctl.c 2007-01-25 16:07:03.000000000 +0100
+++ linux-2.6-git/fs/nfs/sysctl.c 2007-01-25 16:07:12.000000000 +0100
@@ -50,6 +50,14 @@ static ctl_table nfs_cb_sysctls[] = {
.proc_handler = &proc_dointvec_jiffies,
.strategy = &sysctl_jiffies,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nfs_congestion_kb",
+ .data = &nfs_congestion_kb,
+ .maxlen = sizeof(nfs_congestion_kb),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
Index: linux-2.6-git/include/linux/nfs_fs.h
===================================================================
--- linux-2.6-git.orig/include/linux/nfs_fs.h 2007-01-25 16:07:03.000000000 +0100
+++ linux-2.6-git/include/linux/nfs_fs.h 2007-01-25 16:07:12.000000000 +0100
@@ -423,6 +423,7 @@ extern void nfs_complete_unlink(struct d
/*
* linux/fs/nfs/write.c
*/
+extern int nfs_congestion_kb;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
extern int nfs_flush_incompatible(struct file *file, struct page *page);
Index: linux-2.6-git/fs/nfs/super.c
===================================================================
--- linux-2.6-git.orig/fs/nfs/super.c 2007-01-25 16:07:03.000000000 +0100
+++ linux-2.6-git/fs/nfs/super.c 2007-01-25 16:07:12.000000000 +0100
@@ -150,10 +150,10 @@ int __init register_nfs_fs(void)
if (ret < 0)
goto error_0;
-#ifdef CONFIG_NFS_V4
ret = nfs_register_sysctl();
if (ret < 0)
goto error_1;
+#ifdef CONFIG_NFS_V4
ret = register_filesystem(&nfs4_fs_type);
if (ret < 0)
goto error_2;
@@ -164,9 +164,9 @@ int __init register_nfs_fs(void)
#ifdef CONFIG_NFS_V4
error_2:
nfs_unregister_sysctl();
+#endif
error_1:
unregister_filesystem(&nfs_fs_type);
-#endif
error_0:
return ret;
}
On Thu, 25 Jan 2007 16:32:28 +0100
Peter Zijlstra <[email protected]> wrote:
> +long congestion_wait_interruptible(int rw, long timeout)
> +{
> + long ret;
> + DEFINE_WAIT(wait);
> + wait_queue_head_t *wqh = &congestion_wqh[rw];
> +
> + prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
> + if (signal_pending(current))
> + ret = -ERESTARTSYS;
> + else
> + ret = io_schedule_timeout(timeout);
> + finish_wait(wqh, &wait);
> + return ret;
> +}
> +EXPORT_SYMBOL(congestion_wait_interruptible);
I think this can share code with congestion_wait()?
static long __congestion_wait(int rw, long timeout, int state)
{
long ret;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[rw];
prepare_to_wait(wqh, &wait, state);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
return ret;
}
long congestion_wait_interruptible(int rw, long timeout)
{
long ret = __congestion_wait(rw, timeout);
if (signal_pending(current))
ret = -ERESTARTSYS;
return ret;
}
it's only infinitesimally less efficient..
On Thu, 25 Jan 2007 16:32:28 +0100
Peter Zijlstra <[email protected]> wrote:
> Hopefully the last version ;-)
>
>
> ---
> Subject: nfs: fix congestion control
>
> The current NFS client congestion logic is severly broken, it marks the backing
> device congested during each nfs_writepages() call but doesn't mirror this in
> nfs_writepage() which makes for deadlocks. Also it implements its own waitqueue.
>
> Replace this by a more regular congestion implementation that puts a cap on the
> number of active writeback pages and uses the bdi congestion waitqueue.
>
> Also always use an interruptible wait since it makes sense to be able to
> SIGKILL the process even for mounts without 'intr'.
>
> ..
>
> --- linux-2.6-git.orig/include/linux/nfs_fs_sb.h 2007-01-25 16:07:03.000000000 +0100
> +++ linux-2.6-git/include/linux/nfs_fs_sb.h 2007-01-25 16:07:12.000000000 +0100
> @@ -82,6 +82,7 @@ struct nfs_server {
> struct rpc_clnt * client_acl; /* ACL RPC client handle */
> struct nfs_iostats * io_stats; /* I/O statistics */
> struct backing_dev_info backing_dev_info;
> + atomic_t writeback; /* number of writeback pages */
We're going to get in trouble with this sort of thing within a few years.
atomic_t is 32-bit. Put 16TB of memory under writeback and blam.
On Thu, 25 Jan 2007, Andrew Morton wrote:
> atomic_t is 32-bit. Put 16TB of memory under writeback and blam.
We have systems with 8TB main memory and are able to get to 16TB.
Better change it now.
On Thu, 25 Jan 2007 21:31:43 -0800 (PST)
Christoph Lameter <[email protected]> wrote:
> On Thu, 25 Jan 2007, Andrew Morton wrote:
>
> > atomic_t is 32-bit. Put 16TB of memory under writeback and blam.
>
> We have systems with 8TB main memory and are able to get to 16TB.
But I bet you don't use 4k pages on 'em ;)
> Better change it now.
yup.
On Thu, 25 Jan 2007, Andrew Morton wrote:
> > We have systems with 8TB main memory and are able to get to 16TB.
>
> But I bet you don't use 4k pages on 'em ;)
IA64 can be configured for 4k pagesize but yes 16k is the default. There
are plans to go much higher though. Plus there may be other reaons that
will force us to 4k pagesize on some configurations.
On Thu, 2007-01-25 at 21:02 -0800, Andrew Morton wrote:
> On Thu, 25 Jan 2007 16:32:28 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> > +long congestion_wait_interruptible(int rw, long timeout)
> > +{
> > + long ret;
> > + DEFINE_WAIT(wait);
> > + wait_queue_head_t *wqh = &congestion_wqh[rw];
> > +
> > + prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
> > + if (signal_pending(current))
> > + ret = -ERESTARTSYS;
> > + else
> > + ret = io_schedule_timeout(timeout);
> > + finish_wait(wqh, &wait);
> > + return ret;
> > +}
> > +EXPORT_SYMBOL(congestion_wait_interruptible);
>
> I think this can share code with congestion_wait()?
>
> static long __congestion_wait(int rw, long timeout, int state)
> {
> long ret;
> DEFINE_WAIT(wait);
> wait_queue_head_t *wqh = &congestion_wqh[rw];
>
> prepare_to_wait(wqh, &wait, state);
> ret = io_schedule_timeout(timeout);
> finish_wait(wqh, &wait);
> return ret;
> }
>
> long congestion_wait_interruptible(int rw, long timeout)
> {
> long ret = __congestion_wait(rw, timeout);
>
> if (signal_pending(current))
> ret = -ERESTARTSYS;
> return ret;
> }
>
> it's only infinitesimally less efficient..
All the other _interruptible functions check signal_pending before
calling schedule. Which seems to make sense since its called in a loop
anyway, and if the loop condition turns false when interrupted you might
as well just finish up instead of bailing out.
However if you'd rather see your version, who am I to object ;-)
On Thu, 2007-01-25 at 22:04 -0800, Andrew Morton wrote:
> On Thu, 25 Jan 2007 21:31:43 -0800 (PST)
> Christoph Lameter <[email protected]> wrote:
>
> > On Thu, 25 Jan 2007, Andrew Morton wrote:
> >
> > > atomic_t is 32-bit. Put 16TB of memory under writeback and blam.
> >
> > We have systems with 8TB main memory and are able to get to 16TB.
>
> But I bet you don't use 4k pages on 'em ;)
>
> > Better change it now.
>
> yup.
I can change to atomic_long_t but that would make this patch depend on
Mathieu Desnoyers' atomic.h patch series.
Do I send out a -v5 with this, or should I send an incremental patch
once that hits your tree?
On Fri, 2007-01-26 at 09:00 +0100, Peter Zijlstra wrote:
> On Thu, 2007-01-25 at 21:02 -0800, Andrew Morton wrote:
> > On Thu, 25 Jan 2007 16:32:28 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> > > +long congestion_wait_interruptible(int rw, long timeout)
> > > +{
> > > + long ret;
> > > + DEFINE_WAIT(wait);
> > > + wait_queue_head_t *wqh = &congestion_wqh[rw];
> > > +
> > > + prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
> > > + if (signal_pending(current))
> > > + ret = -ERESTARTSYS;
> > > + else
> > > + ret = io_schedule_timeout(timeout);
> > > + finish_wait(wqh, &wait);
> > > + return ret;
> > > +}
> > > +EXPORT_SYMBOL(congestion_wait_interruptible);
> >
> > I think this can share code with congestion_wait()?
> >
> > static long __congestion_wait(int rw, long timeout, int state)
> > {
> > long ret;
> > DEFINE_WAIT(wait);
> > wait_queue_head_t *wqh = &congestion_wqh[rw];
> >
> > prepare_to_wait(wqh, &wait, state);
> > ret = io_schedule_timeout(timeout);
> > finish_wait(wqh, &wait);
> > return ret;
> > }
> >
> > long congestion_wait_interruptible(int rw, long timeout)
> > {
> > long ret = __congestion_wait(rw, timeout);
> >
> > if (signal_pending(current))
> > ret = -ERESTARTSYS;
> > return ret;
> > }
> >
> > it's only infinitesimally less efficient..
>
> All the other _interruptible functions check signal_pending before
> calling schedule. Which seems to make sense since its called in a loop
> anyway, and if the loop condition turns false when interrupted you might
> as well just finish up instead of bailing out.
>
> However if you'd rather see your version, who am I to object ;-)
ok, first wake up, then reply to emails :-)
How about this:
long congestion_wait_interruptible(int rw, long timeout)
{
long ret;
if (signal_pending(current))
ret = -ERESTARTSYS;
else
ret = congestion_wait(rw, timeout);
return ret;
}
EXPORT_SYMBOL(congestion_wait_interruptible);
On Fri, 26 Jan 2007 09:03:37 +0100
Peter Zijlstra <[email protected]> wrote:
> On Thu, 2007-01-25 at 22:04 -0800, Andrew Morton wrote:
> > On Thu, 25 Jan 2007 21:31:43 -0800 (PST)
> > Christoph Lameter <[email protected]> wrote:
> >
> > > On Thu, 25 Jan 2007, Andrew Morton wrote:
> > >
> > > > atomic_t is 32-bit. Put 16TB of memory under writeback and blam.
> > >
> > > We have systems with 8TB main memory and are able to get to 16TB.
> >
> > But I bet you don't use 4k pages on 'em ;)
> >
> > > Better change it now.
> >
> > yup.
>
> I can change to atomic_long_t but that would make this patch depend on
> Mathieu Desnoyers' atomic.h patch series.
>
> Do I send out a -v5 with this, or should I send an incremental patch
> once that hits your tree?
A patch against next -mm would suit, thanks.
(But we already use atomic_long_t in generic code?)
On Fri, 2007-01-26 at 00:51 -0800, Andrew Morton wrote:
> A patch against next -mm would suit, thanks.
>
> (But we already use atomic_long_t in generic code?)
but there is currently no atomic_long_{inc,dec}_return, or any
atomic_long_*_return function for that matter.
Mathieu adds these missing functions.
Hi Christoph -- has anything come of resolving the NFS / OOM
concerns that Andrew Morton expressed concerning the patch? I'd be happy
to see some progress on getting this patch (i.e. the one you posted on
1/23) through.
Thanks,
-- Ethan
On Thu, 1 Feb 2007, Ethan Solomita wrote:
> Hi Christoph -- has anything come of resolving the NFS / OOM concerns that
> Andrew Morton expressed concerning the patch? I'd be happy to see some
> progress on getting this patch (i.e. the one you posted on 1/23) through.
Peter Zilkstra addressed the NFS issue. I will submit the patch again as
soon as the writeback code stabilizes a bit.
On Thu, 1 Feb 2007 18:16:05 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> On Thu, 1 Feb 2007, Ethan Solomita wrote:
>
> > Hi Christoph -- has anything come of resolving the NFS / OOM concerns that
> > Andrew Morton expressed concerning the patch? I'd be happy to see some
> > progress on getting this patch (i.e. the one you posted on 1/23) through.
>
> Peter Zilkstra addressed the NFS issue.
Did he? Are you yet in a position to confirm that?
On Thu, 1 Feb 2007, Andrew Morton wrote:
> > Peter Zilkstra addressed the NFS issue.
>
> Did he? Are you yet in a position to confirm that?
He provided a solution to fix the congestion issue in NFS. I thought
that is what you were looking for? That should make NFS behave more
like a block device right?
As I said before I think NFS is inherently unfixable given the layering of
a block device on top of the network stack (which consists of an unknown
number of additional intermediate layers). Cpuset writeback needs to work
in the same way as in a machine without cpusets. If fails then at least
let the cpuset behave as if we had a machine all on our own and fail in
both cases in the same way. Right now we create dangerous low memory
conditions due to high dirty ratios in a cpuset created by not having a
throttling method. The NFS problems also exist for non cpuset scenarios
and we have by and large been able to live with it so I think they are
lower priority. It seems that the basic problem is created by the dirty
ratios in a cpuset.
BTW the block layer also may be layered with raid and stuff and then we
have similar issues. There is no general way so far of handling these
situations except by twiddling around with min_free_kbytes praying 5 Hail
Mary's and trying again. Maybe we are able allocate all needed memory from
PF_MEMALLOC processes during reclaim and hopefully there is now enough
memory for these allocations and those that happen to occur during an
interrupt while we reclaim.
On Thursday February 1, [email protected] wrote:
> The NFS problems also exist for non cpuset scenarios
> and we have by and large been able to live with it so I think they are
> lower priority. It seems that the basic problem is created by the dirty
> ratios in a cpuset.
Some of our customers haven't been able to live with it. I'm really
glad this will soon be fixed in mainline as it means our somewhat less
elegant fix in SLES can go away :-)
>
> BTW the block layer also may be layered with raid and stuff and then we
> have similar issues. There is no general way so far of handling these
> situations except by twiddling around with min_free_kbytes praying 5 Hail
> Mary's and trying again.
md/raid doesn't cause any problems here. It preallocates enough to be
sure that it can always make forward progress. In general the entire
block layer from generic_make_request down can always successfully
write a block out in a reasonable amount of time without requiring
kmalloc to succeed (with obvious exceptions like loop and nbd which go
back up to a higher layer).
The network stack is of course a different (much harder) problem.
NeilBrown
On Fri, 2 Feb 2007, Neil Brown wrote:
> md/raid doesn't cause any problems here. It preallocates enough to be
> sure that it can always make forward progress. In general the entire
> block layer from generic_make_request down can always successfully
> write a block out in a reasonable amount of time without requiring
> kmalloc to succeed (with obvious exceptions like loop and nbd which go
> back up to a higher layer).
Hmmm... I wonder if that could be generalized. A device driver could make
a reservation by increasing min_free_kbytes? Additional drivers in a
chain could make additional reservations in such a way that enough
memory is set aside for the worst case?
> The network stack is of course a different (much harder) problem.
An NFS solution is possible without solving the network stack issue?
On Thursday February 1, [email protected] wrote:
>
> > The network stack is of course a different (much harder) problem.
>
> An NFS solution is possible without solving the network stack issue?
NFS is currently able to make more than max_dirty_ratio of memory
Dirty/Writeback without being effectively throttled. So it can use up
way more than it should and put pressure in the network stack.
If NFS were throttled like other block-based filesystems (which
Peter's patch should do), then there will normally be a lot more head
room and the network stack will normally be able to cope. There might
still be situations were you can run out of memory to the extent that
NFS cannot make forward progress, but they will be substantially less
likely (I think you need lots of TCP streams with slow consumers and
fast producers so that TCP is forced to use up it reserves).
The block layer guarantees not to run out of memory.
The network layer makes a best effort as long as nothing goes crazy.
NFS (currently) doesn't do quite enough to stop things going crazy.
At least, that is my understanding.
NeilBrown
On Thu, 1 Feb 2007 21:29:06 -0800 (PST) Christoph Lameter <[email protected]> wrote:
> On Thu, 1 Feb 2007, Andrew Morton wrote:
>
> > > Peter Zilkstra addressed the NFS issue.
> >
> > Did he? Are you yet in a position to confirm that?
>
> He provided a solution to fix the congestion issue in NFS. I thought
> that is what you were looking for? That should make NFS behave more
> like a block device right?
We hope so.
The cpuset-aware-writeback patches were explicitly written to hide the bug which
Peter's patches hopefully address. They hence remove our best way of confirming
that Peter's patches fix the problem which you've observed in a proper fashion.
Until we've confirmed that the NFS problem is nailed, I wouldn't want to merge
cpuset-aware-writeback. I'm hoping to be able to do that with fake-numa on x86-64
but haven't got onto it yet.
On Fri, 2007-01-26 at 00:51 -0800, Andrew Morton wrote:
> On Fri, 26 Jan 2007 09:03:37 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> > On Thu, 2007-01-25 at 22:04 -0800, Andrew Morton wrote:
> > > On Thu, 25 Jan 2007 21:31:43 -0800 (PST)
> > > Christoph Lameter <[email protected]> wrote:
> > >
> > > > On Thu, 25 Jan 2007, Andrew Morton wrote:
> > > >
> > > > > atomic_t is 32-bit. Put 16TB of memory under writeback and blam.
> > > >
> > > > We have systems with 8TB main memory and are able to get to 16TB.
> > >
> > > But I bet you don't use 4k pages on 'em ;)
> > >
> > > > Better change it now.
> > >
> > > yup.
> >
> > I can change to atomic_long_t but that would make this patch depend on
> > Mathieu Desnoyers' atomic.h patch series.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Do I send out a -v5 with this, or should I send an incremental patch
> > once that hits your tree?
>
> A patch against next -mm would suit, thanks.
As promised change the atomic_t in struct nfs_server to atomic_long_t in
anticipation of machines that can handle 8+TB of (4K) pages under writeback.
However I suspect other things in NFS will start going *bang* by then.
Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/nfs/write.c | 4 ++--
include/linux/nfs_fs_sb.h | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -224,7 +224,7 @@ static void nfs_set_page_writeback(struc
struct inode *inode = page->mapping->host;
struct nfs_server *nfss = NFS_SERVER(inode);
- if (atomic_inc_return(&nfss->writeback) >
+ if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH)
set_bdi_congested(&nfss->backing_dev_info, WRITE);
}
@@ -236,7 +236,7 @@ static void nfs_end_page_writeback(struc
struct nfs_server *nfss = NFS_SERVER(inode);
end_page_writeback(page);
- if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+ if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
clear_bdi_congested(&nfss->backing_dev_info, WRITE);
congestion_end(WRITE);
}
Index: linux-2.6/include/linux/nfs_fs_sb.h
===================================================================
--- linux-2.6.orig/include/linux/nfs_fs_sb.h
+++ linux-2.6/include/linux/nfs_fs_sb.h
@@ -82,7 +82,7 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nfs_iostats * io_stats; /* I/O statistics */
struct backing_dev_info backing_dev_info;
- atomic_t writeback; /* number of writeback pages */
+ atomic_long_t writeback; /* number of writeback pages */
int flags; /* various flags */
unsigned int caps; /* server capabilities */
unsigned int rsize; /* read size */