This implements the ability to remove the inodes in a particular slab
from the inode caches. In order to remove an inode we may have to write
out the pages of the inode, write out the inode itself, and remove the
dentries referring to the inode.
Provide generic functionality that can be used by filesystems that have
their own inode caches to also tie into the defragmentation functions
that are made available here.
FIXES NEEDED!
Note Miklos' comments on the patch at http://lkml.indiana.edu/hypermail/linux/kernel/0810.1/2003.html
The way we obtain a reference to an inode entry may be unreliable since inode
refcounting works in different ways. A reference to the superblock is also
necessary in order to be able to operate on the inodes.
Cc: Miklos Szeredi <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
---
fs/inode.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 6 ++
2 files changed, 129 insertions(+)
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c 2010-01-29 12:03:04.000000000 -0600
+++ linux-2.6/fs/inode.c 2010-01-29 12:03:25.000000000 -0600
@@ -1538,6 +1538,128 @@ static int __init set_ihash_entries(char
__setup("ihash_entries=", set_ihash_entries);
/*
+ * Obtain a refcount on a list of struct inodes pointed to by v. If the
+ * inode is in the process of being freed then zap the v[] entry so that
+ * we skip the freeing attempts later.
+ *
+ * This is a generic function for the ->get slab defrag callback.
+ */
+void *get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ int i;
+
+ spin_lock(&inode_lock);
+ for (i = 0; i < nr; i++) {
+ struct inode *inode = v[i];
+
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+ v[i] = NULL;
+ else
+ __iget(inode);
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(get_inodes);
+
+/*
+ * Function for filesystems that embed struct inode into their own
+ * fs inode. The offset is the offset of the struct inode in the fs inode.
+ *
+ * The function adds the offset to the pointers in v[] in order to make them
+ * point to struct inode. Then get_inodes() is used to take the refcounts.
+ * The converted v[] pointers can then also be passed to the kick() callback
+ * without further processing.
+ */
+void *fs_get_inodes(struct kmem_cache *s, int nr, void **v,
+ unsigned long offset)
+{
+ int i;
+
+ for (i = 0; i < nr; i++)
+ v[i] += offset;
+
+ return get_inodes(s, nr, v);
+}
+EXPORT_SYMBOL(fs_get_inodes);
+
+/*
+ * Generic callback function for slab defrag ->kick methods. Takes the
+ * array with inodes where we obtained refcounts using fs_get_inodes()
+ * or get_inodes() and tries to free them.
+ */
+void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private)
+{
+ struct inode *inode;
+ int i;
+ int abort = 0;
+ LIST_HEAD(freeable);
+ int active;
+
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+ if (!inode)
+ continue;
+
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ if (remove_inode_buffers(inode))
+ /*
+ * Should we really be doing this? Or
+ * limit the writeback here to only a few pages?
+ *
+ * Possibly an expensive operation but we
+ * cannot reclaim the inode if the pages
+ * are still present.
+ */
+ invalidate_mapping_pages(&inode->i_data,
+ 0, -1);
+ }
+
+ /* Invalidate children and dentry */
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *d = d_find_alias(inode);
+
+ if (d) {
+ d_invalidate(d);
+ dput(d);
+ }
+ }
+
+ if (inode->i_state & I_DIRTY)
+ write_inode_now(inode, 1);
+
+ d_prune_aliases(inode);
+ }
+
+ mutex_lock(&iprune_mutex);
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+
+ if (!inode)
+ /* inode is already being freed */
+ continue;
+
+ active = inode->i_sb->s_flags & MS_ACTIVE;
+ iput(inode);
+ if (abort || !active)
+ continue;
+
+ spin_lock(&inode_lock);
+ abort = !can_unuse(inode);
+
+ if (!abort) {
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ inodes_stat.nr_unused--;
+ }
+ spin_unlock(&inode_lock);
+ }
+ dispose_list(&freeable);
+ mutex_unlock(&iprune_mutex);
+}
+EXPORT_SYMBOL(kick_inodes);
+
+/*
* Initialize the waitqueues and inode hash table.
*/
void __init inode_init_early(void)
@@ -1576,6 +1698,7 @@ void __init inode_init(void)
SLAB_MEM_SPREAD),
init_once);
register_shrinker(&icache_shrinker);
+ kmem_cache_setup_defrag(inode_cachep, get_inodes, kick_inodes);
/* Hash may have been set up in inode_init_early */
if (!hashdist)
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h 2010-01-29 12:03:04.000000000 -0600
+++ linux-2.6/include/linux/fs.h 2010-01-29 12:03:25.000000000 -0600
@@ -2466,5 +2466,11 @@ int __init get_filesystem_list(char *buf
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE))
+/* Helper functions for inode defragmentation support in filesystems */
+extern void kick_inodes(struct kmem_cache *, int, void **, void *);
+extern void *get_inodes(struct kmem_cache *, int nr, void **);
+extern void *fs_get_inodes(struct kmem_cache *, int nr, void **,
+ unsigned long offset);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */
--
On Fri, Jan 29, 2010 at 02:49:42PM -0600, Christoph Lameter wrote:
> This implements the ability to remove inodes in a particular slab
> from inode caches. In order to remove an inode we may have to write out
> the pages of an inode, the inode itself and remove the dentries referring
> to the node.
>
> Provide generic functionality that can be used by filesystems that have
> their own inode caches to also tie into the defragmentation functions
> that are made available here.
>
> FIXES NEEDED!
>
> Note Miklos comments on the patch at http://lkml.indiana.edu/hypermail/linux/kernel/0810.1/2003.html
>
> The way we obtain a reference to a inode entry may be unreliable since inode
> refcounting works in different ways. Also a reference to the superblock is necessary
> in order to be able to operate on the inodes.
>
> Cc: Miklos Szeredi <[email protected]>
> Cc: Alexander Viro <[email protected]>
> Cc: Christoph Hellwig <[email protected]>
> Reviewed-by: Rik van Riel <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> ---
> fs/inode.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/fs.h | 6 ++
> 2 files changed, 129 insertions(+)
>
> Index: linux-2.6/fs/inode.c
> ===================================================================
> --- linux-2.6.orig/fs/inode.c 2010-01-29 12:03:04.000000000 -0600
> +++ linux-2.6/fs/inode.c 2010-01-29 12:03:25.000000000 -0600
> @@ -1538,6 +1538,128 @@ static int __init set_ihash_entries(char
> __setup("ihash_entries=", set_ihash_entries);
>
> /*
> + * Obtain a refcount on a list of struct inodes pointed to by v. If the
> + * inode is in the process of being freed then zap the v[] entry so that
> + * we skip the freeing attempts later.
> + *
> + * This is a generic function for the ->get slab defrag callback.
> + */
> +void *get_inodes(struct kmem_cache *s, int nr, void **v)
> +{
> + int i;
> +
> + spin_lock(&inode_lock);
> + for (i = 0; i < nr; i++) {
> + struct inode *inode = v[i];
> +
> + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
> + v[i] = NULL;
> + else
> + __iget(inode);
> + }
> + spin_unlock(&inode_lock);
> + return NULL;
> +}
> +EXPORT_SYMBOL(get_inodes);
How do you expect defrag to behave when the filesystem doesn't free
the inode immediately during dispose_list()? That is, the above code
only finds inodes that are still active at the VFS level but they
may still live for a significant period of time after the
dispose_list() call. This is a real issue now that XFS has combined
the VFS and XFS inodes into the same slab...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Fri, Jan 29, 2010 at 02:49:42PM -0600, Christoph Lameter wrote:
> This implements the ability to remove inodes in a particular slab
> from inode caches. In order to remove an inode we may have to write out
> the pages of an inode, the inode itself and remove the dentries referring
> to the node.
How often is this going to happen? Removing an inode is an incredibly
expensive operation. We have to eject all of the pages from the page
cache, and if those pages are getting a huge amount of use --- say,
those corresponding to some shared library like libc --- or which have
a huge number of pages that are actively getting used, the thrashing
that is going to result is going to be enormous.
There needs to be some kind of cost/benefit analysis done about
whether or not this is worth it. Does it make sense to potentially
force hundreds and hundreds of megabytes of pages to get thrashed in
and out just to recover a single 4k page? In some cases, maybe yes.
But in other cases, the results could be disastrous.
> +/*
> + * Generic callback function slab defrag ->kick methods. Takes the
> + * array with inodes where we obtained refcounts using fs_get_inodes()
> + * or get_inodes() and tries to free them.
> + */
> +void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private)
> +{
> + struct inode *inode;
> + int i;
> + int abort = 0;
> + LIST_HEAD(freeable);
> + int active;
> +
> + for (i = 0; i < nr; i++) {
> + inode = v[i];
> + if (!inode)
> + continue;
In some cases, it's going to be impossible to empty a particular slab
cache page. For example, there may be one inode which has pages
locked into memory, or which we may decide (once we add some
intelligence into this function) is really not worth ejecting. In
that case, there's no point dumping the rest of the inodes on that
particular slab page.
> + if (inode_has_buffers(inode) || inode->i_data.nrpages) {
> + if (remove_inode_buffers(inode))
> + /*
> + * Should we really be doing this? Or
> + * limit the writeback here to only a few pages?
> + *
> + * Possibly an expensive operation but we
> + * cannot reclaim the inode if the pages
> + * are still present.
> + */
> + invalidate_mapping_pages(&inode->i_data,
> + 0, -1);
> + }
I do not think this function does what you think it does....
"invalidate_mapping_pages() will not block on I/O activity, and it
will refuse to invalidate pages which are dirty, locked, under
writeback, or mapped into page tables."
So you need to force the data to be written *first*, then get the
pages removed from the page table, and only then, call
invalidate_mapping_pages(). Otherwise, this is just going to
pointlessly drop pages from the page cache and trashing the page
cache's effectiveness, without actually making it possible to drop a
particular inode if it is being used at all by any process.
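To make the ordering concrete, a more conservative per-inode pass might look
something like this (try_to_idle_inode() is a made-up name, not something in
the patch, and this assumes it sits in fs/inode.c next to kick_inodes()):

static int try_to_idle_inode(struct inode *inode)
{
	struct address_space *mapping = &inode->i_data;

	/* In use by someone else or mapped into page tables: leave it alone. */
	if (atomic_read(&inode->i_count) > 1 || mapping_mapped(mapping))
		return -EBUSY;

	/* Write the dirty inode and its data *before* invalidating anything. */
	if (inode->i_state & I_DIRTY)
		write_inode_now(inode, 1);
	filemap_write_and_wait(mapping);

	/* Only clean, unmapped pages will actually be dropped here. */
	invalidate_mapping_pages(mapping, 0, -1);

	return mapping->nrpages ? -EBUSY : 0;
}

Anything that still has pages left after that is not a defrag candidate, and
the rest of its slab page should be left alone.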
Presumably then the defrag code, since it was unable to free a
particular page, will go on pillaging and raping other inodes in the
inode cache, until it can actually create a hugepage. This is why you
really shouldn't start trying to trash an inode until you're
**really** sure it's possible to completely evict a 4k slab page of all
of its inodes.
- Ted
On Sat, Jan 30, 2010 at 02:26:23PM -0500, [email protected] wrote:
> On Fri, Jan 29, 2010 at 02:49:42PM -0600, Christoph Lameter wrote:
> > This implements the ability to remove inodes in a particular slab
> > from inode caches. In order to remove an inode we may have to write out
> > the pages of an inode, the inode itself and remove the dentries referring
> > to the node.
>
> How often is this going to happen? Removing an inode is an incredibly
The standard case is the classic updatedb. Lots of dentries/inodes cached
with no or little corresponding data cache.
> a huge number of pages that are actively getting used, the thrashing
> that is going to result is going to enormous.
I think the consensus so far is to try to avoid any inodes/dentries
which are dirty or used in any way.
I personally would prefer it to be more aggressive for memory offlining
for RAS purposes though, but just handling the unused cases is a
good first step.
> "invalidate_mapping_pages() will not block on I/O activity, and it
> will refuse to invalidate pages which are dirty, locked, under
> writeback, or mapped into page tables."
I think that was the point.
-Andi
--
[email protected] -- Speaking for myself only.
On Sun, Jan 31, 2010 at 09:34:09AM +0100, Andi Kleen wrote:
>
> The standard case is the classic updatedb. Lots of dentries/inodes cached
> with no or little corresponding data cache.
>
> > a huge number of pages that are actively getting used, the thrashing
> > that is going to result is going to enormous.
>
> I think the consensus so far is to try to avoid any inodes/dentries
> which are dirty or used in any way.
OK, but in that case, kick_inodes() should check to see if the inode
is in use in any way (i.e., has dentries open that will tie it down,
is open, has pages that are dirty or are mapped into some page table)
before attempting to invalidate any of its pages. The patch as
currently constituted doesn't do that. It will attempt to drop all
pages owned by that inode before checking for any of these conditions.
If I wanted that, I'd just do "echo 3 > /proc/sys/vm/drop_caches".
Worse yet, *after* it does this, it tries to write out the pages of the
inode. #1, this is pointless, since if the inode had any dirty pages,
they wouldn't have been invalidated, since it calls write_inode_now()
*after* calling invalidate_mapping_pages(), so the previously dirty
pages will still be mapped, and prevent the inode from being
flushed. #2, it interferes with delayed allocation and becomes
another writeback path, which means some dirty pages might get flushed
too early and it does this writeout without any of the congestion
handling code in the bdi writeback paths.
If the consensus is "avoid any inodes/dentries which are dirty or
used in any way", THIS PATCH DOESN'T DO THAT.
I'd go further, and say that it should avoid trying to flush any inode
if any of its sibling inodes on the slab cache are dirty or in use in
any way. Otherwise, you end up dropping pages from the page cache and
still not be able to do any defragmentation.
> I personally would prefer it to be more aggressive for memory offlining
> though for RAS purposes though, but just handling the unused cases is a
> good first step.
If you want something more aggressive, why not just "echo 3 >
/proc/sys/vm/drop_caches"? We have that already. If the answer is,
because it will trash the performance of the system, I'm concerned
this patch series will do this --- consistently.
If the concern is that the inode cache is filled with crap after an
updatedb run, then we should fix *that* problem; we need a way for
programs like updatedb to indicate that they are scanning lots of
inodes, and if the inode wasn't in cache before it was opened, it
should be placed on the short list to be dropped after it's closed.
Making that a new open(2) flag makes a lot of sense. Let's solve the
real problem here, if that's the concern.
But most of the time, I *want* the page cache filled, since it means
less time wasted accessing spinning rust platters. The last thing I
want is some helpful defragmentation kernel thread constantly
wandering through inode caches, and randomly calling
"invalidate_mapping_pages" on inodes since it thinks this will help
defrag huge pages. If I'm not running an Oracle database on my
laptop, but instead am concerned about battery lifetime, this is the
last thing I would want.
- Ted
On Sun, Jan 31, 2010 at 09:34:09AM +0100, Andi Kleen wrote:
> On Sat, Jan 30, 2010 at 02:26:23PM -0500, [email protected] wrote:
> > On Fri, Jan 29, 2010 at 02:49:42PM -0600, Christoph Lameter wrote:
> > > This implements the ability to remove inodes in a particular slab
> > > from inode caches. In order to remove an inode we may have to write out
> > > the pages of an inode, the inode itself and remove the dentries referring
> > > to the node.
> >
> > How often is this going to happen? Removing an inode is an incredibly
>
> The standard case is the classic updatedb. Lots of dentries/inodes cached
> with no or little corresponding data cache.
I don't believe that updatedb has anything to do with causing
internal inode/dentry slab fragmentation. In all my testing I rarely
see use-once filesystem traversals cause internal slab
fragmentation. This appears to be a result of use-once filesystem
traversal resulting in slab pages full of objects that have the same
locality of access. Hence each new slab page that traversal
allocates will contain objects that will be adjacent in the LRU.
Hence LRU-based reclaim is very likely to free all the objects on
each page in the same pass and as such no fragmentation will occur.
All the cases of inode/dentry slab fragmentation I have seen are a
result of access patterns that result in slab pages containing
objects with different temporal localities. It's when the access
pattern is sufficiently distributed throughout the working set we
get the "need to free 95% of the objects in the entire cache to free
a single page" type of reclaim behaviour.
AFAICT, the defrag patches as they stand don't really address the
fundamental problem of differing temporal locality inside a slab
page. It makes the assumption that "partial page == defrag
candidate" but there isn't any further consideration of when any of
the remaining objects were last accessed. I think that this really
does need to be taken into account, especially considering that the
allocator tries to fill partial pages with new objects before
allocating new pages and so the page under reclaim might contain
very recently allocated objects.
Someone in a previous discussion on this patch set (Nick? Hugh,
maybe? I can't find the reference right now) mentioned something
like this about the design of the force-reclaim operations. IIRC the
suggestion was that it may be better to track LRU-ness by per-slab
page rather than per-object so that reclaim can target the slab
pages that - on aggregate - had the oldest objects in it. I think
this has merit - prevention of internal fragmentation seems like a
better approach to me than to try to cure it after it is already
present....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Sun, Jan 31, 2010 at 04:02:07PM -0500, [email protected] wrote:
> OK, but in that case, the kick_inodes should check to see if the inode
> is in use in any way (i.e., has dentries open that will tie it down,
> is open, has pages that are dirty or are mapped into some page table)
> before attempting to invalidating any of its pages. The patch as
> currently constituted doesn't do that. It will attempt to drop all
> pages owned by that inode before checking for any of these conditions.
> If I wanted that, I'd just do "echo 3 > /proc/sys/vm/drop_caches".
Yes the patch is more aggressive and probably needs to be fixed.
On the other hand I would like to keep the option to be more aggressive
for soft page offlining where it's useful and nobody cares about
the cost.
> Worse yet, *after* it does this, it tries to write out the pages the
> inode. #1, this is pointless, since if the inode had any dirty pages,
> they wouldn't have been invalidated, since it calls write_inode_now()
Yes .... fought with all that for hwpoison too.
> I'd go further, and say that it should avoid trying to flush any inode
> if any of its sibling inodes on the slab cache are dirty or in use in
> any way. Otherwise, you end up dropping pages from the page cache and
> still not be able to do any defragmentation.
It depends -- for normal operation when running low on memory I agree
with you.
But for hwpoison soft offline purposes it's better to be more aggressive
-- even if that is inefficient -- but number one priority is to still
be correct of course.
>
> If the concern is that the inode cache is filled with crap after an
> updatedb run, then we should fix *that* problem; we need a way for
> programs like updatedb to indicate that they are scanning lots of
> inodes, and if the inode wasn't in cache before it was opened, it
> should be placed on the short list to be dropped after it's closed.
This has been tried many times and nobody came up with a good approach
to detect it automatically that doesn't have bad regressions in corner
cases.
Or the "let's add a updatedb" hint approach has the problem that
it won't cover a lot of other programs (as Linus always points out
these new interfaces rarely actually get used)
> But most of the time, I *want* the page cache filled, since it means
> less time wasted accessing spinning rust platters. The last thing I
> want is a some helpful defragmentation kernel thread constantly
> wandering through inode caches, and randomly calling
The problem this patch series tries to address right now is that
when you run out of memory it tends to blow away your dcache
because the dcache reclaim is just too stupid to actually free
memory without going through most of the LRU list.
So yes, it's all about improving caching. But yes, some
details also need to be improved.
-Andi
--
[email protected] -- Speaking for myself only.
On Mon, Feb 01, 2010 at 11:17:02AM +0100, Andi Kleen wrote:
>
> On the other hand I would like to keep the option to be more aggressive
> for soft page offlining where it's useful and nobody cares about
> the cost.
I'm not sure I understand what the goals are for "soft page
offlining". Can you say a bit more about that?
> Or the "let's add a updatedb" hint approach has the problem that
> it won't cover a lot of other programs (as Linus always points out
> these new interfaces rarely actually get used)
Sure, but the number of programs that scan all of the files in a file
system and would need this sort of hint are actually pretty small.
Updatedb and backup programs are pretty much it.
- Ted
On Mon, Feb 01, 2010 at 08:47:39AM -0500, [email protected] wrote:
> On Mon, Feb 01, 2010 at 11:17:02AM +0100, Andi Kleen wrote:
> >
> > On the other hand I would like to keep the option to be more aggressive
> > for soft page offlining where it's useful and nobody cares about
> > the cost.
>
> I'm not sure I understand what the goals are for "soft page
> offlining". Can you say a bit more about that?
Predictive offlining of memory pages based on corrected error counts.
This is a useful feature to get more out of lower quality (and even
high quality) DIMMs.
This is already implemented in mcelog+.33ish memory-failure.c, but right
now it's quite dumb when trying to free a dcache/inode page (it basically
always has to blow away everything)
Also this is just one use case for this. The other would be 2MB page
support at runtime by doing targeted freeing (would be especially
useful with the upcoming transparent huge pages). Probably others
too. I mostly quoted hwpoison because I happen to work on that.
>
> > Or the "let's add a updatedb" hint approach has the problem that
> > it won't cover a lot of other programs (as Linus always points out
> > these new interfaces rarely actually get used)
>
> Sure, but the number of programs that scan all of the files in a file
> system and would need this sort of hint are actually pretty small.
Not sure that's true.
Also consider a file server :- how would you get that hint from the
clients?
-Andi
--
[email protected] -- Speaking for myself only.
On Sat, 30 Jan 2010, Dave Chinner wrote:
> How do you expect defrag to behave when the filesystem doesn't free
> the inode immediately during dispose_list()? That is, the above code
> only finds inodes that are still active at the VFS level but they
> may still live for a significant period of time after the
> dispose_list() call. This is a real issue now that XFS has combined
> the VFS and XFS inodes into the same slab...
Then the freeing of the slab has to be delayed until the objects are
freed.
On Mon, 1 Feb 2010, Dave Chinner wrote:
> > The standard case is the classic updatedb. Lots of dentries/inodes cached
> > with no or little corresponding data cache.
>
> I don't believe that updatedb has anything to do with causing
> internal inode/dentry slab fragmentation. In all my testing I rarely
> see use-once filesystem traversals cause internal slab
> fragmentation. This appears to be a result of use-once filesystem
> traversal resulting in slab pages full of objects that have the same
> locality of access. Hence each new slab page that traversal
> allocates will contain objects that will be adjacent in the LRU.
> Hence LRU-based reclaim is very likely to free all the objects on
> each page in the same pass and as such no fragmentation will occur.
updatedb causes lots of partially allocated slab pages. While updatedb
runs other filesystem activities occur. And updatedb does not work in
straightforward linear fashion. dentries are cached and slowly expired,
etc. Updatedb may not cause fragmentation at the level that you observed
with some of the filesystem loads on large systems.
> All the cases of inode/dentry slab fragmentation I have seen are a
> result of access patterns that result in slab pages containing
> objects with different temporal localities. It's when the access
> pattern is sufficiently distributed throughout the working set we
> get the "need to free 95% of the objects in the entire cache to free
> a single page" type of reclaim behaviour.
There are also other factors at play, like different NUMA nodes and
concurrent processes. A strictly optimized HPC workload may be able to
eliminate other factors but that is not the case for typical workloads.
Access patterns are typically somewhat distributed.
> AFAICT, the defrag patches as they stand don't really address the
> fundamental problem of differing temporal locality inside a slab
> page. It makes the assumption that "partial page == defrag
> candidate" but there isn't any further consideration of when any of
> the remaing objects were last accessed. I think that this really
> does need to be taken into account, especially considering that the
> allocator tries to fill partial pages with new objects before
> allocating new pages and so the page under reclaim might contain
> very recently allocated objects.
Reclaim is only run if there is memory pressure. This means that lots of
reclaimable entities exist and therefore we can assume that many of these
have had a somewhat long lifetime. The allocator tries to fill partial
pages with new objects and then retires those pages to the full slab list.
Those are not subject to reclaim efforts covered here. A page under
reclaim is likely to contain many recently freed objects.
The remaining objects may have a long lifetime and a high usage pattern
but it is worth relocating them into other slabs if they prevent reclaim
of the page. Relocation occurs in this patchset via reclaim; the next
use then likely causes reallocation in a partially allocated slab.
This means that objects with a high usage count will tend to be aggregated
in full slabs that are no longer subject to targeted reclaim.
We could improve the situation by allowing the moving of objects (which
would avoid the reclaim and realloc) but that is complex and so needs to
be deferred to a second stage (same approach we went through with page
migration).
> Someone in a previous discussion on this patch set (Nick? Hugh,
> maybe? I can't find the reference right now) mentioned something
> like this about the design of the force-reclaim operations. IIRC the
> suggestion was that it may be better to track LRU-ness by per-slab
> page rather than per-object so that reclaim can target the slab
> pages that - on aggregate - had the oldest objects in it. I think
> this has merit - prevention of internal fragmentation seems like a
> better approach to me than to try to cure it after it is already
> present....
LRUness exists in terms of the list of partial slab pages. Frequently
allocated slabs are in the front of the queue and less used slabs are in
the rear. Defrag/reclaim occurs from the rear.
On Wed, Feb 03, 2010 at 09:31:49AM -0600, Christoph Lameter wrote:
> On Mon, 1 Feb 2010, Dave Chinner wrote:
>
> > > The standard case is the classic updatedb. Lots of dentries/inodes cached
> > > with no or little corresponding data cache.
> >
> > I don't believe that updatedb has anything to do with causing
> > internal inode/dentry slab fragmentation. In all my testing I rarely
> > see use-once filesystem traversals cause internal slab
> > fragmentation. This appears to be a result of use-once filesystem
> > traversal resulting in slab pages full of objects that have the same
> > locality of access. Hence each new slab page that traversal
> > allocates will contain objects that will be adjacent in the LRU.
> > Hence LRU-based reclaim is very likely to free all the objects on
> > each page in the same pass and as such no fragmentation will occur.
>
> updatedb causes lots of partially allocated slab pages. While updatedb
> runs other filesystem activities occur. And updatedb does not work in
> straightforward linear fashion. dentries are cached and slowly expired etc
> etc.
Sure, but my point was that updatedb hits lots of inodes only once,
and for those objects the order of caching and expiration is
exactly the same. Hence after reclaim of the updatedb dentries/inodes
the amount of fragmentation in the slab will be almost exactly the
same as it was before the updatedb run.
> > All the cases of inode/dentry slab fragmentation I have seen are a
> > result of access patterns that result in slab pages containing
> > objects with different temporal localities. It's when the access
> > pattern is sufficiently distributed throughout the working set we
> > get the "need to free 95% of the objects in the entire cache to free
> > a single page" type of reclaim behaviour.
>
> There are also other factors at play like the different NUMA node,
> concurrent processes.
Yes, those are just more factors in the access patterns being
"sufficiently distributed throughout the working set".
> > AFAICT, the defrag patches as they stand don't really address the
> > fundamental problem of differing temporal locality inside a slab
> > page. It makes the assumption that "partial page == defrag
> > candidate" but there isn't any further consideration of when any of
> > the remaing objects were last accessed. I think that this really
> > does need to be taken into account, especially considering that the
> > allocator tries to fill partial pages with new objects before
> > allocating new pages and so the page under reclaim might contain
> > very recently allocated objects.
>
> Reclaim is only run if there is memory pressure. This means that lots of
> reclaimable entities exist and therefore we can assume that many of these
> have had a somewhat long lifetime. The allocator tries to fill partial
> pages with new objects and then retires those pages to the full slab list.
> Those are not subject to reclaim efforts covered here. A page under
> reclaim is likely to contain many recently freed objects.
Not necessarily. It might contain only one recently reclaimed object,
but have several other hot objects in the page....
> The remaining objects may have a long lifetime and a high usage pattern
> but it is worth to relocate them into other slabs if they prevent reclaim
> of the page.
I completely disagree. If you have to trash all the cache hot
information related to the cached object in the process of
relocating it, then you've just screwed up application performance
and in a completely unpredictable manner. Admins will be tearing out
their hair trying to work out why their applications randomly slow
down....
> > Someone in a previous discussion on this patch set (Nick? Hugh,
> > maybe? I can't find the reference right now) mentioned something
> > like this about the design of the force-reclaim operations. IIRC the
> > suggestion was that it may be better to track LRU-ness by per-slab
> > page rather than per-object so that reclaim can target the slab
> > pages that - on aggregate - had the oldest objects in it. I think
> > this has merit - prevention of internal fragmentation seems like a
> > better approach to me than to try to cure it after it is already
> > present....
>
> LRUness exists in terms of the list of partial slab pages. Frequently
> allocated slabs are in the front of the queue and less used slabs are in
> the rear. Defrag/reclaim occurs from the rear.
You missed my point again. You're still talking about tracking pages
with no regard to the objects remaining in the pages. A page, full
or partial, is a candidate for object reclaim if none of the objects
on it are referenced and have not been referenced for some time.
You are currently relying on the existing LRU reclaim to move a slab
from full to partial to trigger defragmentation, but you ignore the
hotness of the rest of the objects on the page by trying to reclaim
the page that has been partial for the longest period of time.
What it comes down to is that the slab has two states for objects -
allocated and free - but what we really need here is 3 states -
allocated, unused and freed. We currently track unused objects
outside the slab in LRU lists and, IMO, that is the source of our
fragmentation problems because it has no knowledge of the spatial
layout of the slabs and the state of other objects in the page.
What I'm suggesting is that we ditch the external LRUs and track the
"unused" state inside the slab and then use that knowledge to decide
which pages to reclaim. e.g. slab_object_used() is called when the
first reference on an object is taken. slab_object_unused() is
called when the reference count goes to zero. The slab can then
track unused objects internally and when reclaim is needed can
select pages (full or partial) that only contain unused objects to
reclaim.
From there the existing reclaim algorithms could be used to reclaim
the objects. i.e. the shrinkers become a slab reclaim callout that
are passed a linked list of objects to reclaim, very similar to the
way __shrink_dcache_sb() and prune_icache() first build a list of
objects to reclaim, then work off that list of objects.
If the goal is to reduce fragmentation, then this seems like a
much better approach to me - it is inherently fragmentation
resistent and much more closely aligned to existing object reclaim
algorithms.
If the goal is random slab page shootdown (e.g. for hwpoison), then
it's a much more complex problem because you can't shoot down
active, referenced objects without revoke(). Hence I think the
two problem spaces should be kept separate as it's not obvious
that they can both be solved with the same mechanism....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Thu, Feb 04, 2010 at 11:34:10AM +1100, Dave Chinner wrote:
>
> I completely disagree. If you have to trash all the cache hot
> information related to the cached object in the process of
> relocating it, then you've just screwed up application performance
> and in a completely unpredictable manner. Admins will be tearing out
> their hair trying to work out why their applications randomly slow
> down....
...
> You missed my point again. You're still talking about tracking pages
> with no regard to the objects remaining in the pages. A page, full
> or partial, is a candidate for object reclaim if none of the objects
> on it are referenced and have not been referenced for some time.
>
> You are currently relying on the existing LRU reclaim to move a slab
> from full to partial to trigger defragmentation, but you ignore the
> hotness of the rest of the objects on the page by trying to reclaim
> the page that has been partial for the longest period of time.
Well said.
This is exactly what I was complaining about as well, but apparently I
wasn't understood the first time either. :-(
This *has* to be fixed, or this set of patches is going to completely
trash the overall system performance, by trashing the page cache.
> What it comes down to is that the slab has two states for objects -
> allocated and free - but what we really need here is 3 states -
> allocated, unused and freed. We currently track unused objects
> outside the slab in LRU lists and, IMO, that is the source of our
> fragmentation problems because it has no knowledge of the spatial
> layout of the slabs and the state of other objects in the page.
>
> What I'm suggesting is that we ditch the external LRUs and track the
> "unused" state inside the slab and then use that knowledge to decide
> which pages to reclaim.
Or maybe we need to have a way to track the LRU of the slab page as
a whole? Any time we touch an object on the slab page, we touch the
last updatedness of the slab as a whole.
It's actually more complicated than that, though. Even if no one has
touched a particular inode, if one of the inodes in the slab page is
pinned down because it is in use, there's no point in the
defragmenter trying to throw away valuable cached pages associated
with other inodes in the same slab page --- because of that
single pinned inode, YOU'RE NEVER GOING TO DEFRAG THAT PAGE.
And of course, if the inode is pinned down because it is opened and/or
mmaped, then its associated dcache entry can't be freed either, so
there's no point trying to trash all of its sibling dentries on the
same page as that dcache entry.
Randomly shooting down dcache and inode entries in the hopes of
creating coalescing free pages into hugepages is just not cool. If
you're that desperate, you might as well just do "echo 3 >
/proc/sys/vm/drop_caches". From my read of the algorithms, it's going
to be almost as destructive to system performance.
- Ted
On Wed, Feb 03, 2010 at 10:07:36PM -0500, [email protected] wrote:
> On Thu, Feb 04, 2010 at 11:34:10AM +1100, Dave Chinner wrote:
> > What it comes down to is that the slab has two states for objects -
> > allocated and free - but what we really need here is 3 states -
> > allocated, unused and freed. We currently track unused objects
> > outside the slab in LRU lists and, IMO, that is the source of our
> > fragmentation problems because it has no knowledge of the spatial
> > layout of the slabs and the state of other objects in the page.
> >
> > What I'm suggesting is that we ditch the external LRUs and track the
> > "unused" state inside the slab and then use that knowledge to decide
> > which pages to reclaim.
>
> Or maybe we need to have the way to track the LRU of the slab page as
> a whole? Any time we touch an object on the slab page, we touch the
> last updatedness of the slab as a hole.
Yes, that's pretty much what I have been trying to describe. ;)
(And, IIUC, what I think Nick has been trying to describe as well
when he's been saying we should "turn reclaim upside down".)
It seems to me to be pretty simple to track, too, if we define pages
for reclaim to only be those that are full of unused objects. i.e.
the pages have the two states:
- Active: some allocated and referenced object on the page
=> no need for LRU tracking of these
- Unused: all allocated objects on the page are not used
=> these pages are LRU tracked within the slab
A single referenced object is enough to change the state of the
page from Unused to Active, and when a page transitions from
Active to Unused it goes on the MRU end of the LRU queue.
Reclaim would then start with the oldest pages on the LRU....
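In other words the per-page bookkeeping is little more than a used-object
count and an LRU link, roughly like this (names invented, locking omitted
for brevity):

#include <linux/list.h>

struct slab_page_state {
	unsigned int		used;	/* objects with at least one reference */
	struct list_head	lru;	/* linked only while used == 0 */
};

/* Unused -> Active on the first referenced object in the page. */
static void slab_page_mark_used(struct slab_page_state *st)
{
	if (st->used++ == 0)
		list_del_init(&st->lru);
}

/*
 * Active -> Unused when the last referenced object is released; the
 * page goes onto the MRU end of the reclaim LRU.
 */
static void slab_page_mark_unused(struct slab_page_state *st,
				  struct list_head *unused_lru)
{
	if (--st->used == 0)
		list_add_tail(&st->lru, unused_lru);
}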
> It's actually more complicated than that, though. Even if no one has
> touched a particular inode, if one of the inode in the slab page is
> pinned down because it is in use,
A single active object like this would make the slab page Active, and
therefore not a candidate for reclaim. Also, we already reclaim
dentries before inodes because dentries pin inodes, so our
algorithms for reclaim already deal with these ordering issues for
us.
...
> And of course, if the inode is pinned down because it is opened and/or
> mmaped, then its associated dcache entry can't be freed either, so
> there's no point trying to trash all of its sibling dentries on the
> same page as that dcache entry.
Agreed - that's why I think preventing fragmentation caused by LRU
reclaim is best dealt with internally to slab where both object age
and locality can be taken into account.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Thu, Feb 04, 2010 at 02:39:11PM +1100, Dave Chinner wrote:
> On Wed, Feb 03, 2010 at 10:07:36PM -0500, [email protected] wrote:
> > On Thu, Feb 04, 2010 at 11:34:10AM +1100, Dave Chinner wrote:
> > > What it comes down to is that the slab has two states for objects -
> > > allocated and free - but what we really need here is 3 states -
> > > allocated, unused and freed. We currently track unused objects
> > > outside the slab in LRU lists and, IMO, that is the source of our
> > > fragmentation problems because it has no knowledge of the spatial
> > > layout of the slabs and the state of other objects in the page.
> > >
> > > What I'm suggesting is that we ditch the external LRUs and track the
> > > "unused" state inside the slab and then use that knowledge to decide
> > > which pages to reclaim.
> >
> > Or maybe we need to have the way to track the LRU of the slab page as
> > a whole? Any time we touch an object on the slab page, we touch the
> > last updatedness of the slab as a hole.
>
> Yes, that's pretty much what I have been trying to describe. ;)
> (And, IIUC, what I think Nick has been trying to describe as well
> when he's been saying we should "turn reclaim upside down".)
Well what I described is to do the slab pinning from the reclaim path
(rather than from slab calling into the subsystem). All slab locking is
basically "innermost", so you can pretty much poke the slab layer as
much as you like from the subsystem.
After that, LRU on slabs should be fairly easy. Slab could provide a
private per-slab pointer for example that is managed by the caller.
Subsystem can then call into slab to find the objects.
> It seems to me to be pretty simple to track, too, if we define pages
> for reclaim to only be those that are full of unused objects. i.e.
> the pages have the two states:
>
> - Active: some allocated and referenced object on the page
> => no need for LRU tracking of these
> - Unused: all allocated objects on the page are not used
> => these pages are LRU tracked within the slab
>
> A single referenced object is enough to change the state of the
> page from Unused to Active, and when page transitions from
> Active to Unused is goes on the MRU end of the LRU queue.
> Reclaim would then start with the oldest pages on the LRU....
>
> > It's actually more complicated than that, though. Even if no one has
> > touched a particular inode, if one of the inode in the slab page is
> > pinned down because it is in use,
>
> A single active object like this would the slab page Active, and
> therefore not a candidate for reclaim. Also, we already reclaim
> dentries before inodes because dentries pin inodes, so our
> algorithms for reclaim already deal with these ordering issues for
> us.
>
> ...
>
> > And of course, if the inode is pinned down because it is opened and/or
> > mmaped, then its associated dcache entry can't be freed either, so
> > there's no point trying to trash all of its sibling dentries on the
> > same page as that dcache entry.
>
> Agreed - that's why I think preventing fragemntation caused by LRU
> reclaim is best dealt with internally to slab where both object age
> and locality can be taken into account.
On Thu, 4 Feb 2010, Dave Chinner wrote:
> > Or maybe we need to have the way to track the LRU of the slab page as
> > a whole? Any time we touch an object on the slab page, we touch the
> > last updatedness of the slab as a hole.
>
> Yes, that's pretty much what I have been trying to describe. ;)
> (And, IIUC, what I think Nick has been trying to describe as well
> when he's been saying we should "turn reclaim upside down".)
>
> It seems to me to be pretty simple to track, too, if we define pages
> for reclaim to only be those that are full of unused objects. i.e.
> the pages have the two states:
>
> - Active: some allocated and referenced object on the page
> => no need for LRU tracking of these
> - Unused: all allocated objects on the page are not used
> => these pages are LRU tracked within the slab
>
> A single referenced object is enough to change the state of the
> page from Unused to Active, and when page transitions from
> Active to Unused is goes on the MRU end of the LRU queue.
> Reclaim would then start with the oldest pages on the LRU....
These are describing ways of reclaim that could be implemented by the fs
layer. The information about which item is "unused" or "referenced" is a
notion of the fs. The slab caches know only of two object states: Free or
allocated. LRU handling of slab pages is something entirely different
from the LRU of the inodes and dentries.
> > And of course, if the inode is pinned down because it is opened and/or
> > mmaped, then its associated dcache entry can't be freed either, so
> > there's no point trying to trash all of its sibling dentries on the
> > same page as that dcache entry.
>
> Agreed - that's why I think preventing fragemntation caused by LRU
> reclaim is best dealt with internally to slab where both object age
> and locality can be taken into account.
Object age is not known by the slab. Locality is only considered in terms
of hardware placement (NUMA nodes), not in relation to objects of other
caches (like inodes and dentries) or of the same cache.
If we want this then we may end up with a special allocator for the
filesystem.
You and I discussed adding a reference count to the objects of the slab
allocator a couple of years ago. Those explorations resulted in a much
more complicated and different allocator that is geared to the needs of
the filesystem for reclaim.
On Thu, 4 Feb 2010, Nick Piggin wrote:
> Well what I described is to do the slab pinning from the reclaim path
> (rather than from slab calling into the subsystem). All slab locking
> basically "innermost", so you can pretty much poke the slab layer as
> much as you like from the subsystem.
Reclaim/defrag is called from the reclaim path (of the VM). We could
enable a call from the fs reclaim code into the slab. But how would this
work?
> After that, LRU on slabs should be fairly easy. Slab could provide a
> private per-slab pointer for example that is managed by the caller.
> Subsystem can then call into slab to find the objects.
Sure, with some minor changes we could have a call that gives you the
list of neighboring objects in a slab while locking it. Then you can look
at the objects and decide which ones can be tossed, and then do another
call to release the objects and unlock the slab.
On Thu, Feb 04, 2010 at 10:59:26AM -0600, Christoph Lameter wrote:
> On Thu, 4 Feb 2010, Dave Chinner wrote:
>
> > > Or maybe we need to have the way to track the LRU of the slab page as
> > > a whole? Any time we touch an object on the slab page, we touch the
> > > last updatedness of the slab as a hole.
> >
> > Yes, that's pretty much what I have been trying to describe. ;)
> > (And, IIUC, what I think Nick has been trying to describe as well
> > when he's been saying we should "turn reclaim upside down".)
> >
> > It seems to me to be pretty simple to track, too, if we define pages
> > for reclaim to only be those that are full of unused objects. i.e.
> > the pages have the two states:
> >
> > - Active: some allocated and referenced object on the page
> > => no need for LRU tracking of these
> > - Unused: all allocated objects on the page are not used
> > => these pages are LRU tracked within the slab
> >
> > A single referenced object is enough to change the state of the
> > page from Unused to Active, and when page transitions from
> > Active to Unused is goes on the MRU end of the LRU queue.
> > Reclaim would then start with the oldest pages on the LRU....
>
> These are describing ways of reclaim that could be implemented by the fs
> layer. The information what item is "unused" or "referenced" is a notion
> of the fs. The slab caches know only of two object states: Free or
> allocated. LRU handling of slab pages is something entirely different
> from the LRU of the inodes and dentries.
Ah, perhaps you missed my previous email in the thread about adding
a third object state to the slab - i.e. an unused state? And an
interface (slab_object_used()/slab_object_unused()) to allow the
external uses to tell the slab about state changes of objects
on the first/last reference to the object. That would allow the
tracking as I stated above....
> > > And of course, if the inode is pinned down because it is opened and/or
> > > mmaped, then its associated dcache entry can't be freed either, so
> > > there's no point trying to trash all of its sibling dentries on the
> > > same page as that dcache entry.
> >
> > Agreed - that's why I think preventing fragemntation caused by LRU
> > reclaim is best dealt with internally to slab where both object age
> > and locality can be taken into account.
>
> Object age is not known by the slab.
See above.
> Locality is only considered in terms
> of hardware placement (Numa nodes) not in relationship to objects of other
> caches (like inodes and dentries) or the same caches.
And that is the deficiency we've been talking about correcting! i.e.
that object <-> page locality needs to be taken into account during
reclaim. Moving used/unused knowledge into the slab where page/object
locality is known is one way of doing that....
> If we want this then we may end up with a special allocator for the
> filesystem.
I don't see why a small extension to the slab code can't fix this...
> You and I have discussed a couple of years ago to add a reference count to
> the objects of the slab allocator. Those explorations resulted in am much
> more complicated and different allocator that is geared to the needs of
> the filesystem for reclaim.
And those discussions and explorations lead to the current defrag
code. After a couple of years, I don't think that the design we came
up with back then is the best way to approach the problem - it still
has many, many flaws. We need to explore different approaches
because none of the evolutionary approaches (i.e. tack something
on the side) appear to be sufficient.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Thu, Feb 04, 2010 at 11:13:15AM -0600, Christoph Lameter wrote:
> On Thu, 4 Feb 2010, Nick Piggin wrote:
>
> > Well what I described is to do the slab pinning from the reclaim path
> > (rather than from slab calling into the subsystem). All slab locking
> > basically "innermost", so you can pretty much poke the slab layer as
> > much as you like from the subsystem.
>
> Reclaim/defrag is called from the reclaim path (of the VM). We could
> enable a call from the fs reclaim code into the slab. But how would this
> work?
Well the exact details will depend, but I feel that things should
get easier because you pin the object (and therefore the slab) via
the normal and well tested reclaim paths.
So for example, for dcache, you will come in and take the normal
locks: dcache_lock, sb_lock, pin the sb, umount_lock. At which
point you have pinned dentries without changing any locking. So
then you can find the first entry on the LRU, and should be able
to then build a list of dentries on the same slab.
You still have the potential issue of now finding objects that would
not be visible by searching the LRU alone. However at least the
locking should be simplified.
> > After that, LRU on slabs should be fairly easy. Slab could provide a
> > private per-slab pointer for example that is managed by the caller.
> > Subsystem can then call into slab to find the objects.
>
> Sure with some minor changes we could have a call that is giving you the
> list of neighboring objects in a slab, while locking it? Then you can look
> at the objects and decide which ones can be tossed and then do another
> call to release the objects and unlock the slab.
Yep. Well... you may not even need to ask the slab layer to lock the
slab, provided that the subsystem is locking out changes. It could
possibly be helpful to have a call to lock and unlock the slab,
although usage of such an API would have to be very careful.
Thanks,
Nick
On Mon, 8 Feb 2010, Nick Piggin wrote:
> > > After that, LRU on slabs should be fairly easy. Slab could provide a
> > > private per-slab pointer for example that is managed by the caller.
> > > Subsystem can then call into slab to find the objects.
> >
> > Sure with some minor changes we could have a call that is giving you the
> > list of neighboring objects in a slab, while locking it? Then you can look
> > at the objects and decide which ones can be tossed and then do another
> > call to release the objects and unlock the slab.
>
> Yep. Well... you may not even need to ask slab layer to lock the
> slab. Provided that the subsystem is locking out changes. It could
> possibly be helpful to have a call to lock and unlock the slab,
> although usage of such an API would have to be very careful.
True, if you are holding a reference to an object in a slab page and
there is a guarantee that the object is not going away then the slab is already
effectively pinned.
So we just need a call that returns
1. The number of allocated objects in a slab page
2. The total possible number of objects
3. A list of pointers to the objects
?
Then reclaim could make a decision on whether these objects should be
reclaimed.
Such a function could actually be much less code than the current
patchset and would also be easy to do for SLAB/SLOB.
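For illustration, something along these lines (name and signature invented):

/*
 * Given any allocated object, report on the slab page containing it:
 * returns the number of allocated objects in that page, fills @objects
 * with pointers to them (at most @max), and stores the page's total
 * object capacity in @objects_per_slab.
 */
int kmem_cache_slab_objects(struct kmem_cache *s, void *object,
			    void **objects, int max,
			    int *objects_per_slab);

Reclaim could then compare the count against the capacity to decide whether
emptying that particular page is worth the effort.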
On Mon, Feb 08, 2010 at 06:37:53PM +1100, Nick Piggin wrote:
> On Thu, Feb 04, 2010 at 11:13:15AM -0600, Christoph Lameter wrote:
> > On Thu, 4 Feb 2010, Nick Piggin wrote:
> >
> > > Well what I described is to do the slab pinning from the reclaim path
> > > (rather than from slab calling into the subsystem). All slab locking
> > > basically "innermost", so you can pretty much poke the slab layer as
> > > much as you like from the subsystem.
> >
> > Reclaim/defrag is called from the reclaim path (of the VM). We could
> > enable a call from the fs reclaim code into the slab. But how would this
> > work?
>
> Well the exact details will depend, but I feel that things should
> get easier because you pin the object (and therefore the slab) via
> the normal and well tested reclaim paths.
>
> So for example, for dcache, you will come in and take the normal
> locks: dcache_lock, sb_lock, pin the sb, umount_lock. At which
> point you have pinned dentries without changing any locking. So
> then you can find the first entry on the LRU, and should be able
> to then build a list of dentries on the same slab.
>
> You still have the potential issue of now finding objects that would
> not be visible by searching the LRU alone. However at least the
> locking should be simplified.
Very true, but that leads us to the same problem of fragmented
caches because we empty unused objects off slabs that are still
pinned by hot objects and don't free the page. I agree that we can't
totally avoid this problem, but I still think that using an object
based LRU for reclaim has a fundamental mismatch with page based
reclaim that makes this problem worse than it could be.
FWIW, if we change the above to keeping a page based LRU in the slab
cache and the slab picks a page to reclaim, then the problem goes
mostly away, I think. We don't need to pin the slab to select and
prepare a page to reclaim - the cache only needs to be locked before
it starts reclaim. I think this has a much better chance of
reclaiming entire pages in situations where LRU based reclaim will
leave fragmentation.
i.e. instead of:
shrink_slab
-> external shrinker
-> lock cache
-> find reclaimable object
-> call into slab w/ object
-> return longer list of objects
-> reclaim objects
we do:
shrink_slab
-> internal shrinker
-> find oldest page and make object list
-> external shrinker
-> lock cache
-> reclaim objects
Cheers,
Dave.
--
Dave Chinner
[email protected]