LinuxLists.cc - [PATCH 1/9] add defrag flags

2005-09-26 20:10:26

Subject: [PATCH 1/9] add defrag flags

Index: 2.6.13-joel2/fs/buffer.c
===================================================================
--- 2.6.13-joel2.orig/fs/buffer.c 2005-09-13 14:54:13.%N -0500
+++ 2.6.13-joel2/fs/buffer.c 2005-09-13 15:02:01.%N -0500
@@ -1119,7 +1119,8 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;

- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USER);
if (!page)
return NULL;

@@ -3044,7 +3045,8 @@ static void recalc_bh_state(void)

struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
preempt_disable();
__get_cpu_var(bh_accounting).nr++;
Index: 2.6.13-joel2/fs/dcache.c
===================================================================
--- 2.6.13-joel2.orig/fs/dcache.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/dcache.c 2005-09-13 15:02:01.%N -0500
@@ -721,7 +721,7 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;

- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;

Index: 2.6.13-joel2/fs/ext2/super.c
===================================================================
--- 2.6.13-joel2.orig/fs/ext2/super.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ext2/super.c 2005-09-13 15:02:01.%N -0500
@@ -138,7 +138,8 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep,
+ SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
Index: 2.6.13-joel2/fs/ext3/super.c
===================================================================
--- 2.6.13-joel2.orig/fs/ext3/super.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ext3/super.c 2005-09-13 15:02:01.%N -0500
@@ -440,7 +440,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;

- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
Index: 2.6.13-joel2/fs/ntfs/inode.c
===================================================================
--- 2.6.13-joel2.orig/fs/ntfs/inode.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ntfs/inode.c 2005-09-13 15:05:53.%N -0500
@@ -317,7 +317,7 @@ struct inode *ntfs_alloc_big_inode(struc
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -342,7 +342,7 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
Index: 2.6.13-joel2/include/linux/gfp.h
===================================================================
--- 2.6.13-joel2.orig/include/linux/gfp.h 2005-09-13 14:54:17.%N -0500
+++ 2.6.13-joel2/include/linux/gfp.h 2005-09-13 15:02:01.%N -0500
@@ -41,21 +41,30 @@ struct vm_area_struct;
#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */
#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */

-#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
+/* Allocation type modifiers, group together if possible
+ * __GPF_USER: Allocation for user page or a buffer page
+ * __GFP_KERNRCLM: Short-lived or reclaimable kernel allocation
+ */
+#define __GFP_USER 0x40000u /* Kernel page that is easily reclaimable */
+#define __GFP_KERNRCLM 0x80000u /* User is a userspace user */
+#define __GFP_RCLM_BITS (__GFP_USER|__GFP_KERNRCLM)
+
+#define __GFP_BITS_SHIFT 21 /* Room for 20 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)

/* if you forget to add the bitmask here kernel will crash, period */
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- __GFP_NOMEMALLOC|__GFP_NORECLAIM)
+ __GFP_NOMEMALLOC|__GFP_KERNRCLM|__GFP_USER)

#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USER)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | \
+ __GFP_USER)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */

Attachments:

1_add_defrag_flags (5.13 kB)

2005-09-27 00:16:47

by Kyle Moffett

[permalink] [raw]

Subject: Re: [PATCH 1/9] add defrag flags

On Sep 26, 2005, at 16:03:30, Joel Schopp wrote:
> The flags are:
> __GFP_USER, which corresponds to easily reclaimable pages
> __GFP_KERNRCLM, which corresponds to userspace pages

Uhh, call me crazy, but don't those flags look a little backwards to
you? Maybe it's just me, but wouldn't it make sense to expect
__GFP_USER to be a userspace allocation and __GFP_KERNRCLM to be an
easily reclaimable kernel page?

Cheers,
Kyle Moffett

-----BEGIN GEEK CODE BLOCK-----
Version: 3.12
GCM/CS/IT/U d- s++: a18 C++++>$ UB/L/X/*++++(+)>$ P+++(++++)>$ L++++(+
++) E W++(+) N+++(++) o? K? w--- O? M++ V? PS+() PE+(-) Y+ PGP+++ t+(+
++) 5 X R? tv-(--) b++++(++) DI+ D+ G e->++++$ h!*()>++$ r !y?(-)
------END GEEK CODE BLOCK------

2005-09-27 00:24:31

by Dave Hansen

[permalink] [raw]

Subject: Re: [PATCH 1/9] add defrag flags

On Mon, 2005-09-26 at 20:16 -0400, Kyle Moffett wrote:
> On Sep 26, 2005, at 16:03:30, Joel Schopp wrote:
> > The flags are:
> > __GFP_USER, which corresponds to easily reclaimable pages
> > __GFP_KERNRCLM, which corresponds to userspace pages
>
> Uhh, call me crazy, but don't those flags look a little backwards to
> you? Maybe it's just me, but wouldn't it make sense to expect
> __GFP_USER to be a userspace allocation and __GFP_KERNRCLM to be an
> easily reclaimable kernel page?

I think Joel simply made an error in his description.

__GFP_KERNRCLM corresponds to pages which are kernel-allocated, but have
some chance of being reclaimed at some point. Basically, they're things
that will get freed back under memory pressure. This can be direct, as
with the dcache and its slab shrinker, or more indirect as for control
structures like buffer_heads that get reclaimed after _other_ things are
freed.

-- Dave

2005-09-27 00:44:29

by Kyle Moffett

[permalink] [raw]

Subject: Re: [PATCH 1/9] add defrag flags

On Sep 26, 2005, at 20:24:08, Dave Hansen wrote:
> On Mon, 2005-09-26 at 20:16 -0400, Kyle Moffett wrote:
>> Uhh, call me crazy, but don't those flags look a little backwards
>> to you? Maybe it's just me, but wouldn't it make sense to expect
>> __GFP_USER to be a userspace allocation and __GFP_KERNRCLM to be
>> an easily reclaimable kernel page?
>
> I think Joel simply made an error in his description.
>
> __GFP_KERNRCLM corresponds to pages which are kernel-allocated, but
> have some chance of being reclaimed at some point. Basically,
> they're things that will get freed back under memory pressure.
> This can be direct, as with the dcache and its slab shrinker, or
> more indirect as for control structures like buffer_heads that get
> reclaimed after _other_ things are freed.

Ok, well he should fix both that description and the comment in his
patches, and make sure that the code actually matches what it says:

> +#define __GFP_USER 0x40000u /* Kernel page that is easily
> reclaimable */
> +#define __GFP_KERNRCLM 0x80000u /* User is a userspace user */

Cheers,
Kyle Moffett

--
Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are, by
definition, not smart enough to debug it.
-- Brian Kernighan

2005-09-27 05:45:10

by Paul Jackson

[permalink] [raw]

Subject: Re: [PATCH 1/9] add defrag flags

Dave wrote:
> I think Joel simply made an error in his description.

Looks like he made the same mistake in the actual code comments:

+/* Allocation type modifiers, group together if possible
+ * __GPF_USER: Allocation for user page or a buffer page
+ * __GFP_KERNRCLM: Short-lived or reclaimable kernel allocation
+ */
+#define __GFP_USER 0x40000u /* Kernel page that is easily reclaimable */
+#define __GFP_KERNRCLM 0x80000u /* User is a userspace user */

I'd guess you meant to write more like the following:

#define __GFP_USER 0x40000u /* Page for user address space */
#define __GFP_KERNRCLM 0x80000u /* Kernel page that is easily reclaimable */

And the block comment seems to needlessly repeat the inline comments,
add a dubious claim, and omit the interesting stuff ... In other words:

Does it actually matter if these two bits are grouped, or not? I
suspect that some of your other code, such as shifting the gfpmask by
RCLM_SHIFT bits, _requires_ that these two bits be adjacent. So the
"if possible" in the comment above is misleading.

And I suspect that gfp.h should contain the RCLM_SHIFT define, or
at least mention in comment that RCLM_SHIFT depends on the position
of the above two __GFP_* bits.

And I don't see any mention in the comments in gfp.h that these
two bits, in tandem, have an additional meaning - both bits off
means, I guess, not reclaimable, well at least not easily.

My HARDWALL patch appears to already be in Linus's kernel, so you
probably also need to do a global substitute of all instances in
the kernel of __GFP_HARDWALL, replacing it with __GFP_USER. Here
is the list of files I see affected, with a count of the number of
__GFP_HARDWALL strings in each:

include/linux/gfp.h:4
kernel/cpuset.c:6
mm/page_alloc.c:2
mm/vmscan.c:4

The comment in the next line looks like it needs to be changed to match
the code change:

+#define __GFP_BITS_SHIFT 21 /* Room for 20 __GFP_FOO bits */

On the other hand, why did you change __GFP_BITS_SHIFT? Isn't 20
enough - just enough?

Why was the flag change in fs/buffer.c:grow_dev_page() to add the
__GFP_USER bit, not to add the __GFP_KERNRCLM bit? I don't know that
code - perhaps the answer is simply that the resulting page ends up in
user space.

Aha - I just read one of the comments above that I cut+pasted.
It says that __GFP_USER means user *OR* buffer page. That certainly
explains the fs/buffer.c code using __GFP_USER. But it causes me to
wonder if we can equate __GFP_USER with __GFP_HARDWALL. I'm reluctant,
but more on principle than concrete experience, to modify the meaning
of hardwall cpusets to constrain both user address space pages *AND*
buffer pages. How open would you be to making buffers __GFP_KERNRCLM
instead of __GFP_USER?

If you have good reason to keep __GFP_USER meaning either user or buffer,
then perhaps the name __GFP_USER is misleading.

What sort of performance claims can you make for this change? How does
it impact kernel text size? Could we see a diffstat for the entire
patchset? Under what sort of loads or conditions would you expect
this patchset to do more harm than good?

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-09-27 13:34:57

by Mel Gorman

[permalink] [raw]

Subject: Re: [PATCH 1/9] add defrag flags

On Mon, 26 Sep 2005, Paul Jackson wrote:

> Dave wrote:
> > I think Joel simply made an error in his description.
>
> Looks like he made the same mistake in the actual code comments:
>
> +/* Allocation type modifiers, group together if possible
> + * __GPF_USER: Allocation for user page or a buffer page
> + * __GFP_KERNRCLM: Short-lived or reclaimable kernel allocation
> + */
> +#define __GFP_USER 0x40000u /* Kernel page that is easily reclaimable */
> +#define __GFP_KERNRCLM 0x80000u /* User is a userspace user */
>
> I'd guess you meant to write more like the following:
>
> #define __GFP_USER 0x40000u /* Page for user address space */
> #define __GFP_KERNRCLM 0x80000u /* Kernel page that is easily reclaimable */
>

yep

> And the block comment seems to needlessly repeat the inline comments,
> add a dubious claim, and omit the interesting stuff ... In other words:
>
> Does it actually matter if these two bits are grouped, or not? I
> suspect that some of your other code, such as shifting the gfpmask by
> RCLM_SHIFT bits, _requires_ that these two bits be adjacent. So the
> "if possible" in the comment above is misleading.
>

The "if possible" must be misleading. The bits have to be beside each other,
as assumptions are made later in the code about this. The "group together"
comment refers to the pages that are allocated with gfp flags that
include __GFP_USER or __GFP_KERNRCLM. Those pages should be "grouped
together if possible". The bits must be grouped that way.

> And I suspect that gfp.h should contain the RCLM_SHIFT define, or
> at least mention in comment that RCLM_SHIFT depends on the position
> of the above two __GFP_* bits.
>
> And I don't see any mention in the comments in gfp.h that these
> two bits, in tandem, have an additional meaning - both bits off
> means, I guess, not reclaimable, well at least not easily.
>
> My HARDWALL patch appears to already be in Linus's kernel, so you
> probably also need to do a global substitute of all instances in
> the kernel of __GFP_HARDWALL, replacing it with __GFP_USER.

I am not sure if that is a good idea as I will explain later.

> Here
> is the list of files I see affected, with a count of the number of
> __GFP_HARDWALL strings in each:
>
> include/linux/gfp.h:4
> kernel/cpuset.c:6
> mm/page_alloc.c:2
> mm/vmscan.c:4
>
> The comment in the next line looks like it needs to be changed to match
> the code change:
>
> +#define __GFP_BITS_SHIFT 21 /* Room for 20 __GFP_FOO bits */
>
> On the other hand, why did you change __GFP_BITS_SHIFT? Isn't 20
> enough - just enough?
>

Yep, you're right, it is just enough.

> Why was the flag change in fs/buffer.c:grow_dev_page() to add the
> __GFP_USER bit, not to add the __GFP_KERNRCLM bit?

Because these are buffer pages that get reclaimed very quickly. The
KERNRCLM pages are generally slab pages. These can be reclaimed by reaping
certain slab patches but it's a very hit and miss behavior. Trust me, the
whole scheme works better if buffer pages are treated as __GFP_USER pages,
not __GFP_KERNRCLM.

> Aha - I just read one of the comments above that I cut+pasted.
> It says that __GFP_USER means user *OR* buffer page. That certainly
> explains the fs/buffer.c code using __GFP_USER. But it causes me to
> wonder if we can equate __GFP_USER with __GFP_HARDWALL.

I don't think it should be.

> I'm reluctant,
> but more on principle than concrete experience, to modify the meaning
> of hardwall cpusets to constrain both user address space pages *AND*
> buffer pages. How open would you be to making buffers __GFP_KERNRCLM
> instead of __GFP_USER?
>

Not very open at all. I would prefer to have an additional flag than do
that. The anti-fragmentation does not work anywhere near as well when
buffer pages are KERNRCLM pages. It's because there are large number of
pages that are easily reclaimable by cleaning the buffers and discarding
them. If they were mixed with slab pages, it would not be very effective
when we try to make a large allocation.

> If you have good reason to keep __GFP_USER meaning either user or buffer,
> then perhaps the name __GFP_USER is misleading.
>

Possibly but we are stuck for terminology here. It's hard to think of a
good term that reflects the intention.

> What sort of performance claims can you make for this change?

I don't have figures for this patchset. The figures I do have are for
another version that I'm currently trying to merge with Joel's. In my own
set, there are no performance regressions or gains.

> How does
> it impact kernel text size?

Again, based on my own patchset but the figures should be essentially the
same as Joel's;

linux-2.6.13-clean/vmlinux
text data bss dec hex filename
2992829 686212 212708 3891749 3b6225 linux-2.6.13-clean/vmlinux

linux-2.6.13-mbuddy-v14/vmlinux
text data bss dec hex filename
2995335 687852 212708 3895895 3b7257 linux-2.6.13-mbuddy-v14/vmlinux

Is that what you are looking for?

> Could we see a diffstat for the entire
> patchset?

Don't have this at the moment

> Under what sort of loads or conditions would you expect
> this patchset to do more harm than good?
>

I cannot think of a case where it does more harm. At worst, it does not
help fragmentation. For that to happen, the system needs to be very
heavily loaded under heavy memory pressure for a long time with
RCLM_NORCLM pages being retained for very long periods of time even after
loads ease. In this case, fallbacks will eventually fragment memory.

A second case where it could hurt is in allocator scalability over a large
number of CPUs as there are now additional per-cpu lists. I am having
trouble thinking of a test case that would trigger this case though.
Someone used to dealing with large numbers of processors might be able to
make a suggestion.

--
Mel Gorman
Part-time Phd Student Java Applications Developer
University of Limerick IBM Dublin Software Lab

2005-09-27 16:26:24

by Paul Jackson

[permalink] [raw]

Subject: Re: [Lhms-devel] Re: [PATCH 1/9] add defrag flags

Mel wrote:
> > If you have good reason to keep __GFP_USER meaning either user or buffer,
> > then perhaps the name __GFP_USER is misleading.
> >
>
> Possibly but we are stuck for terminology here. It's hard to think of a
> good term that reflects the intention.

You make several good points. How about:
* Rename __GFP_USER to __GFP_EASYRCLM
* Shift the two __GFP_*RCLM flags up to 0x80000u and 0x100000u
* Leave __GFP_BITS_SHIFT at the 21 in your patch (and fix its comment)
(or should we go up the next nibble, to 24?).

This results in the two key GFP defines being:

#define __GFP_EASYRCLM 0x80000u /* Easily reclaimed user or buffer page */
#define __GFP_KERNRCLM 0x100000u /* Reclaimable kernel page */

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-09-27 18:38:30

by Joel Schopp

[permalink] [raw]

Subject: Re: [PATCH 1/9] add defrag flags

Index: 2.6.13-joel2/fs/buffer.c
===================================================================
--- 2.6.13-joel2.orig/fs/buffer.c 2005-09-13 14:54:13.%N -0500
+++ 2.6.13-joel2/fs/buffer.c 2005-09-13 15:02:01.%N -0500
@@ -1119,7 +1119,8 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;

- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USER);
if (!page)
return NULL;

@@ -3044,7 +3045,8 @@ static void recalc_bh_state(void)

struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
preempt_disable();
__get_cpu_var(bh_accounting).nr++;
Index: 2.6.13-joel2/fs/dcache.c
===================================================================
--- 2.6.13-joel2.orig/fs/dcache.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/dcache.c 2005-09-13 15:02:01.%N -0500
@@ -721,7 +721,7 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;

- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;

Index: 2.6.13-joel2/fs/ext2/super.c
===================================================================
--- 2.6.13-joel2.orig/fs/ext2/super.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ext2/super.c 2005-09-13 15:02:01.%N -0500
@@ -138,7 +138,8 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep,
+ SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
Index: 2.6.13-joel2/fs/ext3/super.c
===================================================================
--- 2.6.13-joel2.orig/fs/ext3/super.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ext3/super.c 2005-09-13 15:02:01.%N -0500
@@ -440,7 +440,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;

- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
Index: 2.6.13-joel2/fs/ntfs/inode.c
===================================================================
--- 2.6.13-joel2.orig/fs/ntfs/inode.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ntfs/inode.c 2005-09-13 15:05:53.%N -0500
@@ -317,7 +317,7 @@ struct inode *ntfs_alloc_big_inode(struc
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -342,7 +342,7 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
Index: 2.6.13-joel2/include/linux/gfp.h
===================================================================
--- 2.6.13-joel2.orig/include/linux/gfp.h 2005-09-13 14:54:17.%N -0500
+++ 2.6.13-joel2/include/linux/gfp.h 2005-09-27 12:53:13.%N -0500
@@ -41,6 +41,16 @@ struct vm_area_struct;
#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */
#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */

+/* Allocation type modifiers, these are required to be adjacent
+ * __GPF_USER: Allocation for user page or a buffer page
+ * __GFP_KERNRCLM: Short-lived or reclaimable kernel allocation
+ * Both bits off: Kernel non-reclaimable or very hard to reclaim
+ * RCLM_SHIFT (defined elsewhere) depends on the location of these bits
+ */
+#define __GFP_USER 0x40000u /* User is a userspace user */
+#define __GFP_KERNRCLM 0x80000u /* Kernel page that is easily reclaimable */
+#define __GFP_RCLM_BITS (__GFP_USER|__GFP_KERNRCLM)
+
#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)

@@ -48,14 +58,15 @@ struct vm_area_struct;
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- __GFP_NOMEMALLOC|__GFP_NORECLAIM)
+ __GFP_NOMEMALLOC|__GFP_KERNRCLM|__GFP_USER)

#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USER)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | \
+ __GFP_USER)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */

Attachments:

1_add_defrag_flags (5.19 kB)

2005-09-27 19:31:25

by Paul Jackson

[permalink] [raw]

Subject: Re: [PATCH 1/9] add defrag flags

Joel wrote:
> We may not be able to use the same flag after all due to our need to mark buffer
> pages as user.

Agreed - we have separate flags. I want exactly user address space
pages. You want really easy to reclaim pages. You have good
performance justifications for your choice. I have just "design
purity", so if for some reason there was a dire shortage of GFP bits,
I suspect it is I who should give, not you.

> > +#define __GFP_BITS_SHIFT 21 /* Room for 20 __GFP_FOO bits */
>
> Yep.

Once this is merged with current Linux, which already has GFP_HARDWALL,
I presume you will be back up to 21 bits, code and comment.

As I noted in another message the "USER" and the comment in:

#define __GFP_USER 0x40000u /* User is a userspace user */

are a bit misleading now. Perhaps GFP_EASYRCLM?

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-09-27 21:00:48

by Joel Schopp

[permalink] [raw]

Subject: Re: [Lhms-devel] Re: [PATCH 1/9] add defrag flags

> Once this is merged with current Linux, which already has GFP_HARDWALL,
> I presume you will be back up to 21 bits, code and comment.

Looks like it.

>
> As I noted in another message the "USER" and the comment in:
>
> #define __GFP_USER 0x40000u /* User is a userspace user */
>
> are a bit misleading now. Perhaps GFP_EASYRCLM?
>

A rose by any other name would smell as sweet -Romeo

A flag by any other name would work as well -Joel

There are problems with any name we would use. I personally like __GFP_USER
because it is mostly user memory, and nobody will accidentally use it to label
something that is not user memory. Those who do use it for non-user memory will
do so with more caution and ridicule. This will keep it from expanding in use
beyond its intent.

If we name it __GFP_EASYRCLM we then start getting into questions about what we
mean by easy and somebody is going to decide that their kernel memory is pretty
easy to reclaim and mess things up. Maybe we could call it
__GFP_REALLYREALLYEASYRCLM to avoid confusion.

If there is a consensus from multiple people for me to go rename the flag
__GFP_xxxxx then I'm not that attached to it and will. But for now I'm going to
leave it __GFP_USER.

2005-09-27 21:24:15

by Paul Jackson

[permalink] [raw]

Subject: Re: [Lhms-devel] Re: [PATCH 1/9] add defrag flags

> But for now I'm going to leave it __GFP_USER.

Well, then, at least fix the comment, from the rather oddly phrased:

#define __GFP_USER 0x40000u /* User is a userspace user */

to something more accurate such as:

#define __GFP_USER 0x40000u /* User and other really easily reclaimed pages */

And consider adding a comment to its use in fs/buffer.c, where marking
a page obviously destined for kernel space __GFP_USER seems strange.
I doubt I will be the last person to look at the line of code and
scratch my head.

Nice clear simple names such as __GFP_USER (only a kernel hacker would
say that ;) should not be used if they are a flat out lie. Better to
use some tongue twister acronym, such as

#define __GFP_RRE_RCLM 0x40000u /* Really Really Easy ReCLaiM (user, buffer) */

so that people don't think they know what something means when they don't.

And the one thing you could say that's useful in this name, that it has
something to do with the reclaim mechanism, is missing - no 'RCLM' in it.

Roses may smell sweet by other names, but kernel names for things do
matter. Unlike classic flowers, we have an awful lot of colorless,
odorless stuff in there that no one learns about in childhood (Linus's
child notwithstanding ;). We desperately need names to tell the
essentials, and not lie. __GFP_USER does neither.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-09-27 22:03:54

by Joel Schopp

[permalink] [raw]

Subject: Re: [Lhms-devel] Re: [PATCH 1/9] add defrag flags

Index: 2.6.13-joel2/fs/buffer.c
===================================================================
--- 2.6.13-joel2.orig/fs/buffer.c 2005-09-13 14:54:13.%N -0500
+++ 2.6.13-joel2/fs/buffer.c 2005-09-27 16:52:05.%N -0500
@@ -1119,7 +1119,12 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;

- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ /*
+ * Mark as __GFP_USER because from a fragmentation avoidance and
+ * reclimation point of view this memory behaves like user memory.
+ */
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USER);
if (!page)
return NULL;

@@ -3044,7 +3049,8 @@ static void recalc_bh_state(void)

struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
preempt_disable();
__get_cpu_var(bh_accounting).nr++;
Index: 2.6.13-joel2/fs/dcache.c
===================================================================
--- 2.6.13-joel2.orig/fs/dcache.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/dcache.c 2005-09-13 15:02:01.%N -0500
@@ -721,7 +721,7 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;

- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;

Index: 2.6.13-joel2/fs/ext2/super.c
===================================================================
--- 2.6.13-joel2.orig/fs/ext2/super.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ext2/super.c 2005-09-13 15:02:01.%N -0500
@@ -138,7 +138,8 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep,
+ SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
Index: 2.6.13-joel2/fs/ext3/super.c
===================================================================
--- 2.6.13-joel2.orig/fs/ext3/super.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ext3/super.c 2005-09-13 15:02:01.%N -0500
@@ -440,7 +440,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;

- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
Index: 2.6.13-joel2/fs/ntfs/inode.c
===================================================================
--- 2.6.13-joel2.orig/fs/ntfs/inode.c 2005-09-13 14:54:14.%N -0500
+++ 2.6.13-joel2/fs/ntfs/inode.c 2005-09-13 15:05:53.%N -0500
@@ -317,7 +317,7 @@ struct inode *ntfs_alloc_big_inode(struc
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -342,7 +342,7 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
Index: 2.6.13-joel2/include/linux/gfp.h
===================================================================
--- 2.6.13-joel2.orig/include/linux/gfp.h 2005-09-13 14:54:17.%N -0500
+++ 2.6.13-joel2/include/linux/gfp.h 2005-09-27 16:40:55.%N -0500
@@ -41,6 +41,16 @@ struct vm_area_struct;
#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */
#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */

+/* Allocation type modifiers, these are required to be adjacent
+ * __GFP_USER: Allocation for user page or a buffer page
+ * __GFP_KERNRCLM: Short-lived or reclaimable kernel allocation
+ * Both bits off: Kernel non-reclaimable or very hard to reclaim
+ * RCLM_SHIFT (defined elsewhere) depends on the location of these bits
+ */
+#define __GFP_USER 0x40000u /* User & other really easily reclaimed pages */
+#define __GFP_KERNRCLM 0x80000u /* Kernel page that is easily reclaimable */
+#define __GFP_RCLM_BITS (__GFP_USER|__GFP_KERNRCLM)
+
#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)

@@ -48,14 +58,15 @@ struct vm_area_struct;
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- __GFP_NOMEMALLOC|__GFP_NORECLAIM)
+ __GFP_NOMEMALLOC|__GFP_KERNRCLM|__GFP_USER)

#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USER)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | \
+ __GFP_USER)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */

Attachments:

1_add_defrag_flags (5.35 kB)

2005-09-27 22:45:38

by Paul Jackson

[permalink] [raw]

Subject: Re: [Lhms-devel] Re: [PATCH 1/9] add defrag flags

+ * Mark as __GFP_USER because from a fragmentation avoidance and
+ * reclimation point of view this memory behaves like user memory.

You misspelled reclamation.

(Nice comment - I had to bitch about something ;).

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401