Hello,
This patch set tries to solve the problem that long-lasting page caches of
the ext4 superblock and its journaling data disturb page migration.
I've been testing the CMA feature on my ARM-based platform
and found two page caches that cannot be migrated:
the page cache of the ext4 superblock and the one for its journaling data.
Currently ext4 reads the superblock with sb_bread(), which allocates the page
from the movable area. The problem is that ext4 holds the page until
the filesystem is unmounted, so if the root filesystem is ext4 the page can never be migrated.
The journaling data for the superblock cannot be migrated either.
I introduce new APIs that allocate the page cache with a specific flag passed as an argument.
The *_gfp APIs are for users who want to set the page allocation flags for the page cache allocation,
and the *_unmovable APIs are for users who want to allocate the page cache from the non-movable area.
This is useful for ext4/ext3 and other filesystems that hold the page cache for a long time.
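For example, a filesystem that keeps a buffer until unmount could do something
like this (just a sketch using the helpers introduced in this series):

        struct buffer_head *bh;

        /* Keep the superblock buffer out of the movable area so that
         * holding it until unmount never blocks page migration. */
        bh = sb_bread_unmovable(sb, logical_sb_block);

        /* Equivalent low-level call: a gfp of 0 means no __GFP_MOVABLE. */
        bh = __bread_gfp(sb->s_bdev, logical_sb_block, sb->s_blocksize, 0);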
I have 3 patches:
1. Patch 1/3: introduce a new API that creates page cache with an allocation flag
2. Patch 2/3: have ext4 use the new API to read the superblock
3. Patch 3/3: have jbd/jbd2 use the new API for journaling of the superblock
This patchset is based on linux-next-20140814.
Thanks a lot.
Gioh Kim (3):
fs/buffer.c: allocate buffer cache with user specific flag
ext4: allocate buffer-cache for superblock in non-movable area
jbd/jbd2: allocate buffer-cache for superblock inode in non-movable area
fs/buffer.c | 54 ++++++++++++++++++++++++++++++++++----------
fs/ext4/super.c | 6 ++--
fs/jbd/journal.c | 2 +-
fs/jbd2/journal.c | 2 +-
include/linux/buffer_head.h | 14 ++++++++++-
5 files changed, 60 insertions(+), 18 deletions(-)
A buffer cache is normally allocated from the movable area
because it is referenced for a while and released soon.
But some filesystems hold the buffer cache for a long time,
which can disturb page migration.
New APIs are introduced to allocate the buffer cache
with a caller-specified flag.
The *_gfp APIs are for users who want to set the page allocation flags
for the page cache allocation.
The *_unmovable APIs are for users who want to allocate the page cache from
the non-movable area.
Signed-off-by: Gioh Kim <[email protected]>
---
fs/buffer.c | 54 +++++++++++++++++++++++++++++++++----------
include/linux/buffer_head.h | 14 ++++++++++-
2 files changed, 55 insertions(+), 13 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111..ee29bc4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
*/
static int
grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size, int sizebits)
+ pgoff_t index, int size, int sizebits, gfp_t gfp)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
@@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
- gfp_mask |= __GFP_MOVABLE;
+ gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+
/*
- * XXX: __getblk_slow() can not really deal with failure and
+ * XXX: __getblk_gfp() can not really deal with failure and
* will endlessly loop on improvised global reclaim. Prefer
* looping in the allocator rather than here, at least that
* code knows what it's doing.
@@ -1058,7 +1058,7 @@ failed:
* that page was dirty, the buffers are set dirty also.
*/
static int
-grow_buffers(struct block_device *bdev, sector_t block, int size)
+grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
{
pgoff_t index;
int sizebits;
@@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
}
/* Create a page with the proper size buffers.. */
- return grow_dev_page(bdev, block, index, size, sizebits);
+ return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
-static struct buffer_head *
-__getblk_slow(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
@@ -1111,13 +1112,21 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
if (bh)
return bh;
- ret = grow_buffers(bdev, block, size);
+ ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
if (ret == 0)
free_more_memory();
}
}
+EXPORT_SYMBOL(__getblk_gfp);
+
+struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
+ unsigned size)
+{
+ return __getblk_gfp(bdev, block, size, 0);
+}
+EXPORT_SYMBOL(getblk_unmovable);
/*
* The relationship between dirty buffers and dirty pages:
@@ -1385,7 +1394,7 @@ __getblk(struct block_device *bdev, sector_t block, unsigned size)
might_sleep();
if (bh == NULL)
- bh = __getblk_slow(bdev, block, size);
+ bh = __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
return bh;
}
EXPORT_SYMBOL(__getblk);
@@ -1410,18 +1419,39 @@ EXPORT_SYMBOL(__breadahead);
* @size: size (in bytes) to read
*
* Reads a specified block, and returns buffer head that contains it.
+ * The page cache is allocated from movable area so that it can be migrated.
* It returns NULL if the block was unreadable.
*/
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
+}
+EXPORT_SYMBOL(__bread);
+
+/**
+ * __bread_gfp() - reads a specified block and returns the bh
+ * @bdev: the block_device to read from
+ * @block: number of block
+ * @size: size (in bytes) to read
+ * @gfp: page allocation flag
+ *
+ * Reads a specified block, and returns buffer head that contains it.
+ * The page cache can be allocated from non-movable area
+ * not to prevent page migration if you set gfp to zero.
+ * It returns NULL if the block was unreadable.
+ */
+struct buffer_head *
+__bread_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
+{
+ struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
return bh;
}
-EXPORT_SYMBOL(__bread);
+EXPORT_SYMBOL(__bread_gfp);
/*
* invalidate_bh_lrus() is called rarely - but not only at unmount.
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 324329c..c3f89d3 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -177,10 +177,16 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
unsigned size);
struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
unsigned size);
+struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp);
+struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
+ unsigned size);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
+struct buffer_head *__bread_gfp(struct block_device *,
+ sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
@@ -295,7 +301,13 @@ static inline void bforget(struct buffer_head *bh)
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
- return __bread(sb->s_bdev, block, sb->s_blocksize);
+ return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
+}
+
+static inline struct buffer_head *
+sb_bread_unmovable(struct super_block *sb, sector_t block)
+{
+ return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
}
static inline void
--
1.7.9.5
The buffer cache for the superblock disturbs page migration
because it is not released until unmount.
It must therefore be allocated from the non-movable area.
Signed-off-by: Gioh Kim <[email protected]>
---
fs/ext4/super.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32b43ad..e4b62b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3435,7 +3435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
logical_sb_block = sb_block;
}
- if (!(bh = sb_bread(sb, logical_sb_block))) {
+ if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
ext4_msg(sb, KERN_ERR, "unable to read superblock");
goto out_fail;
}
@@ -3645,7 +3645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
- bh = sb_bread(sb, logical_sb_block);
+ bh = sb_bread_unmovable(sb, logical_sb_block);
if (!bh) {
ext4_msg(sb, KERN_ERR,
"Can't read superblock on 2nd try");
@@ -3867,7 +3867,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sbi->s_group_desc[i] = sb_bread(sb, block);
+ sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
if (!sbi->s_group_desc[i]) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
--
1.7.9.5
A long-lasting buffer cache can disturb page migration, so
it must be allocated from the non-movable area.
journal_init_inode() creates a buffer cache for superblock journaling.
The superblock exists until system shutdown, so the buffer cache
for the superblock also exists for a long time
and can disturb page migration.
This patch makes the buffer cache be allocated from the non-movable area
so that it does not disturb page migration.
Signed-off-by: Gioh Kim <[email protected]>
---
fs/jbd/journal.c | 2 +-
fs/jbd2/journal.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 06fe11e..aab8549 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode)
goto out_err;
}
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
printk(KERN_ERR
"%s: Cannot get buffer for journal superblock\n",
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e30..65bd3b1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
goto out_err;
}
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
printk(KERN_ERR
"%s: Cannot get buffer for journal superblock\n",
--
1.7.9.5
On Thu 28-08-14 11:32:29, Gioh Kim wrote:
>
> The buffer cache for the superblock disturbs page migration
> because it is not released until unmount.
> It must therefore be allocated from the non-movable area.
The patch looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> Signed-off-by: Gioh Kim <[email protected]>
> ---
> fs/ext4/super.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 32b43ad..e4b62b3 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -3435,7 +3435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> logical_sb_block = sb_block;
> }
>
> - if (!(bh = sb_bread(sb, logical_sb_block))) {
> + if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
> ext4_msg(sb, KERN_ERR, "unable to read superblock");
> goto out_fail;
> }
> @@ -3645,7 +3645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> brelse(bh);
> logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
> offset = do_div(logical_sb_block, blocksize);
> - bh = sb_bread(sb, logical_sb_block);
> + bh = sb_bread_unmovable(sb, logical_sb_block);
> if (!bh) {
> ext4_msg(sb, KERN_ERR,
> "Can't read superblock on 2nd try");
> @@ -3867,7 +3867,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>
> for (i = 0; i < db_count; i++) {
> block = descriptor_loc(sb, logical_sb_block, i);
> - sbi->s_group_desc[i] = sb_bread(sb, block);
> + sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
> if (!sbi->s_group_desc[i]) {
> ext4_msg(sb, KERN_ERR,
> "can't read group descriptor %d", i);
> --
> 1.7.9.5
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 28-08-14 11:33:20, Gioh Kim wrote:
>
> A long-lasting buffer cache can disturb page migration, so
> it must be allocated from the non-movable area.
>
> journal_init_inode() creates a buffer cache for superblock journaling.
> The superblock exists until system shutdown, so the buffer cache
> for the superblock also exists for a long time
> and can disturb page migration.
>
> This patch makes the buffer cache be allocated from the non-movable area
> so that it does not disturb page migration.
The patch looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> Signed-off-by: Gioh Kim <[email protected]>
> ---
> fs/jbd/journal.c | 2 +-
> fs/jbd2/journal.c | 2 +-
> 2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
> index 06fe11e..aab8549 100644
> --- a/fs/jbd/journal.c
> +++ b/fs/jbd/journal.c
> @@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode)
> goto out_err;
> }
>
> - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
> + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
> if (!bh) {
> printk(KERN_ERR
> "%s: Cannot get buffer for journal superblock\n",
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 67b8e30..65bd3b1 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
> goto out_err;
> }
>
> - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
> + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
> if (!bh) {
> printk(KERN_ERR
> "%s: Cannot get buffer for journal superblock\n",
> --
> 1.7.9.5
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 28-08-14 11:31:46, Gioh Kim wrote:
>
> A buffer cache is normally allocated from the movable area
> because it is referenced for a while and released soon.
> But some filesystems hold the buffer cache for a long time,
> which can disturb page migration.
>
> New APIs are introduced to allocate the buffer cache
> with a caller-specified flag.
> The *_gfp APIs are for users who want to set the page allocation flags
> for the page cache allocation.
> The *_unmovable APIs are for users who want to allocate the page cache from
> the non-movable area.
>
> Signed-off-by: Gioh Kim <[email protected]>
Still a few nits below.
> ---
> fs/buffer.c | 54 +++++++++++++++++++++++++++++++++----------
> include/linux/buffer_head.h | 14 ++++++++++-
> 2 files changed, 55 insertions(+), 13 deletions(-)
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 8f05111..ee29bc4 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
> */
> static int
> grow_dev_page(struct block_device *bdev, sector_t block,
> - pgoff_t index, int size, int sizebits)
> + pgoff_t index, int size, int sizebits, gfp_t gfp)
I've noticed that whitespace got damaged in your patches (tabs replaced
with spaces). Please use an email client that doesn't do this, or use
attachments. Otherwise the patch doesn't apply.
> {
> struct inode *inode = bdev->bd_inode;
> struct page *page;
> @@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
> int ret = 0; /* Will call free_more_memory() */
> gfp_t gfp_mask;
>
> - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> - gfp_mask |= __GFP_MOVABLE;
> + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
> +
> /*
> - * XXX: __getblk_slow() can not really deal with failure and
> + * XXX: __getblk_gfp() can not really deal with failure and
> * will endlessly loop on improvised global reclaim. Prefer
> * looping in the allocator rather than here, at least that
> * code knows what it's doing.
> @@ -1058,7 +1058,7 @@ failed:
> * that page was dirty, the buffers are set dirty also.
> */
> static int
> -grow_buffers(struct block_device *bdev, sector_t block, int size)
> +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
> {
> pgoff_t index;
> int sizebits;
> @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
> }
>
> /* Create a page with the proper size buffers.. */
> - return grow_dev_page(bdev, block, index, size, sizebits);
> + return grow_dev_page(bdev, block, index, size, sizebits, gfp);
> }
>
> -static struct buffer_head *
> -__getblk_slow(struct block_device *bdev, sector_t block, int size)
> +struct buffer_head *
> +__getblk_gfp(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp)
> {
> /* Size must be multiple of hard sectorsize */
> if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
> @@ -1111,13 +1112,21 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
> if (bh)
> return bh;
>
> - ret = grow_buffers(bdev, block, size);
> + ret = grow_buffers(bdev, block, size, gfp);
> if (ret < 0)
> return NULL;
> if (ret == 0)
> free_more_memory();
> }
> }
> +EXPORT_SYMBOL(__getblk_gfp);
> +
> +struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
> + unsigned size)
> +{
> + return __getblk_gfp(bdev, block, size, 0);
> +}
> +EXPORT_SYMBOL(getblk_unmovable);
This can be just an inline function in include/linux/buffer_head.h.
> /*
> * The relationship between dirty buffers and dirty pages:
> @@ -1385,7 +1394,7 @@ __getblk(struct block_device *bdev, sector_t block, unsigned size)
>
> might_sleep();
> if (bh == NULL)
> - bh = __getblk_slow(bdev, block, size);
> + bh = __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
> return bh;
> }
> EXPORT_SYMBOL(__getblk);
I'd keep __getblk_slow() internal and just add a 'gfp' parameter to it.
Then change __getblk() to __getblk_gfp() and pass on the 'gfp' parameter.
And finally, define an inline __getblk() in include/linux/buffer_head.h which
just calls __getblk_gfp() with the appropriate gfp mask.
That way you keep all the interfaces completely symmetric. For example,
right now the might_sleep() check is missing from __getblk_gfp().
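Roughly, buffer_head.h would then end up with wrappers along these lines (just
a sketch of the shape, not a tested patch):

/* old behaviour for existing callers: movable page cache */
static inline struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
        return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
}

/* page cache that must never block page migration */
static inline struct buffer_head *
getblk_unmovable(struct block_device *bdev, sector_t block, unsigned size)
{
        return __getblk_gfp(bdev, block, size, 0);
}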
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu, Aug 28, 2014 at 11:26:31AM +0900, Gioh Kim wrote:
>
> I have 3 patches:
>
> 1. Patch 1/3: introduce a new API that creates page cache with an allocation flag
> 2. Patch 2/3: have ext4 use the new API to read the superblock
> 3. Patch 3/3: have jbd/jbd2 use the new API for journaling of the superblock
>
> This patchset is based on linux-next-20140814.
Looks good. Unless there are any objections from the mm folks, since
nearly all of the changes are in fs/buffer.c and in ext4/jbd2
code, I plan to carry this in the ext4 tree.
I do plan to clean up the patch titles a little; from:
fs/buffer.c: allocate buffer cache with user specific flag
ext4: allocate buffer-cache for superblock in non-movable area
jbd/jbd2: allocate buffer-cache for superblock inode in non-movable area
to:
fs.c: support buffer cache allocations with gfp modifiers
ext4: use non-movable memory for the ext4 superblock
jbd/jbd2: use non-movable memory for the jbd superblock
And do some minor english grammar/spelling cleanups in the commit
description when I apply the patch.
Thanks for this work; I'm going to need to use the interfaces you
introduced in fs/buffer.c to guarantee that certain directory reads
can be done with GFP_NOFAIL (since under heavy memory pressure,
allocation failures there can currently lead to the file system
getting declared corrupt. Interestingly, this bug has been around for
a long time, and hasn't been noticed in over two cycles of enterprise
distro qualifications by either RHEL or SLES, which leads me to wonder
if there are other places where the error paths for GFP_NOFS
allocations haven't been well tested....)
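Concretely, that should just be a matter of passing __GFP_NOFAIL through the
new gfp parameter, e.g. (a rough sketch of the idea, not an actual patch):

        /* directory block read that must not fail under memory pressure */
        bh = __getblk_gfp(sb->s_bdev, block, sb->s_blocksize,
                          __GFP_MOVABLE | __GFP_NOFAIL);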
- Ted
On 2014-08-28 10:48 PM, Theodore Ts'o wrote:
> On Thu, Aug 28, 2014 at 11:26:31AM +0900, Gioh Kim wrote:
>>
>> I have 3 patches:
>>
>> 1. Patch 1/3: introduce a new API that creates page cache with an allocation flag
>> 2. Patch 2/3: have ext4 use the new API to read the superblock
>> 3. Patch 3/3: have jbd/jbd2 use the new API for journaling of the superblock
>>
>> This patchset is based on linux-next-20140814.
>
> Looks good. Unless there are any objections from the mm folks, since
> nearly all of the changes are in fs/buffer.c and in ext4/jbd2
> code, I plan to carry this in the ext4 tree.
>
> I do plan to clean up the patch titles a little; from:
>
> fs/buffer.c: allocate buffer cache with user specific flag
> ext4: allocate buffer-cache for superblock in non-movable area
> jbd/jbd2: allocate buffer-cache for superblock inode in non-movable area
>
> to:
>
> fs.c: support buffer cache allocations with gfp modifiers
> ext4: use non-movable memory for the ext4 superblock
> jbd/jbd2: use non-movable memory for the jbd superblock
>
> And do some minor english grammar/spelling cleanups in the commit
> description when I apply the patch.
>
> Thanks for this work; I'm going to need to use the interfaces you
> introduced in fs/buffer.c to guarantee that certain directory reads
> can be done with GFP_NOFAIL (since under heavy memory pressure,
> allocation failures there can currently lead to the file system
> getting declared corrupt. Interestingly, this bug has been around for
> a long time, and hasn't been noticed in over two cycles of enterprise
> distro qualifications by either RHEL or SLES, which leads me to wonder
> if there are other places where the error paths for GFP_NOFS
> allocations haven't been well tested....)
>
> - Ted
>
Thanks a lot. It's my honor.
I'm not good at English, so please feel free to fix it.
Jan Kara has requested some changes to the interfaces.
I'm going to send v4 soon with the new titles you recommend and those interface changes.
On 2014-08-28 7:59 PM, Jan Kara wrote:
> On Thu 28-08-14 11:31:46, Gioh Kim wrote:
>>
>> A buffer cache is normally allocated from the movable area
>> because it is referenced for a while and released soon.
>> But some filesystems hold the buffer cache for a long time,
>> which can disturb page migration.
>>
>> New APIs are introduced to allocate the buffer cache
>> with a caller-specified flag.
>> The *_gfp APIs are for users who want to set the page allocation flags
>> for the page cache allocation.
>> The *_unmovable APIs are for users who want to allocate the page cache from
>> the non-movable area.
>>
>> Signed-off-by: Gioh Kim <[email protected]>
> Still a few nits below.
>> ---
>> fs/buffer.c | 54 +++++++++++++++++++++++++++++++++----------
>> include/linux/buffer_head.h | 14 ++++++++++-
>> 2 files changed, 55 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/buffer.c b/fs/buffer.c
>> index 8f05111..ee29bc4 100644
>> --- a/fs/buffer.c
>> +++ b/fs/buffer.c
>> @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
>> */
>> static int
>> grow_dev_page(struct block_device *bdev, sector_t block,
>> - pgoff_t index, int size, int sizebits)
>> + pgoff_t index, int size, int sizebits, gfp_t gfp)
> I've noticed that whitespace got damaged in your patches (tabs replaced
> with spaces). Please use an email client that doesn't do this, or use
> attachments. Otherwise the patch doesn't apply.
I'm sorry, it's my mistake.
I'm using Thunderbird but looking for another client.
>
>> {
>> struct inode *inode = bdev->bd_inode;
>> struct page *page;
>> @@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
>> int ret = 0; /* Will call free_more_memory() */
>> gfp_t gfp_mask;
>>
>> - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
>> - gfp_mask |= __GFP_MOVABLE;
>> + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
>> +
>> /*
>> - * XXX: __getblk_slow() can not really deal with failure and
>> + * XXX: __getblk_gfp() can not really deal with failure and
>> * will endlessly loop on improvised global reclaim. Prefer
>> * looping in the allocator rather than here, at least that
>> * code knows what it's doing.
>> @@ -1058,7 +1058,7 @@ failed:
>> * that page was dirty, the buffers are set dirty also.
>> */
>> static int
>> -grow_buffers(struct block_device *bdev, sector_t block, int size)
>> +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
>> {
>> pgoff_t index;
>> int sizebits;
>> @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
>> }
>>
>> /* Create a page with the proper size buffers.. */
>> - return grow_dev_page(bdev, block, index, size, sizebits);
>> + return grow_dev_page(bdev, block, index, size, sizebits, gfp);
>> }
>>
>> -static struct buffer_head *
>> -__getblk_slow(struct block_device *bdev, sector_t block, int size)
>> +struct buffer_head *
>> +__getblk_gfp(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp)
>> {
>> /* Size must be multiple of hard sectorsize */
>> if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
>> @@ -1111,13 +1112,21 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
>> if (bh)
>> return bh;
>>
>> - ret = grow_buffers(bdev, block, size);
>> + ret = grow_buffers(bdev, block, size, gfp);
>> if (ret < 0)
>> return NULL;
>> if (ret == 0)
>> free_more_memory();
>> }
>> }
>> +EXPORT_SYMBOL(__getblk_gfp);
>> +
>> +struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
>> + unsigned size)
>> +{
>> + return __getblk_gfp(bdev, block, size, 0);
>> +}
>> +EXPORT_SYMBOL(getblk_unmovable);
> This can be just an inline function in include/linux/buffer_head.h.
OK. I agree.
>
>> /*
>> * The relationship between dirty buffers and dirty pages:
>> @@ -1385,7 +1394,7 @@ __getblk(struct block_device *bdev, sector_t block, unsigned size)
>>
>> might_sleep();
>> if (bh == NULL)
>> - bh = __getblk_slow(bdev, block, size);
>> + bh = __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
>> return bh;
>> }
>> EXPORT_SYMBOL(__getblk);
> I'd keep __getblk_slow() internal and just add a 'gfp' parameter to it.
> Then change __getblk() to __getblk_gfp() and pass on the 'gfp' parameter.
> And finally, define an inline __getblk() in include/linux/buffer_head.h which
> just calls __getblk_gfp() with the appropriate gfp mask.
>
> That way you keep all the interfaces completely symmetric. For example,
> right now the might_sleep() check is missing from __getblk_gfp().
>
> Honza
>
I got it.
What about the change below? It adds a gfp parameter to __getblk_slow() and changes __getblk() into
__getblk_gfp(); getblk_unmovable() and __getblk() are, I think, symmetric now.
If you say OK, I'm going to send v4 with tabs ;-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111..21711c78 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
*/
static int
grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size, int sizebits)
+ pgoff_t index, int size, int sizebits, gfp_t gfp)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
@@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
- gfp_mask |= __GFP_MOVABLE;
+ gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+
/*
- * XXX: __getblk_slow() can not really deal with failure and
+ * XXX: __getblk_gfp() can not really deal with failure and
* will endlessly loop on improvised global reclaim. Prefer
* looping in the allocator rather than here, at least that
* code knows what it's doing.
@@ -1058,7 +1058,7 @@ failed:
* that page was dirty, the buffers are set dirty also.
*/
static int
-grow_buffers(struct block_device *bdev, sector_t block, int size)
+grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
{
pgoff_t index;
int sizebits;
@@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
}
/* Create a page with the proper size buffers.. */
- return grow_dev_page(bdev, block, index, size, sizebits);
+ return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
-static struct buffer_head *
-__getblk_slow(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk_slow(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
@@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
if (bh)
return bh;
- ret = grow_buffers(bdev, block, size);
+ ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
if (ret == 0)
free_more_memory();
}
}
+EXPORT_SYMBOL(__getblk_slow);
/*
* The relationship between dirty buffers and dirty pages:
@@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
EXPORT_SYMBOL(__find_get_block);
/*
- * __getblk will locate (and, if necessary, create) the buffer_head
+ * __getblk_gfp will locate (and, if necessary, create) the buffer_head
* which corresponds to the passed block_device, block and size. The
* returned buffer has its reference count incremented.
*
- * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
- * attempt is failing. FIXME, perhaps?
+ * __getblk()_gfp will lock up the machine if grow_dev_page's
+ * try_to_free_buffers() attempt is failing. FIXME, perhaps?
*/
struct buffer_head *
-__getblk(struct block_device *bdev, sector_t block, unsigned size)
+__getblk_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
might_sleep();
if (bh == NULL)
- bh = __getblk_slow(bdev, block, size);
+ bh = __getblk_slow(bdev, block, size, gfp);
return bh;
}
-EXPORT_SYMBOL(__getblk);
+EXPORT_SYMBOL(__getblk_gfp);
/*
* Do async read-ahead on a buffer..
@@ -1410,18 +1413,39 @@ EXPORT_SYMBOL(__breadahead);
* @size: size (in bytes) to read
*
* Reads a specified block, and returns buffer head that contains it.
+ * The page cache is allocated from movable area so that it can be migrated.
* It returns NULL if the block was unreadable.
*/
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
+}
+EXPORT_SYMBOL(__bread);
+
+/**
+ * __bread_gfp() - reads a specified block and returns the bh
+ * @bdev: the block_device to read from
+ * @block: number of block
+ * @size: size (in bytes) to read
+ * @gfp: page allocation flag
+ *
+ * Reads a specified block, and returns buffer head that contains it.
+ * The page cache can be allocated from non-movable area
+ * not to prevent page migration if you set gfp to zero.
+ * It returns NULL if the block was unreadable.
+ */
+struct buffer_head *
+__bread_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
+{
+ struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
return bh;
}
-EXPORT_SYMBOL(__bread);
+EXPORT_SYMBOL(__bread_gfp);
/*
* invalidate_bh_lrus() is called rarely - but not only at unmount.
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 324329c..6073f5d 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -175,12 +175,14 @@ void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
unsigned size);
-struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
- unsigned size);
+struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
+struct buffer_head *__bread_gfp(struct block_device *,
+ sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
@@ -295,7 +297,13 @@ static inline void bforget(struct buffer_head *bh)
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
- return __bread(sb->s_bdev, block, sb->s_blocksize);
+ return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
+}
+
+static inline struct buffer_head *
+sb_bread_unmovable(struct super_block *sb, sector_t block)
+{
+ return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
}
static inline void
@@ -307,7 +315,7 @@ sb_breadahead(struct super_block *sb, sector_t block)
static inline struct buffer_head *
sb_getblk(struct super_block *sb, sector_t block)
{
- return __getblk(sb->s_bdev, block, sb->s_blocksize);
+ return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
}
static inline struct buffer_head *
@@ -344,6 +352,20 @@ static inline void lock_buffer(struct buffer_head *bh)
__lock_buffer(bh);
}
+static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
+ sector_t block,
+ unsigned size)
+{
+ return __getblk_gfp(bdev, block, size, 0);
+}
+
+static inline struct buffer_head *__getblk(struct block_device *bdev,
+ sector_t block,
+ unsigned size)
+{
+ return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
+}
+
extern int __set_page_dirty_buffers(struct page *page);
#else /* CONFIG_BLOCK */
--
1.7.9.5
On Fri 29-08-14 13:48:27, Gioh Kim wrote:
> What about the change below? It adds a gfp parameter to __getblk_slow() and changes __getblk() into
> __getblk_gfp(); getblk_unmovable() and __getblk() are, I think, symmetric now.
>
> If you say OK, I'm going to send v4 with tabs ;-)
Yes, this looks like what I wanted. I've just spotted two typos in
comments and one function which should be inline (see below). Thanks for
the work!
Honza
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 8f05111..21711c78 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
> */
> static int
> grow_dev_page(struct block_device *bdev, sector_t block,
> - pgoff_t index, int size, int sizebits)
> + pgoff_t index, int size, int sizebits, gfp_t gfp)
> {
> struct inode *inode = bdev->bd_inode;
> struct page *page;
> @@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
> int ret = 0; /* Will call free_more_memory() */
> gfp_t gfp_mask;
>
> - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> - gfp_mask |= __GFP_MOVABLE;
> + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
> +
> /*
> - * XXX: __getblk_slow() can not really deal with failure and
> + * XXX: __getblk_gfp() can not really deal with failure and
You can leave __getblk_slow() here I believe.
> * will endlessly loop on improvised global reclaim. Prefer
> * looping in the allocator rather than here, at least that
> * code knows what it's doing.
> @@ -1058,7 +1058,7 @@ failed:
> * that page was dirty, the buffers are set dirty also.
> */
> static int
> -grow_buffers(struct block_device *bdev, sector_t block, int size)
> +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
> {
> pgoff_t index;
> int sizebits;
> @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
> }
>
> /* Create a page with the proper size buffers.. */
> - return grow_dev_page(bdev, block, index, size, sizebits);
> + return grow_dev_page(bdev, block, index, size, sizebits, gfp);
> }
>
> -static struct buffer_head *
> -__getblk_slow(struct block_device *bdev, sector_t block, int size)
> +struct buffer_head *
> +__getblk_slow(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp)
> {
> /* Size must be multiple of hard sectorsize */
> if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
> @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
> if (bh)
> return bh;
>
> - ret = grow_buffers(bdev, block, size);
> + ret = grow_buffers(bdev, block, size, gfp);
> if (ret < 0)
> return NULL;
> if (ret == 0)
> free_more_memory();
> }
> }
> +EXPORT_SYMBOL(__getblk_slow);
>
> /*
> * The relationship between dirty buffers and dirty pages:
> @@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
> EXPORT_SYMBOL(__find_get_block);
>
> /*
> - * __getblk will locate (and, if necessary, create) the buffer_head
> + * __getblk_gfp will locate (and, if necessary, create) the buffer_head
> * which corresponds to the passed block_device, block and size. The
> * returned buffer has its reference count incremented.
> *
> - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
> - * attempt is failing. FIXME, perhaps?
> + * __getblk()_gfp will lock up the machine if grow_dev_page's
_gfp should be before () in the line above.
> + * try_to_free_buffers() attempt is failing. FIXME, perhaps?
> */
> struct buffer_head *
> -__getblk(struct block_device *bdev, sector_t block, unsigned size)
> +__getblk_gfp(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp)
> {
> struct buffer_head *bh = __find_get_block(bdev, block, size);
>
> might_sleep();
> if (bh == NULL)
> - bh = __getblk_slow(bdev, block, size);
> + bh = __getblk_slow(bdev, block, size, gfp);
> return bh;
> }
> -EXPORT_SYMBOL(__getblk);
> +EXPORT_SYMBOL(__getblk_gfp);
>
> /*
> * Do async read-ahead on a buffer..
> @@ -1410,18 +1413,39 @@ EXPORT_SYMBOL(__breadahead);
> * @size: size (in bytes) to read
> *
> * Reads a specified block, and returns buffer head that contains it.
> + * The page cache is allocated from movable area so that it can be migrated.
> * It returns NULL if the block was unreadable.
> */
> struct buffer_head *
> __bread(struct block_device *bdev, sector_t block, unsigned size)
> {
> - struct buffer_head *bh = __getblk(bdev, block, size);
> + return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
> +}
> +EXPORT_SYMBOL(__bread);
This can just be defined inline in buffer_head.h.
> +
> +/**
> + * __bread_gfp() - reads a specified block and returns the bh
> + * @bdev: the block_device to read from
> + * @block: number of block
> + * @size: size (in bytes) to read
> + * @gfp: page allocation flag
> + *
> + * Reads a specified block, and returns buffer head that contains it.
> + * The page cache can be allocated from non-movable area
> + * not to prevent page migration if you set gfp to zero.
> + * It returns NULL if the block was unreadable.
> + */
> +struct buffer_head *
> +__bread_gfp(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp)
> +{
> + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
>
> if (likely(bh) && !buffer_uptodate(bh))
> bh = __bread_slow(bh);
> return bh;
> }
> -EXPORT_SYMBOL(__bread);
> +EXPORT_SYMBOL(__bread_gfp);
>
> /*
> * invalidate_bh_lrus() is called rarely - but not only at unmount.
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index 324329c..6073f5d 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -175,12 +175,14 @@ void __wait_on_buffer(struct buffer_head *);
> wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
> struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
> unsigned size);
> -struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
> - unsigned size);
> +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp);
> void __brelse(struct buffer_head *);
> void __bforget(struct buffer_head *);
> void __breadahead(struct block_device *, sector_t block, unsigned int size);
> struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
> +struct buffer_head *__bread_gfp(struct block_device *,
> + sector_t block, unsigned size, gfp_t gfp);
> void invalidate_bh_lrus(void);
> struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
> void free_buffer_head(struct buffer_head * bh);
> @@ -295,7 +297,13 @@ static inline void bforget(struct buffer_head *bh)
> static inline struct buffer_head *
> sb_bread(struct super_block *sb, sector_t block)
> {
> - return __bread(sb->s_bdev, block, sb->s_blocksize);
> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
> +}
> +
> +static inline struct buffer_head *
> +sb_bread_unmovable(struct super_block *sb, sector_t block)
> +{
> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
> }
>
> static inline void
> @@ -307,7 +315,7 @@ sb_breadahead(struct super_block *sb, sector_t block)
> static inline struct buffer_head *
> sb_getblk(struct super_block *sb, sector_t block)
> {
> - return __getblk(sb->s_bdev, block, sb->s_blocksize);
> + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
> }
>
> static inline struct buffer_head *
> @@ -344,6 +352,20 @@ static inline void lock_buffer(struct buffer_head *bh)
> __lock_buffer(bh);
> }
>
> +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
> + sector_t block,
> + unsigned size)
> +{
> + return __getblk_gfp(bdev, block, size, 0);
> +}
> +
> +static inline struct buffer_head *__getblk(struct block_device *bdev,
> + sector_t block,
> + unsigned size)
> +{
> + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
> +}
> +
> extern int __set_page_dirty_buffers(struct page *page);
>
> #else /* CONFIG_BLOCK */
> --
> 1.7.9.5
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On 2014-09-01 4:53 PM, Jan Kara wrote:
> On Fri 29-08-14 13:48:27, Gioh Kim wrote:
>> What about below?: add gfp for __getblk_slow, change __getblk into __getblk_gfp,
>> getblk_unmovable and __getblk are, I think, symmetric.
>>
>> If you say OK, I'm going to send v4 with tabs ;-)
> Yes, this looks like what I wanted. I've just spotted two typos in
> comments and one function which should be inline (see below). Thanks for
> the work!
>
> Honza
Thank you too!
I'm going to post the next spin soon.
>
>> diff --git a/fs/buffer.c b/fs/buffer.c
>> index 8f05111..21711c78 100644
>> --- a/fs/buffer.c
>> +++ b/fs/buffer.c
>> @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
>> */
>> static int
>> grow_dev_page(struct block_device *bdev, sector_t block,
>> - pgoff_t index, int size, int sizebits)
>> + pgoff_t index, int size, int sizebits, gfp_t gfp)
>> {
>> struct inode *inode = bdev->bd_inode;
>> struct page *page;
>> @@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
>> int ret = 0; /* Will call free_more_memory() */
>> gfp_t gfp_mask;
>>
>> - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
>> - gfp_mask |= __GFP_MOVABLE;
>> + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
>> +
>> /*
>> - * XXX: __getblk_slow() can not really deal with failure and
>> + * XXX: __getblk_gfp() can not really deal with failure and
> You can leave __getblk_slow() here I believe.
I got it. It's my mistake.
>
>
>> * will endlessly loop on improvised global reclaim. Prefer
>> * looping in the allocator rather than here, at least that
>> * code knows what it's doing.
>> @@ -1058,7 +1058,7 @@ failed:
>> * that page was dirty, the buffers are set dirty also.
>> */
>> static int
>> -grow_buffers(struct block_device *bdev, sector_t block, int size)
>> +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
>> {
>> pgoff_t index;
>> int sizebits;
>> @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
>> }
>>
>> /* Create a page with the proper size buffers.. */
>> - return grow_dev_page(bdev, block, index, size, sizebits);
>> + return grow_dev_page(bdev, block, index, size, sizebits, gfp);
>> }
>>
>> -static struct buffer_head *
>> -__getblk_slow(struct block_device *bdev, sector_t block, int size)
>> +struct buffer_head *
>> +__getblk_slow(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp)
>> {
>> /* Size must be multiple of hard sectorsize */
>> if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
>> @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
>> if (bh)
>> return bh;
>>
>> - ret = grow_buffers(bdev, block, size);
>> + ret = grow_buffers(bdev, block, size, gfp);
>> if (ret < 0)
>> return NULL;
>> if (ret == 0)
>> free_more_memory();
>> }
>> }
>> +EXPORT_SYMBOL(__getblk_slow);
>>
>> /*
>> * The relationship between dirty buffers and dirty pages:
>> @@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
>> EXPORT_SYMBOL(__find_get_block);
>>
>> /*
>> - * __getblk will locate (and, if necessary, create) the buffer_head
>> + * __getblk_gfp will locate (and, if necessary, create) the buffer_head
>> * which corresponds to the passed block_device, block and size. The
>> * returned buffer has its reference count incremented.
>> *
>> - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
>> - * attempt is failing. FIXME, perhaps?
>> + * __getblk()_gfp will lock up the machine if grow_dev_page's
> _gfp should be before () in the line above.
I got it.
>
>> + * try_to_free_buffers() attempt is failing. FIXME, perhaps?
>> */
>> struct buffer_head *
>> -__getblk(struct block_device *bdev, sector_t block, unsigned size)
>> +__getblk_gfp(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp)
>> {
>> struct buffer_head *bh = __find_get_block(bdev, block, size);
>>
>> might_sleep();
>> if (bh == NULL)
>> - bh = __getblk_slow(bdev, block, size);
>> + bh = __getblk_slow(bdev, block, size, gfp);
>> return bh;
>> }
>> -EXPORT_SYMBOL(__getblk);
>> +EXPORT_SYMBOL(__getblk_gfp);
>>
>> /*
>> * Do async read-ahead on a buffer..
>> @@ -1410,18 +1413,39 @@ EXPORT_SYMBOL(__breadahead);
>> * @size: size (in bytes) to read
>> *
>> * Reads a specified block, and returns buffer head that contains it.
>> + * The page cache is allocated from movable area so that it can be migrated.
>> * It returns NULL if the block was unreadable.
>> */
>> struct buffer_head *
>> __bread(struct block_device *bdev, sector_t block, unsigned size)
>> {
>> - struct buffer_head *bh = __getblk(bdev, block, size);
>> + return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
>> +}
>> +EXPORT_SYMBOL(__bread);
> This can just be defined inline in buffer_head.h.
I got it.
>
>
>> +
>> +/**
>> + * __bread_gfp() - reads a specified block and returns the bh
>> + * @bdev: the block_device to read from
>> + * @block: number of block
>> + * @size: size (in bytes) to read
>> + * @gfp: page allocation flag
>> + *
>> + * Reads a specified block, and returns buffer head that contains it.
>> + * The page cache can be allocated from non-movable area
>> + * not to prevent page migration if you set gfp to zero.
>> + * It returns NULL if the block was unreadable.
>> + */
>> +struct buffer_head *
>> +__bread_gfp(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp)
>> +{
>> + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
>>
>> if (likely(bh) && !buffer_uptodate(bh))
>> bh = __bread_slow(bh);
>> return bh;
>> }
>> -EXPORT_SYMBOL(__bread);
>> +EXPORT_SYMBOL(__bread_gfp);
>>
>> /*
>> * invalidate_bh_lrus() is called rarely - but not only at unmount.
>> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
>> index 324329c..6073f5d 100644
>> --- a/include/linux/buffer_head.h
>> +++ b/include/linux/buffer_head.h
>> @@ -175,12 +175,14 @@ void __wait_on_buffer(struct buffer_head *);
>> wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
>> struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
>> unsigned size);
>> -struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
>> - unsigned size);
>> +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp);
>> void __brelse(struct buffer_head *);
>> void __bforget(struct buffer_head *);
>> void __breadahead(struct block_device *, sector_t block, unsigned int size);
>> struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
>> +struct buffer_head *__bread_gfp(struct block_device *,
>> + sector_t block, unsigned size, gfp_t gfp);
>> void invalidate_bh_lrus(void);
>> struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
>> void free_buffer_head(struct buffer_head * bh);
>> @@ -295,7 +297,13 @@ static inline void bforget(struct buffer_head *bh)
>> static inline struct buffer_head *
>> sb_bread(struct super_block *sb, sector_t block)
>> {
>> - return __bread(sb->s_bdev, block, sb->s_blocksize);
>> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
>> +}
>> +
>> +static inline struct buffer_head *
>> +sb_bread_unmovable(struct super_block *sb, sector_t block)
>> +{
>> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
>> }
>>
>> static inline void
>> @@ -307,7 +315,7 @@ sb_breadahead(struct super_block *sb, sector_t block)
>> static inline struct buffer_head *
>> sb_getblk(struct super_block *sb, sector_t block)
>> {
>> - return __getblk(sb->s_bdev, block, sb->s_blocksize);
>> + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
>> }
>>
>> static inline struct buffer_head *
>> @@ -344,6 +352,20 @@ static inline void lock_buffer(struct buffer_head *bh)
>> __lock_buffer(bh);
>> }
>>
>> +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
>> + sector_t block,
>> + unsigned size)
>> +{
>> + return __getblk_gfp(bdev, block, size, 0);
>> +}
>> +
>> +static inline struct buffer_head *__getblk(struct block_device *bdev,
>> + sector_t block,
>> + unsigned size)
>> +{
>> + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
>> +}
>> +
>> extern int __set_page_dirty_buffers(struct page *page);
>>
>> #else /* CONFIG_BLOCK */
>> --
>> 1.7.9.5
>>