Hello,
This patch try to solve problem that a long-lasting page caches of
ext4 superblock and journaling of superblock disturb page migration.
I've been testing CMA feature on my ARM-based platform
and found that two page caches cannot be migrated.
They are page caches of superblock of ext4 filesystem and its journaling data.
Current ext4 reads superblock with sb_bread() that allocates page
from movable area. But the problem is that ext4 hold the page until
it is unmounted. If root filesystem is ext4 the page cannot be migrated forever.
And also the journaling data for the superblock cannot be migreated.
I introduce a new API that allocates page cache with specific flag passed by an argument.
The API user can set flag to zero if the user wants to allocate page cache from non-movable area.
It is useful for ext4/ext3 and others that want to hold page cache for a long time.
I have 3 patchs:
1. Patch 1/3: introduce a new API that create page cache with allocation flag
2. Patch 2/3: have ext4 use the new API to read superblock
3. Patch 3/3: have jbd/jbd2 use the new API to make journaling of superblock
This patchset is based on linux-next-20140814.
Thanks a lot.
Gioh Kim (3):
fs/buffer.c: allocate buffer cache with user specific flag
ext4: allocate buffer-cache for superblock in non-movable area
jbd-jbd2-allocate-buffer-cache-for-superblock-inode-.patch
fs/buffer.c | 52 +++++++++++++++++++++++++++++---------------
fs/ext4/super.c | 6 ++---
fs/jbd/journal.c | 2 -
fs/jbd2/journal.c | 2 -
include/linux/buffer_head.h | 12 +++++++++-
5 files changed, 51 insertions(+), 23 deletions(-)
A buffer cache is allocated from movable area
because it is referred for a while and released soon.
But some filesystems are taking buffer cache for a long time
and it can disturb page migration.
A new API should be introduced to allocate buffer cache
with user specific flag.
For instance if user set flag to zero, buffer cache is allocated from
non-movable area.
Signed-off-by: Gioh Kim <[email protected]>
---
fs/buffer.c | 52 +++++++++++++++++++++++++++++--------------
include/linux/buffer_head.h | 12 +++++++++-
2 files changed, 46 insertions(+), 18 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111..14f2f21 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
*/
static int
grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size, int sizebits)
+ pgoff_t index, int size, int sizebits, gfp_t gfp)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
@@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
- gfp_mask |= __GFP_MOVABLE;
+ gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+
/*
- * XXX: __getblk_slow() can not really deal with failure and
+ * XXX: __getblk_gfp() can not really deal with failure and
* will endlessly loop on improvised global reclaim. Prefer
* looping in the allocator rather than here, at least that
* code knows what it's doing.
@@ -1058,7 +1058,7 @@ failed:
* that page was dirty, the buffers are set dirty also.
*/
static int
-grow_buffers(struct block_device *bdev, sector_t block, int size)
+grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
{
pgoff_t index;
int sizebits;
@@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
}
/* Create a page with the proper size buffers.. */
- return grow_dev_page(bdev, block, index, size, sizebits);
+ return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
-static struct buffer_head *
-__getblk_slow(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
@@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
if (bh)
return bh;
- ret = grow_buffers(bdev, block, size);
+ ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
if (ret == 0)
free_more_memory();
}
}
+EXPORT_SYMBOL(__getblk_gfp);
/*
* The relationship between dirty buffers and dirty pages:
@@ -1381,12 +1383,7 @@ EXPORT_SYMBOL(__find_get_block);
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __find_get_block(bdev, block, size);
-
- might_sleep();
- if (bh == NULL)
- bh = __getblk_slow(bdev, block, size);
- return bh;
+ return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
}
EXPORT_SYMBOL(__getblk);
@@ -1410,18 +1407,39 @@ EXPORT_SYMBOL(__breadahead);
* @size: size (in bytes) to read
*
* Reads a specified block, and returns buffer head that contains it.
+ * The page cache is allocated from movable area so that it can be migrated.
* It returns NULL if the block was unreadable.
*/
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
+}
+EXPORT_SYMBOL(__bread);
+
+/**
+ * __bread_gfp() - reads a specified block and returns the bh
+ * @bdev: the block_device to read from
+ * @block: number of block
+ * @size: size (in bytes) to read
+ * @gfp: page allocation flag
+ *
+ * Reads a specified block, and returns buffer head that contains it.
+ * The page cache can be allocated from non-movable area
+ * not to prevent page migration if you set gfp to zero.
+ * It returns NULL if the block was unreadable.
+ */
+struct buffer_head *
+__bread_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
+{
+ struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
return bh;
}
-EXPORT_SYMBOL(__bread);
+EXPORT_SYMBOL(__bread_gfp);
/*
* invalidate_bh_lrus() is called rarely - but not only at unmount.
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 324329c..a1d73fd 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -177,10 +177,14 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
unsigned size);
struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
unsigned size);
+struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
+struct buffer_head *__bread_gfp(struct block_device *,
+ sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
@@ -295,7 +299,13 @@ static inline void bforget(struct buffer_head *bh)
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
- return __bread(sb->s_bdev, block, sb->s_blocksize);
+ return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
+}
+
+static inline struct buffer_head *
+sb_bread_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+{
+ return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
}
static inline void
--
1.7.9.5
A buffer-cache for superblock is disturbing page migration,
because the buffer-cache is not released until unmount.
The buffer-cache must be allocated from non-movable area.
Signed-off-by: Gioh Kim <[email protected]>
---
fs/ext4/super.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32b43ad..13d7770 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3435,7 +3435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
logical_sb_block = sb_block;
}
- if (!(bh = sb_bread(sb, logical_sb_block))) {
+ if (!(bh = sb_bread_gfp(sb, logical_sb_block, 0))) {
ext4_msg(sb, KERN_ERR, "unable to read superblock");
goto out_fail;
}
@@ -3645,7 +3645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
- bh = sb_bread(sb, logical_sb_block);
+ bh = sb_bread_gfp(sb, logical_sb_block, 0);
if (!bh) {
ext4_msg(sb, KERN_ERR,
"Can't read superblock on 2nd try");
@@ -3867,7 +3867,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sbi->s_group_desc[i] = sb_bread(sb, block);
+ sbi->s_group_desc[i] = sb_bread_gfp(sb, block, 0);
if (!sbi->s_group_desc[i]) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
--
1.7.9.5
A long-lasting buffer-cache can distrub page migration so that
it must be allocated from non-movable area.
The journal_init_inode is creating a buffer-cache for superblock journaling.
The superblock exists until system shutdown so that the buffer-cache
for the superblock would also exist for a long time
and it can distrub page migration.
This patch make the buffer-cache be allocated from non-movable area
not to distrub page migration.
Signed-off-by: Gioh Kim <[email protected]>
---
fs/jbd/journal.c | 2 +-
fs/jbd2/journal.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 06fe11e..a9f8aaf 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode)
goto out_err;
}
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ bh = __getblk_gfp(journal->j_dev, blocknr, journal->j_blocksize, 0);
if (!bh) {
printk(KERN_ERR
"%s: Cannot get buffer for journal superblock\n",
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e30..0461998 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
goto out_err;
}
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ bh = __getblk_gfp(journal->j_dev, blocknr, journal->j_blocksize, 0);
if (!bh) {
printk(KERN_ERR
"%s: Cannot get buffer for journal superblock\n",
--
1.7.9.5
Hello,
On Tue 19-08-14 15:52:38, Gioh Kim wrote:
> A buffer cache is allocated from movable area
> because it is referred for a while and released soon.
> But some filesystems are taking buffer cache for a long time
> and it can disturb page migration.
>
> A new API should be introduced to allocate buffer cache
> with user specific flag.
> For instance if user set flag to zero, buffer cache is allocated from
> non-movable area.
>
> Signed-off-by: Gioh Kim <[email protected]>
> ---
> fs/buffer.c | 52 +++++++++++++++++++++++++++++--------------
> include/linux/buffer_head.h | 12 +++++++++-
> 2 files changed, 46 insertions(+), 18 deletions(-)
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 8f05111..14f2f21 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
> */
> static int
> grow_dev_page(struct block_device *bdev, sector_t block,
> - pgoff_t index, int size, int sizebits)
> + pgoff_t index, int size, int sizebits, gfp_t gfp)
> {
> struct inode *inode = bdev->bd_inode;
> struct page *page;
> @@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
> int ret = 0; /* Will call free_more_memory() */
> gfp_t gfp_mask;
>
> - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> - gfp_mask |= __GFP_MOVABLE;
> + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
> +
Hum, it seems a bit misleading that the 'gfp' flags are just or-ed to
mapping_gfp_mask(inode->i_mapping). Usually, passed gfp mask is just
directly used. There are also interfaces like pagecache_get_page() which
play more complex tricks with mapping_gfp_mask(). This would be yet another
convention which I don't think is desirable. I know Andrew suggested what
you wrote so I guess I have to settle this with him. Andrew?
> /*
> - * XXX: __getblk_slow() can not really deal with failure and
> + * XXX: __getblk_gfp() can not really deal with failure and
> * will endlessly loop on improvised global reclaim. Prefer
> * looping in the allocator rather than here, at least that
> * code knows what it's doing.
> @@ -1058,7 +1058,7 @@ failed:
> * that page was dirty, the buffers are set dirty also.
> */
> static int
> -grow_buffers(struct block_device *bdev, sector_t block, int size)
> +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
> {
> pgoff_t index;
> int sizebits;
> @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
> }
>
> /* Create a page with the proper size buffers.. */
> - return grow_dev_page(bdev, block, index, size, sizebits);
> + return grow_dev_page(bdev, block, index, size, sizebits, gfp);
> }
>
> -static struct buffer_head *
> -__getblk_slow(struct block_device *bdev, sector_t block, int size)
> +struct buffer_head *
> +__getblk_gfp(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp)
> {
> /* Size must be multiple of hard sectorsize */
> if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
> @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
> if (bh)
> return bh;
>
> - ret = grow_buffers(bdev, block, size);
> + ret = grow_buffers(bdev, block, size, gfp);
> if (ret < 0)
> return NULL;
> if (ret == 0)
> free_more_memory();
> }
> }
> +EXPORT_SYMBOL(__getblk_gfp);
>
> /*
> * The relationship between dirty buffers and dirty pages:
> @@ -1381,12 +1383,7 @@ EXPORT_SYMBOL(__find_get_block);
> struct buffer_head *
> __getblk(struct block_device *bdev, sector_t block, unsigned size)
> {
> - struct buffer_head *bh = __find_get_block(bdev, block, size);
> -
> - might_sleep();
> - if (bh == NULL)
> - bh = __getblk_slow(bdev, block, size);
> - return bh;
> + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
> }
> EXPORT_SYMBOL(__getblk);
Why did you remove the __find_get_block() call? That looks like a bug.
> @@ -1410,18 +1407,39 @@ EXPORT_SYMBOL(__breadahead);
> * @size: size (in bytes) to read
> *
> * Reads a specified block, and returns buffer head that contains it.
> + * The page cache is allocated from movable area so that it can be migrated.
> * It returns NULL if the block was unreadable.
> */
> struct buffer_head *
> __bread(struct block_device *bdev, sector_t block, unsigned size)
> {
> - struct buffer_head *bh = __getblk(bdev, block, size);
> + return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
> +}
> +EXPORT_SYMBOL(__bread);
> +
> +/**
> + * __bread_gfp() - reads a specified block and returns the bh
> + * @bdev: the block_device to read from
> + * @block: number of block
> + * @size: size (in bytes) to read
> + * @gfp: page allocation flag
> + *
> + * Reads a specified block, and returns buffer head that contains it.
> + * The page cache can be allocated from non-movable area
> + * not to prevent page migration if you set gfp to zero.
> + * It returns NULL if the block was unreadable.
> + */
> +struct buffer_head *
> +__bread_gfp(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp)
> +{
> + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
>
> if (likely(bh) && !buffer_uptodate(bh))
> bh = __bread_slow(bh);
> return bh;
> }
> -EXPORT_SYMBOL(__bread);
> +EXPORT_SYMBOL(__bread_gfp);
>
> /*
> * invalidate_bh_lrus() is called rarely - but not only at unmount.
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index 324329c..a1d73fd 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -177,10 +177,14 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
> unsigned size);
> struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
> unsigned size);
> +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
> + unsigned size, gfp_t gfp);
> void __brelse(struct buffer_head *);
> void __bforget(struct buffer_head *);
> void __breadahead(struct block_device *, sector_t block, unsigned int size);
> struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
> +struct buffer_head *__bread_gfp(struct block_device *,
> + sector_t block, unsigned size, gfp_t gfp);
> void invalidate_bh_lrus(void);
> struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
> void free_buffer_head(struct buffer_head * bh);
> @@ -295,7 +299,13 @@ static inline void bforget(struct buffer_head *bh)
> static inline struct buffer_head *
> sb_bread(struct super_block *sb, sector_t block)
> {
> - return __bread(sb->s_bdev, block, sb->s_blocksize);
> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
> +}
> +
> +static inline struct buffer_head *
> +sb_bread_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
> +{
> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
> }
I think Andrew was suggesting to provide sb_bread_unmovable() and
sb_getblk_unmovable() which would set appropriately. It is then more
obvious what are filesystems trying to do when using those functions...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
2014-08-19 오후 10:03, Jan Kara 쓴 글:
> Hello,
>
> On Tue 19-08-14 15:52:38, Gioh Kim wrote:
>> A buffer cache is allocated from movable area
>> because it is referred for a while and released soon.
>> But some filesystems are taking buffer cache for a long time
>> and it can disturb page migration.
>>
>> A new API should be introduced to allocate buffer cache
>> with user specific flag.
>> For instance if user set flag to zero, buffer cache is allocated from
>> non-movable area.
>>
>> Signed-off-by: Gioh Kim <[email protected]>
>> ---
>> fs/buffer.c | 52 +++++++++++++++++++++++++++++--------------
>> include/linux/buffer_head.h | 12 +++++++++-
>> 2 files changed, 46 insertions(+), 18 deletions(-)
>>
>> diff --git a/fs/buffer.c b/fs/buffer.c
>> index 8f05111..14f2f21 100644
>> --- a/fs/buffer.c
>> +++ b/fs/buffer.c
>> @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
>> */
>> static int
>> grow_dev_page(struct block_device *bdev, sector_t block,
>> - pgoff_t index, int size, int sizebits)
>> + pgoff_t index, int size, int sizebits, gfp_t gfp)
>> {
>> struct inode *inode = bdev->bd_inode;
>> struct page *page;
>> @@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
>> int ret = 0; /* Will call free_more_memory() */
>> gfp_t gfp_mask;
>>
>> - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
>> - gfp_mask |= __GFP_MOVABLE;
>> + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
>> +
> Hum, it seems a bit misleading that the 'gfp' flags are just or-ed to
> mapping_gfp_mask(inode->i_mapping). Usually, passed gfp mask is just
> directly used. There are also interfaces like pagecache_get_page() which
> play more complex tricks with mapping_gfp_mask(). This would be yet another
> convention which I don't think is desirable. I know Andrew suggested what
> you wrote so I guess I have to settle this with him. Andrew?
I don't know mapping_gfp_mask(). I just add gfp at the original code.
Whould you tell me why it is undesirable?
>
>> /*
>> - * XXX: __getblk_slow() can not really deal with failure and
>> + * XXX: __getblk_gfp() can not really deal with failure and
>> * will endlessly loop on improvised global reclaim. Prefer
>> * looping in the allocator rather than here, at least that
>> * code knows what it's doing.
>> @@ -1058,7 +1058,7 @@ failed:
>> * that page was dirty, the buffers are set dirty also.
>> */
>> static int
>> -grow_buffers(struct block_device *bdev, sector_t block, int size)
>> +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
>> {
>> pgoff_t index;
>> int sizebits;
>> @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
>> }
>>
>> /* Create a page with the proper size buffers.. */
>> - return grow_dev_page(bdev, block, index, size, sizebits);
>> + return grow_dev_page(bdev, block, index, size, sizebits, gfp);
>> }
>>
>> -static struct buffer_head *
>> -__getblk_slow(struct block_device *bdev, sector_t block, int size)
>> +struct buffer_head *
>> +__getblk_gfp(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp)
>> {
>> /* Size must be multiple of hard sectorsize */
>> if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
>> @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
>> if (bh)
>> return bh;
>>
>> - ret = grow_buffers(bdev, block, size);
>> + ret = grow_buffers(bdev, block, size, gfp);
>> if (ret < 0)
>> return NULL;
>> if (ret == 0)
>> free_more_memory();
>> }
>> }
>> +EXPORT_SYMBOL(__getblk_gfp);
>>
>> /*
>> * The relationship between dirty buffers and dirty pages:
>> @@ -1381,12 +1383,7 @@ EXPORT_SYMBOL(__find_get_block);
>> struct buffer_head *
>> __getblk(struct block_device *bdev, sector_t block, unsigned size)
>> {
>> - struct buffer_head *bh = __find_get_block(bdev, block, size);
>> -
>> - might_sleep();
>> - if (bh == NULL)
>> - bh = __getblk_slow(bdev, block, size);
>> - return bh;
>> + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
>> }
>> EXPORT_SYMBOL(__getblk);
> Why did you remove the __find_get_block() call? That looks like a bug.
>
>> @@ -1410,18 +1407,39 @@ EXPORT_SYMBOL(__breadahead);
>> * @size: size (in bytes) to read
>> *
>> * Reads a specified block, and returns buffer head that contains it.
>> + * The page cache is allocated from movable area so that it can be migrated.
>> * It returns NULL if the block was unreadable.
>> */
>> struct buffer_head *
>> __bread(struct block_device *bdev, sector_t block, unsigned size)
>> {
>> - struct buffer_head *bh = __getblk(bdev, block, size);
>> + return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
>> +}
>> +EXPORT_SYMBOL(__bread);
>> +
>> +/**
>> + * __bread_gfp() - reads a specified block and returns the bh
>> + * @bdev: the block_device to read from
>> + * @block: number of block
>> + * @size: size (in bytes) to read
>> + * @gfp: page allocation flag
>> + *
>> + * Reads a specified block, and returns buffer head that contains it.
>> + * The page cache can be allocated from non-movable area
>> + * not to prevent page migration if you set gfp to zero.
>> + * It returns NULL if the block was unreadable.
>> + */
>> +struct buffer_head *
>> +__bread_gfp(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp)
>> +{
>> + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
>>
>> if (likely(bh) && !buffer_uptodate(bh))
>> bh = __bread_slow(bh);
>> return bh;
>> }
>> -EXPORT_SYMBOL(__bread);
>> +EXPORT_SYMBOL(__bread_gfp);
>>
>> /*
>> * invalidate_bh_lrus() is called rarely - but not only at unmount.
>> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
>> index 324329c..a1d73fd 100644
>> --- a/include/linux/buffer_head.h
>> +++ b/include/linux/buffer_head.h
>> @@ -177,10 +177,14 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
>> unsigned size);
>> struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
>> unsigned size);
>> +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
>> + unsigned size, gfp_t gfp);
>> void __brelse(struct buffer_head *);
>> void __bforget(struct buffer_head *);
>> void __breadahead(struct block_device *, sector_t block, unsigned int size);
>> struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
>> +struct buffer_head *__bread_gfp(struct block_device *,
>> + sector_t block, unsigned size, gfp_t gfp);
>> void invalidate_bh_lrus(void);
>> struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
>> void free_buffer_head(struct buffer_head * bh);
>> @@ -295,7 +299,13 @@ static inline void bforget(struct buffer_head *bh)
>> static inline struct buffer_head *
>> sb_bread(struct super_block *sb, sector_t block)
>> {
>> - return __bread(sb->s_bdev, block, sb->s_blocksize);
>> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
>> +}
>> +
>> +static inline struct buffer_head *
>> +sb_bread_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
>> +{
>> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
>> }
> I think Andrew was suggesting to provide sb_bread_unmovable() and
> sb_getblk_unmovable() which would set appropriately. It is then more
> obvious what are filesystems trying to do when using those functions...
I think the common interface is important.
If sb_getblk_unmovable() is obvious for the filesystem,
I will add some codes for getblk_unmovable() which calling __getblk_gfp(),
and sb_bread_unmovable() calling __bread_gfp().
If so, sb_bread_gfp is not necessary.
It might be like followings:
diff --git a/fs/buffer.c b/fs/buffer.c
index 14f2f21..35caf77 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1088,7 +1088,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int siz
return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
-struct buffer_head *
+static struct buffer_head *
__getblk_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
@@ -1119,7 +1119,13 @@ __getblk_gfp(struct block_device *bdev, sector_t block,
free_more_memory();
}
}
-EXPORT_SYMBOL(__getblk_gfp);
+
+struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
+ unsigned size)
+{
+ return __getblk_gfp(bdev, block, size, 0);
+}
+EXPORT_SYMBOL(getblk_unmovable);
/*
* The relationship between dirty buffers and dirty pages:
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index a1d73fd..c5fb4fc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -177,8 +177,8 @@ struct buffer_head *__find_get_block(struct block_device *bdev, s
unsigned size);
struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
unsigned size);
-struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp);
+struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
+ unsigned size);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
@@ -303,9 +303,9 @@ sb_bread(struct super_block *sb, sector_t block)
}
static inline struct buffer_head *
-sb_bread_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+sb_bread_unmovable(struct super_block *sb, sector_t block)
{
- return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
+ return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
}
static inline void
Is it better?
Thank you for your advice.
On Wed 20-08-14 08:37:07, Gioh Kim wrote:
>
>
> 2014-08-19 오후 10:03, Jan Kara 쓴 글:
> > Hello,
> >
> >On Tue 19-08-14 15:52:38, Gioh Kim wrote:
> >>A buffer cache is allocated from movable area
> >>because it is referred for a while and released soon.
> >>But some filesystems are taking buffer cache for a long time
> >>and it can disturb page migration.
> >>
> >>A new API should be introduced to allocate buffer cache
> >>with user specific flag.
> >>For instance if user set flag to zero, buffer cache is allocated from
> >>non-movable area.
> >>
> >>Signed-off-by: Gioh Kim <[email protected]>
> >>---
> >> fs/buffer.c | 52 +++++++++++++++++++++++++++++--------------
> >> include/linux/buffer_head.h | 12 +++++++++-
> >> 2 files changed, 46 insertions(+), 18 deletions(-)
> >>
> >>diff --git a/fs/buffer.c b/fs/buffer.c
> >>index 8f05111..14f2f21 100644
> >>--- a/fs/buffer.c
> >>+++ b/fs/buffer.c
> >>@@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
> >> */
> >> static int
> >> grow_dev_page(struct block_device *bdev, sector_t block,
> >>- pgoff_t index, int size, int sizebits)
> >>+ pgoff_t index, int size, int sizebits, gfp_t gfp)
> >> {
> >> struct inode *inode = bdev->bd_inode;
> >> struct page *page;
> >>@@ -1002,10 +1002,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
> >> int ret = 0; /* Will call free_more_memory() */
> >> gfp_t gfp_mask;
> >>
> >>- gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> >>- gfp_mask |= __GFP_MOVABLE;
> >>+ gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
> >>+
> > Hum, it seems a bit misleading that the 'gfp' flags are just or-ed to
> >mapping_gfp_mask(inode->i_mapping). Usually, passed gfp mask is just
> >directly used. There are also interfaces like pagecache_get_page() which
> >play more complex tricks with mapping_gfp_mask(). This would be yet another
> >convention which I don't think is desirable. I know Andrew suggested what
> >you wrote so I guess I have to settle this with him. Andrew?
>
> I don't know mapping_gfp_mask(). I just add gfp at the original code.
> Whould you tell me why it is undesirable?
Well, it's not that mapping_gfp_mask() would be undesirable. It's that
the interface where you pass in gfp mask but it gets silently combined with
another gfp mask seems a bit error prone to me. So would prefer
grow_dev_page() to just use the gfp mask passed and then do something like:
struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
unsigned size)
{
return __getblk_gfp(bdev, block, size,
mapping_gfp_mask(bdev->bd_inode->i_mapping));
}
And similarly in getblk() and other places. But before you go and do this,
I'd like Andrew to say what he thinks about it because maybe he had a good
reason why he wanted it the way you've implemented it.
> >>@@ -1381,12 +1383,7 @@ EXPORT_SYMBOL(__find_get_block);
> >> struct buffer_head *
> >> __getblk(struct block_device *bdev, sector_t block, unsigned size)
> >> {
> >>- struct buffer_head *bh = __find_get_block(bdev, block, size);
> >>-
> >>- might_sleep();
> >>- if (bh == NULL)
> >>- bh = __getblk_slow(bdev, block, size);
> >>- return bh;
> >>+ return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
> >> }
> >> EXPORT_SYMBOL(__getblk);
> > Why did you remove the __find_get_block() call? That looks like a bug.
I'm not sure if you didn't miss this comment....
> I think the common interface is important.
>
> If sb_getblk_unmovable() is obvious for the filesystem,
> I will add some codes for getblk_unmovable() which calling __getblk_gfp(),
> and sb_bread_unmovable() calling __bread_gfp().
> If so, sb_bread_gfp is not necessary.
>
> It might be like followings:
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 14f2f21..35caf77 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -1088,7 +1088,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int siz
> return grow_dev_page(bdev, block, index, size, sizebits, gfp);
> }
>
> -struct buffer_head *
> +static struct buffer_head *
> __getblk_gfp(struct block_device *bdev, sector_t block,
> unsigned size, gfp_t gfp)
> {
> @@ -1119,7 +1119,13 @@ __getblk_gfp(struct block_device *bdev, sector_t block,
> free_more_memory();
> }
> }
> -EXPORT_SYMBOL(__getblk_gfp);
> +
> +struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
> + unsigned size)
> +{
> + return __getblk_gfp(bdev, block, size, 0);
> +}
> +EXPORT_SYMBOL(getblk_unmovable);
>
> /*
> * The relationship between dirty buffers and dirty pages:
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index a1d73fd..c5fb4fc 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -177,8 +177,8 @@ struct buffer_head *__find_get_block(struct block_device *bdev, s
> unsigned size);
> struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
> unsigned size);
> -struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
> - unsigned size, gfp_t gfp);
> +struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block,
> + unsigned size);
> void __brelse(struct buffer_head *);
> void __bforget(struct buffer_head *);
> void __breadahead(struct block_device *, sector_t block, unsigned int size);
> @@ -303,9 +303,9 @@ sb_bread(struct super_block *sb, sector_t block)
> }
>
> static inline struct buffer_head *
> -sb_bread_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
> +sb_bread_unmovable(struct super_block *sb, sector_t block)
> {
> - return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
> + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
> }
>
> Is it better?
Yes, this is what I meant.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
>>>> @@ -1381,12 +1383,7 @@ EXPORT_SYMBOL(__find_get_block);
>>>> struct buffer_head *
>>>> __getblk(struct block_device *bdev, sector_t block, unsigned size)
>>>> {
>>>> - struct buffer_head *bh = __find_get_block(bdev, block, size);
>>>> -
>>>> - might_sleep();
>>>> - if (bh == NULL)
>>>> - bh = __getblk_slow(bdev, block, size);
>>>> - return bh;
>>>> + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
>>>> }
>>>> EXPORT_SYMBOL(__getblk);
>>> Why did you remove the __find_get_block() call? That looks like a bug.
> I'm not sure if you didn't miss this comment....
I'm sorry I missed it.
I think calling __find_get_block() in __getblk_gfp() can replace it.
I'm not sure about it.
If anybody disagree with it, I'll change it as the original code.
On Wed 20-08-14 11:38:10, Gioh Kim wrote:
>
> >>>>@@ -1381,12 +1383,7 @@ EXPORT_SYMBOL(__find_get_block);
> >>>> struct buffer_head *
> >>>> __getblk(struct block_device *bdev, sector_t block, unsigned size)
> >>>> {
> >>>>- struct buffer_head *bh = __find_get_block(bdev, block, size);
> >>>>-
> >>>>- might_sleep();
> >>>>- if (bh == NULL)
> >>>>- bh = __getblk_slow(bdev, block, size);
> >>>>- return bh;
> >>>>+ return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
> >>>> }
> >>>> EXPORT_SYMBOL(__getblk);
> >>> Why did you remove the __find_get_block() call? That looks like a bug.
> > I'm not sure if you didn't miss this comment....
>
> I'm sorry I missed it.
> I think calling __find_get_block() in __getblk_gfp() can replace it.
> I'm not sure about it.
>
> If anybody disagree with it, I'll change it as the original code.
OK, I see. Thanks for explanation. I agree we can remove
__find_get_block() from __getblk() but please make this change a separate
patch and also please put the might_sleep() check __getblk_gfp().
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
2014-08-21 오전 7:02, Jan Kara 쓴 글:
> On Wed 20-08-14 11:38:10, Gioh Kim wrote:
>>
>>>>>> @@ -1381,12 +1383,7 @@ EXPORT_SYMBOL(__find_get_block);
>>>>>> struct buffer_head *
>>>>>> __getblk(struct block_device *bdev, sector_t block, unsigned size)
>>>>>> {
>>>>>> - struct buffer_head *bh = __find_get_block(bdev, block, size);
>>>>>> -
>>>>>> - might_sleep();
>>>>>> - if (bh == NULL)
>>>>>> - bh = __getblk_slow(bdev, block, size);
>>>>>> - return bh;
>>>>>> + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
>>>>>> }
>>>>>> EXPORT_SYMBOL(__getblk);
>>>>> Why did you remove the __find_get_block() call? That looks like a bug.
>>> I'm not sure if you didn't miss this comment....
>>
>> I'm sorry I missed it.
>> I think calling __find_get_block() in __getblk_gfp() can replace it.
>> I'm not sure about it.
>>
>> If anybody disagree with it, I'll change it as the original code.
> OK, I see. Thanks for explanation. I agree we can remove
> __find_get_block() from __getblk() but please make this change a separate
> patch and also please put the might_sleep() check __getblk_gfp().
>
> Honza
>
I got it.
I'm going to report v3 patch that reverts __getblk() and adds sb_bread_unmovable(),
if Andrew give me a feedback about the other codes.
I can wait ;-)