Support in this function for large folios is limited to supporting
filesystems with block size > PAGE_SIZE. This new functionality will only
be supported on machines without HIGHMEM, so the problem of kmap_local
only being able to map a single page in the folio can be ignored.
We will not use large folios for ext2 directories on HIGHMEM machines.
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/ext2/dir.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index b335f17f682f..03867381eec2 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -96,19 +96,19 @@ static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
unlock_page(page);
}
-static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
+static bool ext2_check_folio(struct folio *folio, int quiet, char *kaddr)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = folio->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned chunk_size = ext2_chunk_size(dir);
u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
+ unsigned limit = folio_size(folio);
ext2_dirent *p;
char *error;
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
+ if (dir->i_size < folio_pos(folio) + limit) {
+ limit = offset_in_folio(folio, dir->i_size);
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -132,7 +132,7 @@ static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
if (offs != limit)
goto Eend;
out:
- SetPageChecked(page);
+ folio_set_checked(folio);
return true;
/* Too bad, we had an error */
@@ -160,22 +160,22 @@ static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
bad_entry:
if (!quiet)
ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
+ "offset=%llu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, folio_pos(folio) + offs,
(unsigned long) le32_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
Eend:
if (!quiet) {
p = (ext2_dirent *)(kaddr + offs);
- ext2_error(sb, "ext2_check_page",
+ ext2_error(sb, "ext2_check_folio",
"entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
+ "offset=%llu, inode=%lu",
+ dir->i_ino, folio_pos(folio) + offs,
(unsigned long) le32_to_cpu(p->inode));
}
fail:
- SetPageError(page);
+ folio_set_error(folio);
return false;
}
@@ -195,9 +195,9 @@ static void *ext2_get_page(struct inode *dir, unsigned long n,
if (IS_ERR(folio))
return ERR_CAST(folio);
- page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
+ page_addr = kmap_local_folio(folio, 0);
if (unlikely(!folio_test_checked(folio))) {
- if (!ext2_check_page(&folio->page, quiet, page_addr))
+ if (!ext2_check_folio(folio, quiet, page_addr))
goto fail;
}
*page = &folio->page;
--
2.40.1
On Thu 21-09-23 21:07:39, Matthew Wilcox (Oracle) wrote:
> Support in this function for large folios is limited to supporting
> filesystems with block size > PAGE_SIZE. This new functionality will only
> be supported on machines without HIGHMEM, so the problem of kmap_local
> only being able to map a single page in the folio can be ignored.
> We will not use large folios for ext2 directories on HIGHMEM machines.
OK, but can we perhaps enforce this with some checks & error messages
instead of a silent failure? Like:
#ifdef CONFIG_HIGHMEM
if (sb->s_blocksize > PAGE_SIZE)
bail with error
#endif
somewhere in ext2_fill_super()? Or maybe force allocation of lowmem pages
when blocksize > PAGE_SIZE?
Honza
>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> ---
> fs/ext2/dir.c | 28 ++++++++++++++--------------
> 1 file changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
> index b335f17f682f..03867381eec2 100644
> --- a/fs/ext2/dir.c
> +++ b/fs/ext2/dir.c
> @@ -96,19 +96,19 @@ static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
> unlock_page(page);
> }
>
> -static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
> +static bool ext2_check_folio(struct folio *folio, int quiet, char *kaddr)
> {
> - struct inode *dir = page->mapping->host;
> + struct inode *dir = folio->mapping->host;
> struct super_block *sb = dir->i_sb;
> unsigned chunk_size = ext2_chunk_size(dir);
> u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
> unsigned offs, rec_len;
> - unsigned limit = PAGE_SIZE;
> + unsigned limit = folio_size(folio);
> ext2_dirent *p;
> char *error;
>
> - if ((dir->i_size >> PAGE_SHIFT) == page->index) {
> - limit = dir->i_size & ~PAGE_MASK;
> + if (dir->i_size < folio_pos(folio) + limit) {
> + limit = offset_in_folio(folio, dir->i_size);
> if (limit & (chunk_size - 1))
> goto Ebadsize;
> if (!limit)
> @@ -132,7 +132,7 @@ static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
> if (offs != limit)
> goto Eend;
> out:
> - SetPageChecked(page);
> + folio_set_checked(folio);
> return true;
>
> /* Too bad, we had an error */
> @@ -160,22 +160,22 @@ static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
> bad_entry:
> if (!quiet)
> ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
> - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
> - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
> + "offset=%llu, inode=%lu, rec_len=%d, name_len=%d",
> + dir->i_ino, error, folio_pos(folio) + offs,
> (unsigned long) le32_to_cpu(p->inode),
> rec_len, p->name_len);
> goto fail;
> Eend:
> if (!quiet) {
> p = (ext2_dirent *)(kaddr + offs);
> - ext2_error(sb, "ext2_check_page",
> + ext2_error(sb, "ext2_check_folio",
> "entry in directory #%lu spans the page boundary"
> - "offset=%lu, inode=%lu",
> - dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
> + "offset=%llu, inode=%lu",
> + dir->i_ino, folio_pos(folio) + offs,
> (unsigned long) le32_to_cpu(p->inode));
> }
> fail:
> - SetPageError(page);
> + folio_set_error(folio);
> return false;
> }
>
> @@ -195,9 +195,9 @@ static void *ext2_get_page(struct inode *dir, unsigned long n,
>
> if (IS_ERR(folio))
> return ERR_CAST(folio);
> - page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
> + page_addr = kmap_local_folio(folio, 0);
> if (unlikely(!folio_test_checked(folio))) {
> - if (!ext2_check_page(&folio->page, quiet, page_addr))
> + if (!ext2_check_folio(folio, quiet, page_addr))
> goto fail;
> }
> *page = &folio->page;
> --
> 2.40.1
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue 03-10-23 12:40:17, Jan Kara wrote:
> On Thu 21-09-23 21:07:39, Matthew Wilcox (Oracle) wrote:
> > Support in this function for large folios is limited to supporting
> > filesystems with block size > PAGE_SIZE. This new functionality will only
> > be supported on machines without HIGHMEM, so the problem of kmap_local
> > only being able to map a single page in the folio can be ignored.
> > We will not use large folios for ext2 directories on HIGHMEM machines.
>
> OK, but can we perhaps enforce this with some checks & error messages
> instead of a silent failure? Like:
>
> #ifdef CONFIG_HIGHMEM
> if (sb->s_blocksize > PAGE_SIZE)
> bail with error
> #endif
>
> somewhere in ext2_fill_super()? Or maybe force allocation of lowmem pages
> when blocksize > PAGE_SIZE?
>
> > @@ -195,9 +195,9 @@ static void *ext2_get_page(struct inode *dir, unsigned long n,
> >
> > if (IS_ERR(folio))
> > return ERR_CAST(folio);
> > - page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
> > + page_addr = kmap_local_folio(folio, 0);
Oh, and I think this change breaks the code whenever we get back higher
order folio because the page_addr we get back is not for the page index
'n'.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 21-09-23 21:07:39, Matthew Wilcox (Oracle) wrote:
> Support in this function for large folios is limited to supporting
> filesystems with block size > PAGE_SIZE. This new functionality will only
> be supported on machines without HIGHMEM, so the problem of kmap_local
> only being able to map a single page in the folio can be ignored.
> We will not use large folios for ext2 directories on HIGHMEM machines.
A look like a fool replying to the same patch multiple times ;) but how is
this supposed to work even without HIGHMEM? Suppose we have a page size 4k
and block size 16k. Directory entries in a block can then straddle 4k
boundary so if kmap_local() is mapping only a single page, then we are
going to have hard time parsing this all?
Oh, I guess you are pointing to the fact kmap_local_folio() gives you back
address usable for the whole folio access if !HIGHMEM. But then all the
iterations (e.g. in ext2_readdir()) has to be folio-size based and not
page-size based as they currently are? You didn't change these iterations
in your patches which has confused me...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, Oct 03, 2023 at 12:52:24PM +0200, Jan Kara wrote:
> On Tue 03-10-23 12:40:17, Jan Kara wrote:
> > On Thu 21-09-23 21:07:39, Matthew Wilcox (Oracle) wrote:
> > > Support in this function for large folios is limited to supporting
> > > filesystems with block size > PAGE_SIZE. This new functionality will only
> > > be supported on machines without HIGHMEM, so the problem of kmap_local
> > > only being able to map a single page in the folio can be ignored.
> > > We will not use large folios for ext2 directories on HIGHMEM machines.
> >
> > OK, but can we perhaps enforce this with some checks & error messages
> > instead of a silent failure? Like:
> >
> > #ifdef CONFIG_HIGHMEM
> > if (sb->s_blocksize > PAGE_SIZE)
> > bail with error
> > #endif
> >
> > somewhere in ext2_fill_super()? Or maybe force allocation of lowmem pages
> > when blocksize > PAGE_SIZE?
> >
> > > @@ -195,9 +195,9 @@ static void *ext2_get_page(struct inode *dir, unsigned long n,
> > >
> > > if (IS_ERR(folio))
> > > return ERR_CAST(folio);
> > > - page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
> > > + page_addr = kmap_local_folio(folio, 0);
>
> Oh, and I think this change breaks the code whenever we get back higher
> order folio because the page_addr we get back is not for the page index
> 'n'.
> A look like a fool replying to the same patch multiple times ;) but how is
> this supposed to work even without HIGHMEM? Suppose we have a page size 4k
> and block size 16k. Directory entries in a block can then straddle 4k
> boundary so if kmap_local() is mapping only a single page, then we are
> going to have hard time parsing this all?
>
> Oh, I guess you are pointing to the fact kmap_local_folio() gives you back
> address usable for the whole folio access if !HIGHMEM. But then all the
> iterations (e.g. in ext2_readdir()) has to be folio-size based and not
> page-size based as they currently are? You didn't change these iterations
> in your patches which has confused me...
I'm going to reply to all three emails as one ;-)
The goal for this patchset is simply to do the folio conversion.
Earlier work was focused on "make sure everything behaves as it did
before", but now I've started to give some thought to "How will this
work" without actually doing the higher level work.
Today, ext2 does not permit large folios to be used for directory
inodes. Whoever adds the call to mapping_set_large_folios() gets the
joy of finding all the places that contain assumptions and fixing them.
I don't particularly want to do that myself; I'm just trying to sort
out the underpinnings so that whoever does it has a somewhat sane
API to work against.
So I'm really just trying to do two things here:
- Convert uses of struct page -> struct folio
- Not leave any landmines for whoever comes after me
I had a quick look at converting ext2_readdir() to work for
large folios. It's a bit intrusive, and I haven't done any
serious testing (let alone with the bs>PS patches). But it could
look something like this:
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 6807df637112..5eeb57ce6a18 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -257,45 +257,45 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
static int
ext2_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
+ loff_t pos = ctx->pos;
+ loff_t isize = i_size_read(inode);
struct super_block *sb = inode->i_sb;
- unsigned int offset = pos & ~PAGE_MASK;
- unsigned long n = pos >> PAGE_SHIFT;
- unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
bool has_filetype;
- if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
+ if (pos > isize - EXT2_DIR_REC_LEN(1))
return 0;
has_filetype =
EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
- for ( ; n < npages; n++, offset = 0) {
+ while (pos < isize) {
ext2_dirent *de;
struct folio *folio;
- char *kaddr = ext2_get_folio(inode, n, 0, &folio);
+ char *kaddr = ext2_get_folio(inode, pos / PAGE_SIZE, 0, &folio);
+ size_t offset = offset_in_folio(folio, pos);
char *limit;
if (IS_ERR(kaddr)) {
ext2_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- ctx->pos += PAGE_SIZE - offset;
+ ctx->pos += folio_size(folio) - offset;
return PTR_ERR(kaddr);
}
if (unlikely(need_revalidate)) {
if (offset) {
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
- ctx->pos = (n<<PAGE_SHIFT) + offset;
+ ctx->pos = folio_pos(folio) + offset;
}
file->f_version = inode_query_iversion(inode);
need_revalidate = false;
}
de = (ext2_dirent *)(kaddr+offset);
- limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1);
+ limit = kaddr + ext2_last_byte(inode, folio->index) -
+ EXT2_DIR_REC_LEN(1);
for ( ;(char*)de <= limit; de = ext2_next_entry(de)) {
if (de->rec_len == 0) {
ext2_error(sb, __func__,
@@ -319,6 +319,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
ctx->pos += ext2_rec_len_from_disk(de->rec_len);
}
folio_release_kmap(folio, kaddr);
+ pos = (loff_t)folio_next_index(folio) * PAGE_SIZE;
}
return 0;
}
ext2_last_byte() is clearly now incorrect. It should probably take
isize and folio as inputs and return either the last byte in the folio
or the last byte in the file. But then we need to convert all the
callers, and I didn't want to do that for this demonstration patch.