Add an iomap_get_folio() helper that gets a folio reference based on
an iomap iterator and an offset into the address space. Use it in
iomap_write_begin().
Signed-off-by: Andreas Gruenbacher <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/iomap/buffered-io.c | 39 ++++++++++++++++++++++++++++++---------
include/linux/iomap.h | 1 +
2 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d4b444e44861..de4a8e5f721a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+/**
+ * iomap_get_folio - get a folio reference for writing
+ * @iter: iteration structure
+ * @pos: start offset of write
+ *
+ * Returns a locked reference to the folio at @pos, or an error pointer if the
+ * folio could not be obtained.
+ */
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+{
+ unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
+ struct folio *folio;
+
+ if (iter->flags & IOMAP_NOWAIT)
+ fgp |= FGP_NOWAIT;
+
+ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ fgp, mapping_gfp_mask(iter->inode->i_mapping));
+ if (folio)
+ return folio;
+
+ if (iter->flags & IOMAP_NOWAIT)
+ return ERR_PTR(-EAGAIN);
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(iomap_get_folio);
+
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
@@ -603,12 +630,8 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct folio *folio;
- unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
- if (iter->flags & IOMAP_NOWAIT)
- fgp |= FGP_NOWAIT;
-
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -625,12 +648,10 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
return status;
}
- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
- fgp, mapping_gfp_mask(iter->inode->i_mapping));
- if (!folio) {
- status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
+ folio = iomap_get_folio(iter, pos);
+ if (IS_ERR(folio)) {
__iomap_put_folio(iter, pos, 0, NULL);
- return status;
+ return PTR_ERR(folio);
}
/*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index ecf815b34d51..188d14e786a4 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,6 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
--
2.38.1
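For illustration, a minimal caller of the new helper might look like
this (a hypothetical sketch, not part of the patch; the error values
follow the helper above):

static int example_get_put(struct iomap_iter *iter, loff_t pos)
{
	struct folio *folio;

	folio = iomap_get_folio(iter, pos);
	if (IS_ERR(folio))
		return PTR_ERR(folio);	/* -EAGAIN under IOMAP_NOWAIT, else -ENOMEM */

	/* the folio comes back locked and with a reference held */
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}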
On Sun, Jan 08, 2023 at 08:40:28PM +0100, Andreas Gruenbacher wrote:
> [...]
>
> +/**
> + * iomap_get_folio - get a folio reference for writing
> + * @iter: iteration structure
> + * @pos: start offset of write
> + *
> + * Returns a locked reference to the folio at @pos, or an error pointer if the
> + * folio could not be obtained.
> + */
> +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> +{
> + unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
> + struct folio *folio;
> +
> + if (iter->flags & IOMAP_NOWAIT)
> + fgp |= FGP_NOWAIT;
> +
> + folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> + fgp, mapping_gfp_mask(iter->inode->i_mapping));
> + if (folio)
> + return folio;
> +
> + if (iter->flags & IOMAP_NOWAIT)
> + return ERR_PTR(-EAGAIN);
> + return ERR_PTR(-ENOMEM);
> +}
> +EXPORT_SYMBOL_GPL(iomap_get_folio);
Hmmmm.
This is where things start to get complex. I have sent a patch to
fix a problem with iomap_zero_range() failing to zero cached dirty
pages over UNWRITTEN extents, and that requires making FGP_CREAT
optional. This is an iomap bug, and needs to be fixed in the core
iomap code:
https://lore.kernel.org/linux-xfs/[email protected]/
Essentially, we need to pass fgp flags to iomap_write_begin() so
the callers can supply 0 or FGP_CREAT appropriately. This
allows iomap_write_begin() to act only on pre-cached pages rather
than always instantiating a new page if one does not exist in cache.
This allows iomap_write_begin() to return a NULL folio
successfully, and this is perfectly OK for callers that pass in fgp
= 0, as they are expected to handle a NULL folio return indicating
there was no cached data over the range...
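In sketch form, that convention would be roughly the following
(hypothetical extra argument; the actual patch is in the lore link
above):

	/*
	 * Hypothetical sketch: iomap_write_begin() grows an fgp argument.
	 * Passing 0 instead of FGP_CREAT means "only act on folios already
	 * in the page cache", so a NULL folio is a successful "nothing
	 * cached over this range" result rather than an error.
	 */
	status = iomap_write_begin(iter, pos, bytes, &folio, 0);
	if (status)
		return status;
	if (!folio) {
		/* no cached data here; skip to the next range */
		pos += bytes;
		length -= bytes;
		continue;
	}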
Exposing the folio allocation as an external interface makes bug
fixes like this rather messy - it's taking a core abstraction (iomap
hides all the folio and page cache manipulations from the
filesystem) and punching a big hole in it by requiring filesystems
to actually allocate page cache folios on behalf of the iomap
core.
Given that I recently got major push-back for fixing an XFS-only bug
by walking the page cache directly instead of abstracting it via the
iomap core, punching an even bigger hole in the abstraction layer to
fix a GFS2-only problem is just as bad....
-Dave.
--
Dave Chinner
[email protected]
On Sun, Jan 8, 2023 at 10:33 PM Dave Chinner <[email protected]> wrote:
> On Sun, Jan 08, 2023 at 08:40:28PM +0100, Andreas Gruenbacher wrote:
> > [...]
>
> Hmmmm.
>
> This is where things start to get complex. I have sent a patch to
> fix a problem with iomap_zero_range() failing to zero cached dirty
> pages over UNWRITTEN extents, and that requires making FGP_CREAT
> optional. This is an iomap bug, and needs to be fixed in the core
> iomap code:
>
> https://lore.kernel.org/linux-xfs/[email protected]/
>
> Essentially, we need to pass fgp flags to iomap_write_begin() so
> the callers can supply 0 or FGP_CREAT appropriately. This
> allows iomap_write_begin() to act only on pre-cached pages rather
> than always instantiating a new page if one does not exist in cache.
>
> This allows iomap_write_begin() to return a NULL folio
> successfully, and this is perfectly OK for callers that pass in fgp
> = 0, as they are expected to handle a NULL folio return indicating
> there was no cached data over the range...
>
> Exposing the folio allocation as an external interface makes bug
> fixes like this rather messy - it's taking a core abstraction (iomap
> hides all the folio and page cache manipulations from the
> filesystem) and punching a big hole in it by requiring filesystems
> to actually allocate page cache folios on behalf of the iomap
> core.
>
> Given that I recently got major push-back for fixing an XFS-only bug
> by walking the page cache directly instead of abstracting it via the
> iomap core, punching an even bigger hole in the abstraction layer to
> fix a GFS2-only problem is just as bad....
We can handle that by adding a new IOMAP_NOCREATE iterator flag and
checking for that in iomap_get_folio(). Your patch then turns into
the below.
Thanks,
Andreas
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index dacc7c80b20d..34b335a89527 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -470,6 +470,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
struct folio *folio;
+ if (!(iter->flags & IOMAP_NOCREATE))
+ fgp |= FGP_CREAT;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
@@ -478,6 +480,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
if (folio)
return folio;
+ if (iter->flags & IOMAP_NOCREATE)
+ return ERR_PTR(-ENODATA);
if (iter->flags & IOMAP_NOWAIT)
return ERR_PTR(-EAGAIN);
return ERR_PTR(-ENOMEM);
@@ -1162,8 +1166,12 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
loff_t written = 0;
/* already zeroed? we're done. */
- if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ if (srcmap->type == IOMAP_HOLE)
return length;
+ /* only do page cache lookups over unwritten extents */
+ iter->flags &= ~IOMAP_NOCREATE;
+ if (srcmap->type == IOMAP_UNWRITTEN)
+ iter->flags |= IOMAP_NOCREATE;
do {
struct folio *folio;
@@ -1172,8 +1180,19 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
size_t bytes = min_t(u64, SIZE_MAX, length);
status = iomap_write_begin(iter, pos, bytes, &folio);
- if (status)
+ if (status) {
+ if (status == -ENODATA) {
+ /*
+ * No folio was found, so skip to the start of
+ * the next potential entry in the page cache
+ * and continue from there.
+ */
+ if (bytes > PAGE_SIZE - offset_in_page(pos))
+ bytes = PAGE_SIZE - offset_in_page(pos);
+ goto loop_continue;
+ }
return status;
+ }
if (iter->iomap.flags & IOMAP_F_STALE)
break;
@@ -1181,6 +1200,19 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (bytes > folio_size(folio) - offset)
bytes = folio_size(folio) - offset;
+ /*
+ * If the folio over an unwritten extent is clean, then we
+ * aren't going to touch the data in it at all. We don't want to
+ * mark it dirty or change the uptodate state of data in the
+ * page, so we just unlock it and skip to the next range over
+ * the unwritten extent we need to check.
+ */
+ if (srcmap->type == IOMAP_UNWRITTEN &&
+ !folio_test_dirty(folio)) {
+ folio_unlock(folio);
+ goto loop_continue;
+ }
+
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);
@@ -1188,6 +1220,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (WARN_ON_ONCE(bytes == 0))
return -EIO;
+loop_continue:
pos += bytes;
length -= bytes;
written += bytes;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 515318dfbc38..87b9d9aba4bb 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -841,17 +841,7 @@ xfs_setattr_size(
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
- } else {
- /*
- * iomap won't detect a dirty page over an unwritten block (or a
- * cow block over a hole) and subsequently skips zeroing the
- * newly post-EOF portion of the page. Flush the new EOF to
- * convert the block before the pagecache truncate.
- */
- error = filemap_write_and_wait_range(inode->i_mapping, newsize,
- newsize);
- if (error)
- return error;
+ } else if (newsize != oldsize) {
error = xfs_truncate_page(ip, newsize, &did_zeroing);
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3e6c34b03c89..55f195866f00 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -164,6 +164,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+#define IOMAP_NOCREATE (1 << 9) /* look up folios without FGP_CREAT */
struct iomap_ops {
/*
--
2.38.1
On Mon, Jan 09, 2023 at 01:46:42PM +0100, Andreas Gruenbacher wrote:
> We can handle that by adding a new IOMAP_NOCREATE iterator flag and
> checking for that in iomap_get_folio(). Your patch then turns into
> the below.
Exactly. And as I already pointed out in reply to Dave's original
patch what we really should be doing is returning an ERR_PTR from
__filemap_get_folio instead of reverse-engineering the expected
error code.
The only big question is if we should apply Dave's patch first as a bug
fix before this series, and I suspect we should do that.
Am Di., 10. Jan. 2023 um 09:52 Uhr schrieb Christoph Hellwig
<[email protected]>:
> On Mon, Jan 09, 2023 at 01:46:42PM +0100, Andreas Gruenbacher wrote:
> > We can handle that by adding a new IOMAP_NOCREATE iterator flag and
> > checking for that in iomap_get_folio(). Your patch then turns into
> > the below.
>
> Exactly. And as I already pointed out in reply to Dave's original
> patch what we really should be doing is returning an ERR_PTR from
> __filemap_get_folio instead of reverse-engineering the expected
> error code.
>
> The only big question is if we should apply Dave's patch first as a bug
> fix before this series, and I suspect we should do that.
Sounds fine. I assume Dave is going to send an update.
Thanks,
Andreas
On Tue, Jan 10, 2023 at 12:46:45AM -0800, Christoph Hellwig wrote:
> On Mon, Jan 09, 2023 at 01:46:42PM +0100, Andreas Gruenbacher wrote:
> > We can handle that by adding a new IOMAP_NOCREATE iterator flag and
> > checking for that in iomap_get_folio(). Your patch then turns into
> > the below.
>
> Exactly. And as I already pointed out in reply to Dave's original
> patch what we really should be doing is returning an ERR_PTR from
> __filemap_get_folio instead of reverse-engineering the expected
> error code.
Ouch, we have a nasty problem.
If somebody passes FGP_ENTRY, we can return a shadow entry. And the
encodings for shadow entries overlap with the encodings for ERR_PTR,
meaning that some shadow entries will look like errors. The way I
solved this in the XArray code is by shifting the error values by
two bits and encoding errors as XA_ERROR(-ENOMEM) (for example).
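To make the overlap concrete, here is an illustrative sketch of the
encodings (not code from any patch in this thread):

	/*
	 * A shadow entry is an XArray value entry, i.e. bit 0 is set:
	 *	xa_mk_value(v)    == (void *)((v << 1) | 1)
	 * ERR_PTR() values with odd errnos share that bit pattern:
	 *	ERR_PTR(-ESRCH)   == (void *)-3      -- bit 0 set
	 * so pointer-based error decoding can mistake a shadow entry for
	 * an error.  XA_ERROR() sidesteps this by shifting the errno left
	 * by two bits and tagging the result as an internal entry (low
	 * bits 10):
	 *	XA_ERROR(-ENOMEM) == (void *)(((unsigned long)-ENOMEM << 2) | 2)
	 * which xa_is_err() / xa_err() can then decode unambiguously.
	 */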
I don't _object_ to introducing XA_ERROR() / xa_err() into the VFS,
but so far we haven't, and I'd like to make that decision intentionally.
On Tue, Jan 10, 2023 at 01:34:16PM +0000, Matthew Wilcox wrote:
> > Exactly. And as I already pointed out in reply to Dave's original
> > patch what we really should be doing is returning an ERR_PTR from
> > __filemap_get_folio instead of reverse-engineering the expected
> > error code.
>
> Ouch, we have a nasty problem.
>
> If somebody passes FGP_ENTRY, we can return a shadow entry. And the
> encodings for shadow entries overlap with the encodings for ERR_PTR,
> meaning that some shadow entries will look like errors. The way I
> solved this in the XArray code is by shifting the error values by
> two bits and encoding errors as XA_ERROR(-ENOMEM) (for example).
>
> I don't _object_ to introducing XA_ERROR() / xa_err() into the VFS,
> but so far we haven't, and I'd like to make that decision intentionally.
So what would be an alternative way to tell the callers why no folio
was found instead of trying to reverse engineer that? Return an errno
and the folio by reference? That would work, but the calling conventions
would be awful.
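For comparison, the by-reference variant being dismissed here would
look something like this (hypothetical name and signature):

	int __filemap_get_folio_err(struct address_space *mapping,
			pgoff_t index, int fgp_flags, gfp_t gfp,
			struct folio **foliop);

	/* every caller then has to juggle two results instead of one */
	status = __filemap_get_folio_err(mapping, index, fgp, gfp, &folio);
	if (status)
		goto out_no_page;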
On Tue, Jan 10, 2023 at 07:24:27AM -0800, Christoph Hellwig wrote:
> On Tue, Jan 10, 2023 at 01:34:16PM +0000, Matthew Wilcox wrote:
> > [...]
>
> So what would be an alternative way to tell the callers why no folio
> was found instead of trying to reverse engineer that? Return an errno
> and the folio by reference? That would work, but the calling conventions
> would be awful.
Agreed. How about an xa_filemap_get_folio()?
(there are a number of things to fix here; haven't decided if XA_ERROR
should return void *, or whether i should use a separate 'entry' and
'folio' until I know the entry is actually a folio ...)
Usage would seem pretty straightforward:
folio = xa_filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
status = xa_err(folio);
if (status)
goto out_no_page;
diff --git a/mm/filemap.c b/mm/filemap.c
index 7bf8442bcfaa..7d489f96c690 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1800,40 +1800,25 @@ static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
}
/**
- * __filemap_get_folio - Find and get a reference to a folio.
+ * xa_filemap_get_folio - Find and get a reference to a folio.
* @mapping: The address_space to search.
* @index: The page index.
* @fgp_flags: %FGP flags modify how the folio is returned.
* @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
*
- * Looks up the page cache entry at @mapping & @index.
- *
- * @fgp_flags can be zero or more of these flags:
- *
- * * %FGP_ACCESSED - The folio will be marked accessed.
- * * %FGP_LOCK - The folio is returned locked.
- * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- * instead of allocating a new folio to replace it.
- * * %FGP_CREAT - If no page is present then a new page is allocated using
- * @gfp and added to the page cache and the VM's LRU list.
- * The page is returned locked and with an increased refcount.
- * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
- * page is already in cache. If the page was allocated, unlock it before
- * returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written to by the caller.
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
- * * %FGP_NOWAIT - Don't get blocked by page lock.
- * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
- *
- * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
- * if the %GFP flags specified for %FGP_CREAT are atomic.
+ * Looks up the page cache entry at @mapping & @index. See
+ * __filemap_get_folio() for a detailed description.
*
- * If there is a page cache page, it is returned with an increased refcount.
+ * This differs from __filemap_get_folio() in that it will return an
+ * XArray error instead of NULL if something goes wrong, allowing the
+ * advanced user to distinguish why the failure happened. We can't use an
+ * ERR_PTR() because its encodings overlap with shadow/swap/dax entries.
*
- * Return: The found folio or %NULL otherwise.
+ * Return: The entry in the page cache or an xa_err() if there is no entry
+ * or it could not be appropriately locked.
*/
-struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+struct folio *xa_filemap_get_folio(struct address_space *mapping,
+ pgoff_t index, int fgp_flags, gfp_t gfp)
{
struct folio *folio;
@@ -1851,7 +1836,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
if (fgp_flags & FGP_NOWAIT) {
if (!folio_trylock(folio)) {
folio_put(folio);
- return NULL;
+ return (struct folio *)XA_ERROR(-EAGAIN);
}
} else {
folio_lock(folio);
@@ -1890,7 +1875,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
folio = filemap_alloc_folio(gfp, 0);
if (!folio)
- return NULL;
+ return (struct folio *)XA_ERROR(-ENOMEM);
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
@@ -1902,19 +1887,65 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
folio_put(folio);
- folio = NULL;
if (err == -EEXIST)
goto repeat;
+ folio = (struct folio *)XA_ERROR(err);
+ } else {
+ /*
+ * filemap_add_folio locks the page, and for mmap
+ * we expect an unlocked page.
+ */
+ if (fgp_flags & FGP_FOR_MMAP)
+ folio_unlock(folio);
}
-
- /*
- * filemap_add_folio locks the page, and for mmap
- * we expect an unlocked page.
- */
- if (folio && (fgp_flags & FGP_FOR_MMAP))
- folio_unlock(folio);
}
+ if (!folio)
+ folio = (struct folio *)XA_ERROR(-ENODATA);
+ return folio;
+}
+EXPORT_SYMBOL_GPL(xa_filemap_get_folio);
+
+/**
+ * __filemap_get_folio - Find and get a reference to a folio.
+ * @mapping: The address_space to search.
+ * @index: The page index.
+ * @fgp: %FGP flags modify how the folio is returned.
+ * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
+ *
+ * Looks up the page cache entry at @mapping & @index.
+ *
+ * @fgp_flags can be zero or more of these flags:
+ *
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
+ * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
+ * instead of allocating a new folio to replace it.
+ * * %FGP_CREAT - If no page is present then a new page is allocated using
+ * @gfp and added to the page cache and the VM's LRU list.
+ * The page is returned locked and with an increased refcount.
+ * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
+ * page is already in cache. If the page was allocated, unlock it before
+ * returning so the caller can do the same dance.
+ * * %FGP_WRITE - The page will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't get blocked by page lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
+ *
+ * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
+ * if the %GFP flags specified for %FGP_CREAT are atomic.
+ *
+ * If there is a page cache page, it is returned with an increased refcount.
+ *
+ * Return: The found folio or %NULL otherwise.
+ */
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+ int fgp, gfp_t gfp)
+{
+ struct folio *folio = xa_filemap_get_folio(mapping, index, fgp, gfp);
+
+ if (xa_is_err(folio))
+ return NULL;
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
On Wed, Jan 11, 2023 at 07:36:26PM +0000, Matthew Wilcox wrote:
> [...]
> >
> > So what would be an alternative way to tell the callers why no folio
> > was found instead of trying to reverse engineer that? Return an errno
> > and the folio by reference? That would work, but the calling conventions
> > would be awful.
>
> Agreed. How about an xa_filemap_get_folio()?
>
> (there are a number of things to fix here; haven't decided if XA_ERROR
> should return void *, or whether i should use a separate 'entry' and
> 'folio' until I know the entry is actually a folio ...)
That's awful. Exposing internal implementation details in the API
that is supposed to abstract away the internal implementation
details from users doesn't seem like a great idea to me.
Exactly what are we trying to fix here? Do we really need to punch
a hole through the abstraction layers like this just to remove half
a dozen lines of -slow path- context specific error handling from a
single caller?
If there's half a dozen cases that need this sort of handling, then
maybe it's the right thing to do. But for a single calling context
that only needs to add a null return check in one specific case?
There's absolutely no need to make generic infrastructure violate
layering abstractions to handle that...
-Dave.
--
Dave Chinner
[email protected]
On Thu, Jan 12, 2023 at 07:52:41AM +1100, Dave Chinner wrote:
> Exposing internal implementation details in the API
> that is supposed to abstract away the internal implementation
> details from users doesn't seem like a great idea to me.
While I somewhat agree with the concern of leaking the xarray
internals, at least they are clearly documented and easy to find...
> Exactly what are we trying to fix here? Do we really need to punch
> a hole through the abstraction layers like this just to remove half
> a dozen lines of -slow path- context specific error handling from a
> single caller?
The current code (which is getting worse with your fix), on the other
hand, leaks completely undocumented internal decision making. So what
this fixes is a real leak of logic internal to __filemap_get_folio
into the callers.
So as far as I'm concerned we really do need the helper, and anyone
using !FGP_CREAT or FGP_NOWAIT should be using it. The only question
to me is if exposing the xarray internals is worth it vs the
less optimal calling conventions of needing an extra argument for
the error code.
On Tue, Jan 10, 2023 at 01:34:16PM +0000, Matthew Wilcox wrote:
> [...]
>
> Ouch, we have a nasty problem.
>
> If somebody passes FGP_ENTRY, we can return a shadow entry. And the
> encodings for shadow entries overlap with the encodings for ERR_PTR,
> meaning that some shadow entries will look like errors. The way I
> solved this in the XArray code is by shifting the error values by
> two bits and encoding errors as XA_ERROR(-ENOMEM) (for example).
>
> I don't _object_ to introducing XA_ERROR() / xa_err() into the VFS,
> but so far we haven't, and I'd like to make that decision intentionally.
Sorry, I'm not following this at all -- where in buffered-io.c does
anyone pass FGP_ENTRY? Andreas' code doesn't seem to introduce it
either...?
--D
On Sun, Jan 15, 2023 at 09:01:22AM -0800, Darrick J. Wong wrote:
> [...]
>
> Sorry, I'm not following this at all -- where in buffered-io.c does
> anyone pass FGP_ENTRY? Andreas' code doesn't seem to introduce it
> either...?
Oh, never mind, I worked out that the conflict is between iomap not
passing FGP_ENTRY and wanting a pointer or a negative errno; and someone
who does FGP_ENTRY, in which case the xarray value can be confused for a
negative errno.
OFC now I wonder, can we simply say that the return value is "The found
folio or NULL if you set FGP_ENTRY; or the found folio or a negative
errno if you don't" ?
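Callers would then have to branch on the flag they passed in, roughly
like this (a sketch of that convention; deal_with_entry() is a
stand-in):

	folio = __filemap_get_folio(mapping, index, fgp, gfp);
	if (fgp & FGP_ENTRY) {
		/* may be NULL, a folio, or a shadow/swap/DAX value entry */
		if (!folio || xa_is_value(folio))
			return deal_with_entry(folio);
	} else {
		/* never NULL: either a folio or an ERR_PTR() */
		if (IS_ERR(folio))
			return PTR_ERR(folio);
	}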
--D
On Sun, Jan 15, 2023 at 09:06:50AM -0800, Darrick J. Wong wrote:
> [...]
>
> Oh, never mind, I worked out that the conflict is between iomap not
> passing FGP_ENTRY and wanting a pointer or a negative errno; and someone
> who does FGP_ENTRY, in which case the xarray value can be confused for a
> negative errno.
>
> OFC now I wonder, can we simply say that the return value is "The found
> folio or NULL if you set FGP_ENTRY; or the found folio or a negative
> errno if you don't" ?
Erm ... I would rather not!
Part of me remembers that x86-64 has the rather nice calling convention
of being able to return a struct containing two values in two registers:
: Integer return values up to 64 bits in size are stored in RAX while
: values up to 128 bit are stored in RAX and RDX.
so maybe we can return:
struct OptionFolio {
int err;
struct folio *folio;
};
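A caller of such an API might then read like this (hypothetical
function name):

	struct OptionFolio ret = filemap_get_folio_opt(mapping, index,
			fgp, gfp);

	if (ret.err)
		return ret.err;
	folio = ret.folio;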
On Mon, Jan 16, 2023 at 05:46:01AM +0000, Matthew Wilcox wrote:
> > OFC now I wonder, can we simply say that the return value is "The found
> > folio or NULL if you set FGP_ENTRY; or the found folio or a negative
> > errno if you don't" ?
>
> Erm ... I would rather not!
Agreed.
>
> Part of me remembers that x86-64 has the rather nice calling convention
> of being able to return a struct containing two values in two registers:
We could do that. But while reading what Darrick wrote I came up with
another idea I quite like. Just split the FGP_ENTRY handling into
a separate helper. The logic and use cases are quite different from
the normal page cache lookup, and the returning of the xarray entry
is exactly the kind of layering violation that Dave is complaining
about. So what about just splitting that use case into a separate
self contained helper?
---
From b4d10f98ea57f8480c03c0b00abad6f2b7186f56 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <[email protected]>
Date: Mon, 16 Jan 2023 08:26:57 +0100
Subject: mm: replace FGP_ENTRY with a new __filemap_get_folio_entry helper
Split the xarray entry returning logic into a separate helper. This will
allow returning ERR_PTRs from __filemap_get_folio, and also isolates the
logic that needs to know about xarray internals into a separate
function. This causes some code duplication, but as most flags to
__filemap_get_folio are not applicable for the users that care about an
entry, that amount is very limited.
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/pagemap.h | 6 +++--
mm/filemap.c | 50 ++++++++++++++++++++++++++++++++++++-----
mm/huge_memory.c | 4 ++--
mm/shmem.c | 5 ++---
mm/swap_state.c | 2 +-
5 files changed, 53 insertions(+), 14 deletions(-)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3a7124c76712..e06c14b610caf2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -504,8 +504,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
-#define FGP_ENTRY 0x00000080
-#define FGP_STABLE 0x00000100
+#define FGP_STABLE 0x00000080
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp);
@@ -546,6 +545,9 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping,
return __filemap_get_folio(mapping, index, FGP_LOCK, 0);
}
+struct folio *__filemap_get_folio_entry(struct address_space *mapping,
+ pgoff_t index, int fgp_flags);
+
/**
* find_get_page - find and get a page reference
* @mapping: the address_space to search
diff --git a/mm/filemap.c b/mm/filemap.c
index c4d4ace9cc7003..d04613347b3e71 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1887,8 +1887,6 @@ static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
*
* * %FGP_ACCESSED - The folio will be marked accessed.
* * %FGP_LOCK - The folio is returned locked.
- * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- * instead of allocating a new folio to replace it.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1914,11 +1912,8 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
repeat:
folio = mapping_get_entry(mapping, index);
- if (xa_is_value(folio)) {
- if (fgp_flags & FGP_ENTRY)
- return folio;
+ if (xa_is_value(folio))
folio = NULL;
- }
if (!folio)
goto no_page;
@@ -1994,6 +1989,49 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
}
EXPORT_SYMBOL(__filemap_get_folio);
+
+/**
+ * __filemap_get_folio_entry - Find and get a reference to a folio.
+ * @mapping: The address_space to search.
+ * @index: The page index.
+ * @fgp_flags: %FGP flags modify how the folio is returned.
+ *
+ * Looks up the page cache entry at @mapping & @index. If there is a shadow /
+ * swap / DAX entry, return it instead of allocating a new folio to replace it.
+ *
+ * @fgp_flags can be zero or more of these flags:
+ *
+ * * %FGP_LOCK - The folio is returned locked.
+ *
+ * If there is a page cache page, it is returned with an increased refcount.
+ *
+ * Return: The found folio or %NULL otherwise.
+ */
+struct folio *__filemap_get_folio_entry(struct address_space *mapping,
+ pgoff_t index, int fgp_flags)
+{
+ struct folio *folio;
+
+ if (WARN_ON_ONCE(fgp_flags & ~FGP_LOCK))
+ return NULL;
+
+repeat:
+ folio = mapping_get_entry(mapping, index);
+ if (folio && !xa_is_value(folio) && (fgp_flags & FGP_LOCK)) {
+ folio_lock(folio);
+
+ /* Has the page been truncated? */
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto repeat;
+ }
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ }
+
+ return folio;
+}
+
static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
xa_mark_t mark)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abe6cfd92ffa0e..88b517c338a6db 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3088,10 +3088,10 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
mapping = candidate->f_mapping;
for (index = off_start; index < off_end; index += nr_pages) {
- struct folio *folio = __filemap_get_folio(mapping, index,
- FGP_ENTRY, 0);
+ struct folio *folio;
nr_pages = 1;
+ folio = __filemap_get_folio_entry(mapping, index, 0);
if (xa_is_value(folio) || !folio)
continue;
diff --git a/mm/shmem.c b/mm/shmem.c
index c301487be5fb40..0a36563ef7a0c1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -888,8 +888,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* At first avoid shmem_get_folio(,,,SGP_READ): that fails
* beyond i_size, and reports fallocated pages as holes.
*/
- folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_ENTRY | FGP_LOCK, 0);
+ folio = __filemap_get_folio_entry(inode->i_mapping, index, FGP_LOCK);
if (!xa_is_value(folio))
return folio;
/*
@@ -1860,7 +1859,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = vma ? vma->vm_mm : NULL;
- folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
+ folio = __filemap_get_folio_entry(mapping, index, FGP_LOCK);
if (folio && vma && userfaultfd_minor(vma)) {
if (!xa_is_value(folio)) {
folio_unlock(folio);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2927507b43d819..1f45241987aea2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -384,7 +384,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0);
+ struct folio *folio = __filemap_get_folio_entry(mapping, index, 0);
if (!xa_is_value(folio))
goto out;
--
2.39.0
On Sun, Jan 15, 2023 at 11:34:26PM -0800, Christoph Hellwig wrote:
> We could do that. But while reading what Darrick wrote I came up with
> another idea I quite like. Just split the FGP_ENTRY handling into
> a separate helper. The logic and use cases are quite different from
> the normal page cache lookup, and the returning of the xarray entry
> is exactly the kind of layering violation that Dave is complaining
> about. So what about just splitting that use case into a separate
> self contained helper?
Essentially reverting 44835d20b2a0. Although we retain the merging of
the lock & get functions via the use of FGP flags. Let me think about
it for a day.
> [...]
On Mon, Jan 16, 2023 at 01:18:07PM +0000, Matthew Wilcox wrote:
> Essentially reverting 44835d20b2a0.
Yep.
> Although we retain the merging of
> the lock & get functions via the use of FGP flags. Let me think about
> it for a day.
Yes. But looking at the code again I wonder if even that is needed.
Out of the users of FGP_ENTRY / __filemap_get_folio_entry:
- split_huge_pages_in_file really should not be using it at all,
given that it checks for xa_is_value and treats that as !folio
- one doesn't pass FGP_LOCK and could just use filemap_get_entry
- the other two are in shmem, so we could move the locking logic
there (and maybe in future optimize it in the callers)
That would be something like this, although it should be split into
two or three patches:
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 29e1f9e76eb6dd..ecd1ff40a80621 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -504,9 +504,9 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
-#define FGP_ENTRY 0x00000080
-#define FGP_STABLE 0x00000100
+#define FGP_STABLE 0x00000080
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
diff --git a/mm/filemap.c b/mm/filemap.c
index c4d4ace9cc7003..85bd86c44e14d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1832,7 +1832,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
*/
/*
- * mapping_get_entry - Get a page cache entry.
+ * filemap_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
@@ -1843,7 +1843,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
*
* Return: The folio, swap or shadow entry, %NULL if nothing is found.
*/
-static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
struct folio *folio;
@@ -1887,8 +1887,6 @@ static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
*
* * %FGP_ACCESSED - The folio will be marked accessed.
* * %FGP_LOCK - The folio is returned locked.
- * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- * instead of allocating a new folio to replace it.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1913,12 +1911,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
struct folio *folio;
repeat:
- folio = mapping_get_entry(mapping, index);
- if (xa_is_value(folio)) {
- if (fgp_flags & FGP_ENTRY)
- return folio;
+ folio = filemap_get_entry(mapping, index);
+ if (xa_is_value(folio))
folio = NULL;
- }
if (!folio)
goto no_page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abe6cfd92ffa0e..b182eb99044e9a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3088,11 +3088,10 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
mapping = candidate->f_mapping;
for (index = off_start; index < off_end; index += nr_pages) {
- struct folio *folio = __filemap_get_folio(mapping, index,
- FGP_ENTRY, 0);
+ struct folio *folio = filemap_get_folio(mapping, index);
nr_pages = 1;
- if (xa_is_value(folio) || !folio)
+ if (!folio)
continue;
if (!folio_test_large(folio))
diff --git a/mm/shmem.c b/mm/shmem.c
index 028675cd97d445..4650192dbcb91b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -880,6 +880,28 @@ void shmem_unlock_mapping(struct address_space *mapping)
}
}
+static struct folio *shmem_get_entry(struct address_space *mapping,
+ pgoff_t index)
+{
+ struct folio *folio;
+
+repeat:
+ folio = filemap_get_entry(mapping, index);
+ if (folio && !xa_is_value(folio)) {
+ folio_lock(folio);
+
+ /* Has the page been truncated? */
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto repeat;
+ }
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ }
+
+ return folio;
+}
+
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
struct folio *folio;
@@ -888,8 +910,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* At first avoid shmem_get_folio(,,,SGP_READ): that fails
* beyond i_size, and reports fallocated pages as holes.
*/
- folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_ENTRY | FGP_LOCK, 0);
+ folio = shmem_get_entry(inode->i_mapping, index);
if (!xa_is_value(folio))
return folio;
/*
@@ -1860,7 +1881,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = vma ? vma->vm_mm : NULL;
- folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
+ folio = shmem_get_entry(mapping, index);
if (folio && vma && userfaultfd_minor(vma)) {
if (!xa_is_value(folio)) {
folio_unlock(folio);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2927507b43d819..e7f2083ad7e40a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -384,7 +384,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0);
+ struct folio *folio = filemap_get_entry(mapping, index);
if (!xa_is_value(folio))
goto out;