v4:
- patch 1: move the change to the INLINE routine to a separate patch
which will be sent later
- patch 1: introduce erofs_fscache_req_chain() helper to improve the
code organization
- patch 1: misc improvement to make the code cleaner
- patch 2: also document the new feature in documentation
v3: https://lore.kernel.org/all/[email protected]/
v2: https://lore.kernel.org/all/[email protected]/
v1: https://lore.kernel.org/all/[email protected]/
v3:
- patch 1: when large folios are supported, one folio or folio range can be
mapped into several slices, with each slice mapped to different cookies,
and thus each slice needs its own netfs_cache_resources.
In the implementation of v2, each .read_folio() or .readahead() call
corresponds to only one request, and thus only one netfs_cache_resources
(embedded in the request). In this case, fscache_begin_read_operation()
may be called multiple times on this cres, while cres->ops->end_operation()
is called only once when the whole request completes. This can cause
the leakage of the corresponding cachefiles_object->n_accesses refcount,
which will cause the user daemon to hang there forever waiting for
cache->object_count to decrease to 0 when the user daemon exits.
Worse, as we mentioned previously, when large folios are supported, one
folio or folio range can be mapped to multiple chunks on different
cookies, in which case each mapped chunk needs its own cres. In the
implementation of v2, each .read_folio() or .readahead() call
corresponds to only one request, and thus only one cres. This will make
the only cres used by the first chunk gets overridden by the following
chunk.
To fix this, we introduce listed requests, where each .read_folio() or
.readahead() call can correspond to a list of requests, with each
request corresponding to one cres.
v2:
- patch 2: keep the enabling for iomap and fscache mode in separate
patches; don't enable the feature for the metadata routine for now
(Gao Xiang)
Patch 1 is the main part of supporting large folios for fscache mode. It
relies on a pending patch[1] adding .prepare_ondemand_read() interface
in Cachefiles.
Patch 2 just turns the switch on and enables the feature for fscache
mode. It relies on a previous patch[2] which enables this feature for
iomap mode.
[1] https://lore.kernel.org/all/[email protected]/
[2] https://lore.kernel.org/all/[email protected]/
Jingbo Xu (2):
erofs: support large folios for fscache mode
erofs: enable large folios for fscache mode
Documentation/filesystems/erofs.rst | 2 +
fs/erofs/fscache.c | 148 +++++++++++++++-------------
fs/erofs/inode.c | 3 +-
3 files changed, 83 insertions(+), 70 deletions(-)
--
2.19.1.6.gb485710b
When large folios are supported, one folio can be split into several slices,
each of which may be mapped to META/UNMAPPED/MAPPED, and the folio can
be unlocked as a whole only when all slices have completed.
Thus always allocate erofs_fscache_request for each .read_folio() or
.readahead(), in which case the allocated request is responsible for
unlocking folios when all slices have completed.
As described above, each folio or folio range can be mapped into several
slices, while these slices may be mapped to different cookies, and thus
each slice needs its own netfs_cache_resources. Here we introduce
chained requests to support this, where each .read_folio() or
.readahead() call can correspond to multiple requests. Each request
has its own netfs_cache_resources and thus is used to access one cookie.
Among these requests, there's a primary request, with the others
pointing to the primary request.
Signed-off-by: Jingbo Xu <[email protected]>
---
fs/erofs/fscache.c | 148 ++++++++++++++++++++++++---------------------
1 file changed, 80 insertions(+), 68 deletions(-)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 3e794891cd91..f14886c479bd 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -12,6 +12,7 @@ static LIST_HEAD(erofs_domain_list);
static struct vfsmount *erofs_pseudo_mnt;
struct erofs_fscache_request {
+ struct erofs_fscache_request *primary;
struct netfs_cache_resources cache_resources;
struct address_space *mapping; /* The mapping being accessed */
loff_t start; /* Start position */
@@ -38,6 +39,26 @@ static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_spac
return req;
}
+static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
+ size_t len)
+{
+ struct erofs_fscache_request *req;
+
+ /* use primary request for the first submission */
+ if (!primary->submitted) {
+ refcount_inc(&primary->ref);
+ return primary;
+ }
+
+ req = erofs_fscache_req_alloc(primary->mapping,
+ primary->start + primary->submitted, len);
+ if (!IS_ERR(req)) {
+ req->primary = primary;
+ refcount_inc(&primary->ref);
+ }
+ return req;
+}
+
static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
{
struct folio *folio;
@@ -56,17 +77,19 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
folio_unlock(folio);
}
rcu_read_unlock();
-
- if (req->cache_resources.ops)
- req->cache_resources.ops->end_operation(&req->cache_resources);
-
- kfree(req);
}
static void erofs_fscache_req_put(struct erofs_fscache_request *req)
{
- if (refcount_dec_and_test(&req->ref))
- erofs_fscache_req_complete(req);
+ if (refcount_dec_and_test(&req->ref)) {
+ if (req->cache_resources.ops)
+ req->cache_resources.ops->end_operation(&req->cache_resources);
+ if (!req->primary)
+ erofs_fscache_req_complete(req);
+ else
+ erofs_fscache_req_put(req->primary);
+ kfree(req);
+ }
}
static void erofs_fscache_subreq_complete(void *priv,
@@ -74,8 +97,12 @@ static void erofs_fscache_subreq_complete(void *priv,
{
struct erofs_fscache_request *req = priv;
- if (IS_ERR_VALUE(transferred_or_error))
- req->error = transferred_or_error;
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ if (req->primary)
+ req->primary->error = transferred_or_error;
+ else
+ req->error = transferred_or_error;
+ }
erofs_fscache_req_put(req);
}
@@ -131,7 +158,6 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
done += slen;
}
DBG_BUGON(done != len);
- req->submitted += len;
return 0;
}
@@ -167,32 +193,19 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
return ret;
}
-/*
- * Read into page cache in the range described by (@pos, @len).
- *
- * On return, if the output @unlock is true, the caller is responsible for page
- * unlocking; otherwise the callee will take this responsibility through request
- * completion.
- *
- * The return value is the number of bytes successfully handled, or negative
- * error code on failure. The only exception is that, the length of the range
- * instead of the error code is returned on failure after request is allocated,
- * so that .readahead() could advance rac accordingly.
- */
-static int erofs_fscache_data_read(struct address_space *mapping,
- loff_t pos, size_t len, bool *unlock)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
{
+ struct address_space *mapping = primary->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
struct erofs_fscache_request *req;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
struct iov_iter iter;
+ loff_t pos = primary->start + primary->submitted;
size_t count;
int ret;
- *unlock = true;
-
map.m_la = pos;
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
if (ret)
@@ -220,17 +233,19 @@ static int erofs_fscache_data_read(struct address_space *mapping,
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
- return PAGE_SIZE;
+ primary->submitted += PAGE_SIZE;
+ return 0;
}
+ count = primary->len - primary->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
- count = len;
iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
- return count;
+ primary->submitted += count;
+ return 0;
}
- count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
+ count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
DBG_BUGON(!count || count % PAGE_SIZE);
mdev = (struct erofs_map_dev) {
@@ -241,68 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping,
if (ret)
return ret;
- req = erofs_fscache_req_alloc(mapping, pos, count);
+ req = erofs_fscache_req_chain(primary, count);
if (IS_ERR(req))
return PTR_ERR(req);
- *unlock = false;
ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
req, mdev.m_pa + (pos - map.m_la), count);
- if (ret)
- req->error = ret;
-
erofs_fscache_req_put(req);
- return count;
+ primary->submitted += count;
+ return ret;
}
-static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+static int erofs_fscache_data_read(struct erofs_fscache_request *req)
{
- bool unlock;
int ret;
- DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+ do {
+ ret = erofs_fscache_data_read_slice(req);
+ if (ret)
+ req->error = ret;
+ } while (!ret && req->submitted < req->len);
- ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
- folio_size(folio), &unlock);
- if (unlock) {
- if (ret > 0)
- folio_mark_uptodate(folio);
+ return ret;
+}
+
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fscache_request *req;
+ int ret;
+
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(req)) {
folio_unlock(folio);
+ return PTR_ERR(req);
}
- return ret < 0 ? ret : 0;
+
+ ret = erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
+ return ret;
}
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct folio *folio;
- size_t len, done = 0;
- loff_t start, pos;
- bool unlock;
- int ret, size;
+ struct erofs_fscache_request *req;
if (!readahead_count(rac))
return;
- start = readahead_pos(rac);
- len = readahead_length(rac);
+ req = erofs_fscache_req_alloc(rac->mapping,
+ readahead_pos(rac), readahead_length(rac));
+ if (IS_ERR(req))
+ return;
- do {
- pos = start + done;
- ret = erofs_fscache_data_read(rac->mapping, pos,
- len - done, &unlock);
- if (ret <= 0)
- return;
+ /* The request completion will drop refs on the folios. */
+ while (readahead_folio(rac))
+ ;
- size = ret;
- while (size) {
- folio = readahead_folio(rac);
- size -= folio_size(folio);
- if (unlock) {
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
- }
- } while ((done += ret) < len);
+ erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
}
static const struct address_space_operations erofs_fscache_meta_aops = {
--
2.19.1.6.gb485710b
Enable large folios for fscache mode. Enable this feature for
non-compressed format for now, until the compression part supports large
folios later.
One thing worth noting is that the feature is not enabled for the
metadata routine since meta inodes don't need large folios for now, nor do
they support readahead yet.
Also document this new feature.
Signed-off-by: Jingbo Xu <[email protected]>
Reviewed-by: Jia Zhu <[email protected]>
---
Documentation/filesystems/erofs.rst | 2 ++
fs/erofs/inode.c | 3 +--
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst
index 82af67fdaf99..1c1f7404b338 100644
--- a/Documentation/filesystems/erofs.rst
+++ b/Documentation/filesystems/erofs.rst
@@ -72,6 +72,8 @@ Here are the main features of EROFS:
- Support merging tail-end data into a special inode as fragments.
+ - Support large folios for uncompressed files.
+
- Support direct I/O on uncompressed files to avoid double caching for loop
devices;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index e457b8a59ee7..85932086d23f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -295,8 +295,7 @@ static int erofs_fill_inode(struct inode *inode)
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
- if (!erofs_is_fscache_mode(inode->i_sb))
- mapping_set_large_folios(inode->i_mapping);
+ mapping_set_large_folios(inode->i_mapping);
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (erofs_is_fscache_mode(inode->i_sb))
inode->i_mapping->a_ops = &erofs_fscache_access_aops;
--
2.19.1.6.gb485710b
在 2022/12/1 15:42, Jingbo Xu 写道:
> When large folios supported, one folio can be split into several slices,
> each of which may be mapped to META/UNMAPPED/MAPPED, and the folio can
> be unlocked as a whole only when all slices have completed.
>
> Thus always allocate erofs_fscache_request for each .read_folio() or
> .readahead(), in which case the allocated request is responsible for
> unlocking folios when all slices have completed.
>
> As described above, each folio or folio range can be mapped into several
> slices, while these slices may be mapped to different cookies, and thus
> each slice needs its own netfs_cache_resources. Here we introduce
> chained requests to support this, where each .read_folio() or
> .readahead() calling can correspond to multiple requests. Each request
> has its own netfs_cache_resources and thus is used to access one cookie.
> Among these requests, there's a primary request, with the others
> pointing to the primary request.
>
> Signed-off-by: Jingbo Xu <[email protected]>
Reviewed-by: Jia Zhu <[email protected]>
Thanks.
> ---
> fs/erofs/fscache.c | 148 ++++++++++++++++++++++++---------------------
> 1 file changed, 80 insertions(+), 68 deletions(-)
>
> diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
> index 3e794891cd91..f14886c479bd 100644
> --- a/fs/erofs/fscache.c
> +++ b/fs/erofs/fscache.c
> @@ -12,6 +12,7 @@ static LIST_HEAD(erofs_domain_list);
> static struct vfsmount *erofs_pseudo_mnt;
>
> struct erofs_fscache_request {
> + struct erofs_fscache_request *primary;
> struct netfs_cache_resources cache_resources;
> struct address_space *mapping; /* The mapping being accessed */
> loff_t start; /* Start position */
> @@ -38,6 +39,26 @@ static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_spac
> return req;
> }
>
> +static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
> + size_t len)
> +{
> + struct erofs_fscache_request *req;
> +
> + /* use primary request for the first submission */
> + if (!primary->submitted) {
> + refcount_inc(&primary->ref);
> + return primary;
> + }
> +
> + req = erofs_fscache_req_alloc(primary->mapping,
> + primary->start + primary->submitted, len);
> + if (!IS_ERR(req)) {
> + req->primary = primary;
> + refcount_inc(&primary->ref);
> + }
> + return req;
> +}
> +
> static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
> {
> struct folio *folio;
> @@ -56,17 +77,19 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
> folio_unlock(folio);
> }
> rcu_read_unlock();
> -
> - if (req->cache_resources.ops)
> - req->cache_resources.ops->end_operation(&req->cache_resources);
> -
> - kfree(req);
> }
>
> static void erofs_fscache_req_put(struct erofs_fscache_request *req)
> {
> - if (refcount_dec_and_test(&req->ref))
> - erofs_fscache_req_complete(req);
> + if (refcount_dec_and_test(&req->ref)) {
> + if (req->cache_resources.ops)
> + req->cache_resources.ops->end_operation(&req->cache_resources);
> + if (!req->primary)
> + erofs_fscache_req_complete(req);
> + else
> + erofs_fscache_req_put(req->primary);
> + kfree(req);
> + }
> }
>
> static void erofs_fscache_subreq_complete(void *priv,
> @@ -74,8 +97,12 @@ static void erofs_fscache_subreq_complete(void *priv,
> {
> struct erofs_fscache_request *req = priv;
>
> - if (IS_ERR_VALUE(transferred_or_error))
> - req->error = transferred_or_error;
> + if (IS_ERR_VALUE(transferred_or_error)) {
> + if (req->primary)
> + req->primary->error = transferred_or_error;
> + else
> + req->error = transferred_or_error;
> + }
> erofs_fscache_req_put(req);
> }
>
> @@ -131,7 +158,6 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
> done += slen;
> }
> DBG_BUGON(done != len);
> - req->submitted += len;
> return 0;
> }
>
> @@ -167,32 +193,19 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
> return ret;
> }
>
> -/*
> - * Read into page cache in the range described by (@pos, @len).
> - *
> - * On return, if the output @unlock is true, the caller is responsible for page
> - * unlocking; otherwise the callee will take this responsibility through request
> - * completion.
> - *
> - * The return value is the number of bytes successfully handled, or negative
> - * error code on failure. The only exception is that, the length of the range
> - * instead of the error code is returned on failure after request is allocated,
> - * so that .readahead() could advance rac accordingly.
> - */
> -static int erofs_fscache_data_read(struct address_space *mapping,
> - loff_t pos, size_t len, bool *unlock)
> +static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
> {
> + struct address_space *mapping = primary->mapping;
> struct inode *inode = mapping->host;
> struct super_block *sb = inode->i_sb;
> struct erofs_fscache_request *req;
> struct erofs_map_blocks map;
> struct erofs_map_dev mdev;
> struct iov_iter iter;
> + loff_t pos = primary->start + primary->submitted;
> size_t count;
> int ret;
>
> - *unlock = true;
> -
> map.m_la = pos;
> ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
> if (ret)
> @@ -220,17 +233,19 @@ static int erofs_fscache_data_read(struct address_space *mapping,
> }
> iov_iter_zero(PAGE_SIZE - size, &iter);
> erofs_put_metabuf(&buf);
> - return PAGE_SIZE;
> + primary->submitted += PAGE_SIZE;
> + return 0;
> }
>
> + count = primary->len - primary->submitted;
> if (!(map.m_flags & EROFS_MAP_MAPPED)) {
> - count = len;
> iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
> iov_iter_zero(count, &iter);
> - return count;
> + primary->submitted += count;
> + return 0;
> }
>
> - count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
> + count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
> DBG_BUGON(!count || count % PAGE_SIZE);
>
> mdev = (struct erofs_map_dev) {
> @@ -241,68 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping,
> if (ret)
> return ret;
>
> - req = erofs_fscache_req_alloc(mapping, pos, count);
> + req = erofs_fscache_req_chain(primary, count);
> if (IS_ERR(req))
> return PTR_ERR(req);
>
> - *unlock = false;
> ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
> req, mdev.m_pa + (pos - map.m_la), count);
> - if (ret)
> - req->error = ret;
> -
> erofs_fscache_req_put(req);
> - return count;
> + primary->submitted += count;
> + return ret;
> }
>
> -static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
> +static int erofs_fscache_data_read(struct erofs_fscache_request *req)
> {
> - bool unlock;
> int ret;
>
> - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
> + do {
> + ret = erofs_fscache_data_read_slice(req);
> + if (ret)
> + req->error = ret;
> + } while (!ret && req->submitted < req->len);
>
> - ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
> - folio_size(folio), &unlock);
> - if (unlock) {
> - if (ret > 0)
> - folio_mark_uptodate(folio);
> + return ret;
> +}
> +
> +static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
> +{
> + struct erofs_fscache_request *req;
> + int ret;
> +
> + req = erofs_fscache_req_alloc(folio_mapping(folio),
> + folio_pos(folio), folio_size(folio));
> + if (IS_ERR(req)) {
> folio_unlock(folio);
> + return PTR_ERR(req);
> }
> - return ret < 0 ? ret : 0;
> +
> + ret = erofs_fscache_data_read(req);
> + erofs_fscache_req_put(req);
> + return ret;
> }
>
> static void erofs_fscache_readahead(struct readahead_control *rac)
> {
> - struct folio *folio;
> - size_t len, done = 0;
> - loff_t start, pos;
> - bool unlock;
> - int ret, size;
> + struct erofs_fscache_request *req;
>
> if (!readahead_count(rac))
> return;
>
> - start = readahead_pos(rac);
> - len = readahead_length(rac);
> + req = erofs_fscache_req_alloc(rac->mapping,
> + readahead_pos(rac), readahead_length(rac));
> + if (IS_ERR(req))
> + return;
>
> - do {
> - pos = start + done;
> - ret = erofs_fscache_data_read(rac->mapping, pos,
> - len - done, &unlock);
> - if (ret <= 0)
> - return;
> + /* The request completion will drop refs on the folios. */
> + while (readahead_folio(rac))
> + ;
>
> - size = ret;
> - while (size) {
> - folio = readahead_folio(rac);
> - size -= folio_size(folio);
> - if (unlock) {
> - folio_mark_uptodate(folio);
> - folio_unlock(folio);
> - }
> - }
> - } while ((done += ret) < len);
> + erofs_fscache_data_read(req);
> + erofs_fscache_req_put(req);
> }
>
> static const struct address_space_operations erofs_fscache_meta_aops = {