ext4_encrypted_zeroout() could end up leaking a bio and bounce page.
Fortunately it's not used much. While we're fixing things up,
refactor out common code into the static function alloc_bounce_page().
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/crypto.c | 56 +++++++++++++++++++++++++-------------------------------
1 file changed, 25 insertions(+), 31 deletions(-)
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 68c7ab8..e43ed93 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -324,6 +324,28 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
return 0;
}
+static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx)
+{
+ struct page *ciphertext_page = alloc_page(GFP_NOFS);
+
+ if (!ciphertext_page) {
+ /* This is a potential bottleneck, but at least we'll have
+ * forward progress. */
+ ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
+ GFP_NOFS);
+ if (WARN_ON_ONCE(!ciphertext_page)) {
+ ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
+ GFP_NOFS | __GFP_WAIT);
+ }
+ ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+ } else {
+ ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+ }
+ ctx->flags |= EXT4_WRITE_PATH_FL;
+ ctx->w.bounce_page = ciphertext_page;
+ return ciphertext_page;
+}
+
/**
* ext4_encrypt() - Encrypts a page
* @inode: The inode for which the encryption should take place
@@ -353,22 +375,7 @@ struct page *ext4_encrypt(struct inode *inode,
return (struct page *) ctx;
/* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_page(GFP_NOFS);
- if (!ciphertext_page) {
- /* This is a potential bottleneck, but at least we'll have
- * forward progress. */
- ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
- GFP_NOFS);
- if (WARN_ON_ONCE(!ciphertext_page)) {
- ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
- GFP_NOFS | __GFP_WAIT);
- }
- ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
- } else {
- ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
- }
- ctx->flags |= EXT4_WRITE_PATH_FL;
- ctx->w.bounce_page = ciphertext_page;
+ ciphertext_page = alloc_bounce_page(ctx);
ctx->w.control_page = plaintext_page;
err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
plaintext_page, ciphertext_page);
@@ -434,21 +441,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ciphertext_page = alloc_page(GFP_NOFS);
- if (!ciphertext_page) {
- /* This is a potential bottleneck, but at least we'll have
- * forward progress. */
- ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
- GFP_NOFS);
- if (WARN_ON_ONCE(!ciphertext_page)) {
- ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
- GFP_NOFS | __GFP_WAIT);
- }
- ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
- } else {
- ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
- }
- ctx->w.bounce_page = ciphertext_page;
+ ciphertext_page = alloc_bounce_page(ctx);
while (len--) {
err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
@@ -470,6 +463,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
goto errout;
}
err = submit_bio_wait(WRITE, bio);
+ bio_put(bio);
if (err)
goto errout;
}
--
2.3.0
Factor out calls to ext4_inherit_context() and move them to
__ext4_new_inode(); this fixes a problem where ext4_tmpfile() wasn't
calling calling ext4_inherit_context(), so the temporary file wasn't
getting protected. Since the blocks for the tmpfile could end up on
disk, they really should be protected if the tmpfile is created within
the context of an encrypted directory.
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/ext4.h | 9 +++++++++
fs/ext4/ialloc.c | 26 ++++++++++++++++++++------
fs/ext4/namei.c | 29 +----------------------------
3 files changed, 30 insertions(+), 34 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7435ff2..bd8d32d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2149,6 +2149,11 @@ static inline int ext4_get_encryption_info(struct inode *inode)
return 0;
}
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+ return EXT4_I(inode)->i_crypt_info;
+}
+
#else
static inline int ext4_has_encryption_key(struct inode *inode)
{
@@ -2158,6 +2163,10 @@ static inline int ext4_get_encryption_info(struct inode *inode)
{
return 0;
}
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+ return NULL;
+}
#endif
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ddca169..173c1ae 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -726,11 +726,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp;
+ int encrypt = 0;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
return ERR_PTR(-EPERM);
+ if ((ext4_encrypted_inode(dir) ||
+ DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ err = ext4_get_encryption_info(dir);
+ if (err)
+ return ERR_PTR(err);
+ if (ext4_encryption_info(dir) == NULL)
+ return ERR_PTR(-EPERM);
+ if (!handle)
+ nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
+ encrypt = 1;
+ }
+
sb = dir->i_sb;
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@@ -996,12 +1010,6 @@ got:
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
- /* If the directory encrypted, then we should encrypt the inode. */
- if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) &&
- (ext4_encrypted_inode(dir) ||
- DUMMY_ENCRYPTION_ENABLED(sbi)))
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
-
ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
@@ -1063,6 +1071,12 @@ got:
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
+ if (encrypt) {
+ err = ext4_inherit_context(dir, inode);
+ if (err)
+ goto fail_free_drop;
+ }
+
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6ab50f8..1e7d65d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2437,20 +2437,7 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- err = 0;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (!err && (ext4_encrypted_inode(dir) ||
- DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) {
- err = ext4_inherit_context(dir, inode);
- if (err) {
- clear_nlink(inode);
- unlock_new_inode(inode);
- iput(inode);
- }
- }
-#endif
- if (!err)
- err = ext4_add_nondir(handle, dentry, inode);
+ err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
ext4_handle_sync(handle);
}
@@ -2631,14 +2618,6 @@ retry:
err = ext4_init_new_dir(handle, dir, inode);
if (err)
goto out_clear_inode;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (ext4_encrypted_inode(dir) ||
- DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) {
- err = ext4_inherit_context(dir, inode);
- if (err)
- goto out_clear_inode;
- }
-#endif
err = ext4_mark_inode_dirty(handle, inode);
if (!err)
err = ext4_add_entry(handle, dentry, inode);
@@ -3106,12 +3085,6 @@ static int ext4_symlink(struct inode *dir,
err = -ENOMEM;
goto err_drop_inode;
}
- err = ext4_inherit_context(dir, inode);
- if (err)
- goto err_drop_inode;
- err = ext4_get_encryption_info(inode);
- if (err)
- goto err_drop_inode;
istr.name = (const unsigned char *) symname;
istr.len = len;
ostr.name = sd->encrypted_path;
--
2.3.0
Previously we were taking the required padding when allocating space
for the on-disk symlink. This caused a buffer overrun which could
trigger a krenel crash when running fsstress.
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/crypto_fname.c | 25 +++++++++++++++----------
fs/ext4/ext4.h | 1 +
fs/ext4/namei.c | 32 +++++++++++++++++++++-----------
3 files changed, 37 insertions(+), 21 deletions(-)
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 23af41f..7dc4eb5 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -262,8 +262,20 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
return ((size+blksize-1)/blksize)*blksize;
}
-/**
- * ext4_fname_crypto_alloc_obuff() -
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen)
+{
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ int padding = 32;
+
+ if (ci)
+ padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
+ if (ilen < EXT4_CRYPTO_BLOCK_SIZE)
+ ilen = EXT4_CRYPTO_BLOCK_SIZE;
+ return ext4_fname_crypto_round_up(ilen, padding);
+}
+
+/*
+ * ext4_fname_crypto_alloc_buffer() -
*
* Allocates an output buffer that is sufficient for the crypto operation
* specified by the context and the direction.
@@ -271,15 +283,8 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
int ext4_fname_crypto_alloc_buffer(struct inode *inode,
u32 ilen, struct ext4_str *crypto_str)
{
- unsigned int olen;
- int padding = 16;
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ unsigned int olen = ext4_fname_encrypted_size(inode, ilen);
- if (ci)
- padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
- if (padding < EXT4_CRYPTO_BLOCK_SIZE)
- padding = EXT4_CRYPTO_BLOCK_SIZE;
- olen = ext4_fname_crypto_round_up(ilen, padding);
crypto_str->len = olen;
if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bd8d32d..730c88d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2090,6 +2090,7 @@ static inline int ext4_sb_has_crypto(struct super_block *sb)
/* crypto_fname.c */
bool ext4_valid_filenames_enc_mode(uint32_t mode);
u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen);
int ext4_fname_crypto_alloc_buffer(struct inode *inode,
u32 ilen, struct ext4_str *crypto_str);
int _ext4_fname_disk_to_usr(struct inode *inode,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 401b099..bda4a5d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3039,10 +3039,23 @@ static int ext4_symlink(struct inode *dir,
encryption_required = (ext4_encrypted_inode(dir) ||
DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
- if (encryption_required)
- disk_link.len = encrypted_symlink_data_len(len) + 1;
- if (disk_link.len > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
+ if (encryption_required) {
+ err = ext4_get_encryption_info(dir);
+ if (err)
+ return err;
+ if (ext4_encryption_info(dir) == NULL)
+ return -EPERM;
+ disk_link.len = (ext4_fname_encrypted_size(dir, len) +
+ sizeof(struct ext4_encrypted_symlink_data));
+ sd = kzalloc(disk_link.len, GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+ }
+
+ if (disk_link.len > dir->i_sb->s_blocksize) {
+ err = -ENAMETOOLONG;
+ goto err_free_sd;
+ }
dquot_initialize(dir);
@@ -3073,18 +3086,14 @@ static int ext4_symlink(struct inode *dir,
if (IS_ERR(inode)) {
if (handle)
ext4_journal_stop(handle);
- return PTR_ERR(inode);
+ err = PTR_ERR(inode);
+ goto err_free_sd;
}
if (encryption_required) {
struct qstr istr;
struct ext4_str ostr;
- sd = kzalloc(disk_link.len, GFP_NOFS);
- if (!sd) {
- err = -ENOMEM;
- goto err_drop_inode;
- }
istr.name = (const unsigned char *) symname;
istr.len = len;
ostr.name = sd->encrypted_path;
@@ -3156,10 +3165,11 @@ static int ext4_symlink(struct inode *dir,
err_drop_inode:
if (handle)
ext4_journal_stop(handle);
- kfree(sd);
clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
+err_free_sd:
+ kfree(sd);
return err;
}
--
2.3.0
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/dir.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index e11e6ae..f9e1491 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -593,6 +593,13 @@ finished:
return 0;
}
+static int ext4_dir_open(struct inode * inode, struct file * filp)
+{
+ if (ext4_encrypted_inode(inode))
+ return ext4_get_encryption_info(inode) ? -EACCES : 0;
+ return 0;
+}
+
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
if (filp->private_data)
@@ -635,5 +642,6 @@ const struct file_operations ext4_dir_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
+ .open = ext4_dir_open,
.release = ext4_release_dir,
};
--
2.3.0
Set up the encryption information for newly created inodes immediately
after they inherit their encryption context from their parent
directories.
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/crypto_policy.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index 683391f..81980a15 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -206,6 +206,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child)
if (!res) {
ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA);
+ res = ext4_get_encryption_info(child);
}
return res;
}
--
2.3.0
Fix a potential memory leak where fname->crypto_buf.name wouldn't get
freed in some error paths, and also make the error handling easier to
understand/audit.
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/crypto_fname.c | 35 ++++++++++++++++-------------------
1 file changed, 16 insertions(+), 19 deletions(-)
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 29a2dc9..23af41f 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -401,7 +401,7 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
((iname->name[1] == '.') && (iname->len == 2))))) {
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
- goto out;
+ return 0;
}
ret = ext4_get_encryption_info(dir);
if (ret)
@@ -411,19 +411,16 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
ret = ext4_fname_crypto_alloc_buffer(dir, iname->len,
&fname->crypto_buf);
if (ret < 0)
- goto out;
+ return ret;
ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf);
if (ret < 0)
- goto out;
+ goto errout;
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
- ret = 0;
- goto out;
- }
- if (!lookup) {
- ret = -EACCES;
- goto out;
+ return 0;
}
+ if (!lookup)
+ return -EACCES;
/* We don't have the key and we are doing a lookup; decode the
* user-supplied name
@@ -431,19 +428,17 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
if (iname->name[0] == '_')
bigname = 1;
if ((bigname && (iname->len != 33)) ||
- (!bigname && (iname->len > 43))) {
- ret = -ENOENT;
- }
+ (!bigname && (iname->len > 43)))
+ return -ENOENT;
+
fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
- if (fname->crypto_buf.name == NULL) {
- ret = -ENOMEM;
- goto out;
- }
+ if (fname->crypto_buf.name == NULL)
+ return -ENOMEM;
ret = digest_decode(iname->name + bigname, iname->len - bigname,
fname->crypto_buf.name);
if (ret < 0) {
ret = -ENOENT;
- goto out;
+ goto errout;
}
fname->crypto_buf.len = ret;
if (bigname) {
@@ -453,8 +448,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
}
- ret = 0;
-out:
+ return 0;
+errout:
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
return ret;
}
--
2.3.0
Thanks to Chao Yu <[email protected]> for pointing out the need for
this check.
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/namei.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1e7d65d..401b099 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3647,6 +3647,15 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
u8 new_file_type;
int retval;
+ if ((ext4_encrypted_inode(old_dir) ||
+ ext4_encrypted_inode(new_dir)) &&
+ (old_dir != new_dir) &&
+ (!ext4_is_child_context_consistent_with_parent(new_dir,
+ old.inode) ||
+ !ext4_is_child_context_consistent_with_parent(old_dir,
+ new.inode)))
+ return -EPERM;
+
dquot_initialize(old.dir);
dquot_initialize(new.dir);
--
2.3.0
Thanks to Chao Yu <[email protected]> for pointing out we were
missing this check.
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/crypto_policy.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index 81980a15..a1d434d 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -93,6 +93,8 @@ int ext4_process_policy(const struct ext4_encryption_policy *policy,
return -EINVAL;
if (!ext4_inode_has_encryption_context(inode)) {
+ if (!S_ISDIR(inode->i_mode))
+ return -EINVAL;
if (!ext4_empty_dir(inode))
return -ENOTEMPTY;
return ext4_create_encryption_context_from_policy(inode,
--
2.3.0
2015-05-28 20:47 GMT-03:00 Theodore Ts'o <[email protected]>:
> +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
> +{
> + return NULL;
> +}
> #endif
Ts´o, this code can not return anything be empty or not ?
Albino
On Fri, May 29, 2015 at 08:09:01AM -0300, Albino Biasutti Neto wrote:
> 2015-05-28 20:47 GMT-03:00 Theodore Ts'o <[email protected]>:
> > +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
> > +{
> > + return NULL;
> > +}
> > #endif
>
> Ts?o, this code can not return anything be empty or not ?
I'm not sure what you ar asking, since I can't quite parse your
question as a valid english sentence, sorry.
I think what you are asking is why is this function always returning
NULL?
That's because it's in the #else clause of an #ifdef
CONFIG_EXT4_FS_ENCRYPTION. In the case where
CONFIG_EXT4_FS_ENCRYPTION is defined, it returns:
static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
{
return EXT4_I(inode)->i_crypt_info;
}
The idea is to minimize the need for #ifdef's in the .c files.
Regards,
- Ted
On Thu, May 28, 2015 at 07:47:40PM -0400, Theodore Ts'o wrote:
> ext4_encrypted_zeroout() could end up leaking a bio and bounce page.
> Fortunately it's not used much. While we're fixing things up,
> refactor out common code into the static function alloc_bounce_page().
>
> Signed-off-by: Theodore Ts'o <[email protected]>
> ---
> fs/ext4/crypto.c | 56 +++++++++++++++++++++++++-------------------------------
> 1 file changed, 25 insertions(+), 31 deletions(-)
>
> diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
> index 68c7ab8..e43ed93 100644
> --- a/fs/ext4/crypto.c
> +++ b/fs/ext4/crypto.c
> @@ -324,6 +324,28 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
> return 0;
> }
>
> +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx)
> +{
> + struct page *ciphertext_page = alloc_page(GFP_NOFS);
> +
> + if (!ciphertext_page) {
> + /* This is a potential bottleneck, but at least we'll have
> + * forward progress. */
> + ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
> + GFP_NOFS);
> + if (WARN_ON_ONCE(!ciphertext_page)) {
> + ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
> + GFP_NOFS | __GFP_WAIT);
Not sure why __GFP_WAIT is defined here.
Even GFP_NOFS has __GFP_WAIT.
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
IMO, __GFP_NOFAIL should be used here?
Otherwise, we seem to handle ENOMEM instead.
Thanks,
> + }
> + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
> + } else {
> + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
> + }
> + ctx->flags |= EXT4_WRITE_PATH_FL;
> + ctx->w.bounce_page = ciphertext_page;
> + return ciphertext_page;
> +}
> +
> /**
> * ext4_encrypt() - Encrypts a page
> * @inode: The inode for which the encryption should take place
> @@ -353,22 +375,7 @@ struct page *ext4_encrypt(struct inode *inode,
> return (struct page *) ctx;
>
> /* The encryption operation will require a bounce page. */
> - ciphertext_page = alloc_page(GFP_NOFS);
> - if (!ciphertext_page) {
> - /* This is a potential bottleneck, but at least we'll have
> - * forward progress. */
> - ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
> - GFP_NOFS);
> - if (WARN_ON_ONCE(!ciphertext_page)) {
> - ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
> - GFP_NOFS | __GFP_WAIT);
> - }
> - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
> - } else {
> - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
> - }
> - ctx->flags |= EXT4_WRITE_PATH_FL;
> - ctx->w.bounce_page = ciphertext_page;
> + ciphertext_page = alloc_bounce_page(ctx);
> ctx->w.control_page = plaintext_page;
> err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
> plaintext_page, ciphertext_page);
> @@ -434,21 +441,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
> if (IS_ERR(ctx))
> return PTR_ERR(ctx);
>
> - ciphertext_page = alloc_page(GFP_NOFS);
> - if (!ciphertext_page) {
> - /* This is a potential bottleneck, but at least we'll have
> - * forward progress. */
> - ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
> - GFP_NOFS);
> - if (WARN_ON_ONCE(!ciphertext_page)) {
> - ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
> - GFP_NOFS | __GFP_WAIT);
> - }
> - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
> - } else {
> - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
> - }
> - ctx->w.bounce_page = ciphertext_page;
> + ciphertext_page = alloc_bounce_page(ctx);
>
> while (len--) {
> err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
> @@ -470,6 +463,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
> goto errout;
> }
> err = submit_bio_wait(WRITE, bio);
> + bio_put(bio);
> if (err)
> goto errout;
> }
> --
> 2.3.0
2015-05-29 13:33 GMT-03:00 Theodore Ts'o <[email protected]>:
> I think what you are asking is why is this function always returning
> NULL?
Yes. Or empty, only function.
> That's because it's in the #else clause of an #ifdef
> CONFIG_EXT4_FS_ENCRYPTION. In the case where
> CONFIG_EXT4_FS_ENCRYPTION is defined, it returns:
>
> static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
> {
> return EXT4_I(inode)->i_crypt_info;
> }
>
> The idea is to minimize the need for #ifdef's in the .c files.
Understand, thanks.
Albino
On Fri, May 29, 2015 at 11:08:34AM -0700, Jaegeuk Kim wrote:
> Not sure why __GFP_WAIT is defined here.
> Even GFP_NOFS has __GFP_WAIT.
>
> #define GFP_NOFS (__GFP_WAIT | __GFP_IO)
>
> IMO, __GFP_NOFAIL should be used here?
> Otherwise, we seem to handle ENOMEM instead.
You're right, thanks for pointing that out. I've since changed the
patch so that we just return ENOMEM, and then made sure that we could
correctly handle ENOMEM.
On a different front, I've been thinking more about your proposal to
swap alloc_pages() and mempool_alloc(), since you were apparently
seeing OOM killers with a sufficiently aggressive fio workload on a
memory constrained device.
I took a closer look at how mempool_alloc() works, and in fact it will
try alloc_pages() first; but with GFP flags which causes it to not try
very hard. (__GFP_NOMEMALLOC, __GFP_NORETRY, __GFP_NOWARN,
!__GFP_WAIT, !__GFP_IO).
If this fails, it tries to use the memory pool, and then assuming the
original request includes __GFP_WAIT, it will retry the alloc_pages()
using the original gfp mask, if this *still* fails, it will wait for a
page to be released to the mempool. As a result, mempool_alloc() will
never fail when called with __GFP_WAIT, which means my original
concerns over mempool_alloc() failing were misplaced.
This has lead me to the following thoughts/conclusions:
1) We can just drop the alloc_page() call from alloc_bounce_page(),
since mempool_alloc() will call alloc_page() first. This also allows
us to remove all of the complexity relating to the
EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL flag.
2) Since mempool_alloc() will end up calling alloc_pages() --- and in
fact will try to alloc_pages() *twice* --- once with a "don't try
hard", and then a second time the way we would have alwyas called it
--- in practice this won't reduce the number of calls to allocate and
free pages assuming that there is enough memory in the system.
3) I don't think increasing num_prealloc_crypto_pages() to BIO_MAX is
the best idea; (a) it sequesters a large amount of memory, and (b)
even 256 pages is guaranteed to be enough. On a Samsung 840 EVO (for
example), you can have up to 32767k in a single request, and up to 31
requests in its NCQ. So in the worst case, you can have close to a
gigabyte of outstanding writes, and thus with a sufficiently evil set
of fio options, we could end up needing that many bounce pages. And
remember, the mempool is a shared resource; if we have multiple
devices with encrypted file systems, the memory pressure could be even
greater.
4) I need to run the experiment, but what might make sense is to just
call mempool_alloc() with GFP_NOWAIT. This will allow us to use
memory if it is available, but if not, we will simply retry the page
for writeback. The mempool is there simply to provide a bit of extra
spare capacity so that we send out reasonably sized I/O's when under
very high memory pressure. We can make the size of the mempool a
tunable which is dynamically adjustable using a sysfs file, and we can
see what seems to be the best tradeoff between having a fixed memory
allocation and performance under high memory pressure and heavy write
loads.
What do you think? Does that make sense? If so, you might want to
try a similar experiment; try removing the alloc_page() fallback
altogether, try calling mempool_alloc() with GFP_NOWAIT, and then try
different settings for num_prealloc_crypto_pages.
- Ted
On Sun, May 31, 2015 at 10:26:01AM -0400, Theodore Ts'o wrote:
> On Fri, May 29, 2015 at 11:08:34AM -0700, Jaegeuk Kim wrote:
> > Not sure why __GFP_WAIT is defined here.
> > Even GFP_NOFS has __GFP_WAIT.
> >
> > #define GFP_NOFS (__GFP_WAIT | __GFP_IO)
> >
> > IMO, __GFP_NOFAIL should be used here?
> > Otherwise, we seem to handle ENOMEM instead.
>
> You're right, thanks for pointing that out. I've since changed the
> patch so that we just return ENOMEM, and then made sure that we could
> correctly handle ENOMEM.
>
>
> On a different front, I've been thinking more about your proposal to
> swap alloc_pages() and mempool_alloc(), since you were apparently
> seeing OOM killers with a sufficiently aggressive fio workload on a
> memory constrained device.
>
> I took a closer look at how mempool_alloc() works, and in fact it will
> try alloc_pages() first; but with GFP flags which causes it to not try
> very hard. (__GFP_NOMEMALLOC, __GFP_NORETRY, __GFP_NOWARN,
> !__GFP_WAIT, !__GFP_IO).
>
> If this fails, it tries to use the memory pool, and then assuming the
> original request includes __GFP_WAIT, it will retry the alloc_pages()
> using the original gfp mask, if this *still* fails, it will wait for a
> page to be released to the mempool. As a result, mempool_alloc() will
> never fail when called with __GFP_WAIT, which means my original
> concerns over mempool_alloc() failing were misplaced.
Thank you for pointing out this.
Indeed, mempool_alloc never fails if __GFP_WAIT is set.
>
> This has lead me to the following thoughts/conclusions:
>
> 1) We can just drop the alloc_page() call from alloc_bounce_page(),
> since mempool_alloc() will call alloc_page() first. This also allows
> us to remove all of the complexity relating to the
> EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL flag.
Agreed.
So, once I tested a couple of flags given to the origial alloc_page(), it turns
out that my OOM killer issue was occurred due to __GFP_WAIT.
When I tested the original allo_page() with GFP_NOWAIT, the issue was gone.
>
> 2) Since mempool_alloc() will end up calling alloc_pages() --- and in
> fact will try to alloc_pages() *twice* --- once with a "don't try
> hard", and then a second time the way we would have alwyas called it
> --- in practice this won't reduce the number of calls to allocate and
> free pages assuming that there is enough memory in the system.
Right, the call count of alloc/free_pages was not the actual issue here.
>
> 3) I don't think increasing num_prealloc_crypto_pages() to BIO_MAX is
> the best idea; (a) it sequesters a large amount of memory, and (b)
> even 256 pages is guaranteed to be enough. On a Samsung 840 EVO (for
> example), you can have up to 32767k in a single request, and up to 31
> requests in its NCQ. So in the worst case, you can have close to a
> gigabyte of outstanding writes, and thus with a sufficiently evil set
> of fio options, we could end up needing that many bounce pages. And
> remember, the mempool is a shared resource; if we have multiple
> devices with encrypted file systems, the memory pressure could be even
> greater.
Agreed. I thought that mempool could be used to reuse the pages in the pool
as many as possible. But, I misunderstood the usage of it.
>
> 4) I need to run the experiment, but what might make sense is to just
> call mempool_alloc() with GFP_NOWAIT. This will allow us to use
> memory if it is available, but if not, we will simply retry the page
> for writeback. The mempool is there simply to provide a bit of extra
> spare capacity so that we send out reasonably sized I/O's when under
> very high memory pressure. We can make the size of the mempool a
> tunable which is dynamically adjustable using a sysfs file, and we can
> see what seems to be the best tradeoff between having a fixed memory
> allocation and performance under high memory pressure and heavy write
> loads.
It seems that the number of preallocated pages is not a critical issue to me
as of now. But, definitely, it should be tunable through sysfs for various
environments.
>
> What do you think? Does that make sense? If so, you might want to
> try a similar experiment; try removing the alloc_page() fallback
> altogether, try calling mempool_alloc() with GFP_NOWAIT, and then try
> different settings for num_prealloc_crypto_pages.
Obviously, as your suggestion, it'd be better to call mempool_alloc(GFP_NOWAIT)
as the final approach [1]. I've also seen that this can resolve all my concerns.
Regarding to the num_prealloc_crypto_pages, let me take a time to investigate
further in terms of the performance under different workloads.
Thank you so much,
[1]
---
>From 16655dbe229610d679fd449a22b6f4565edfb89d Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <[email protected]>
Date: Mon, 1 Jun 2015 12:39:30 -0700
Subject: [PATCH] f2fs crypto: remove alloc_page for bounce_page
We don't need to call alloc_page() prior to mempool_alloc(), since the
mempool_alloc() calls alloc_page() internally.
And, if __GFP_WAIT is set, it never fails on page allocation, so let's
give GFP_NOWAIT and handle ENOMEM by writepage().
Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/crypto.c | 33 ++++++++++++---------------------
fs/f2fs/f2fs_crypto.h | 3 +--
2 files changed, 13 insertions(+), 23 deletions(-)
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
index 2c7819a..be6af18 100644
--- a/fs/f2fs/crypto.c
+++ b/fs/f2fs/crypto.c
@@ -83,10 +83,7 @@ void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
unsigned long flags;
if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
- if (ctx->flags & F2FS_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL)
- __free_page(ctx->w.bounce_page);
- else
- mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
+ mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
ctx->w.bounce_page = NULL;
}
ctx->w.control_page = NULL;
@@ -408,34 +405,28 @@ struct page *f2fs_encrypt(struct inode *inode,
return (struct page *)ctx;
/* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_page(GFP_NOFS);
+ ciphertext_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
if (!ciphertext_page) {
- /*
- * This is a potential bottleneck, but at least we'll have
- * forward progress.
- */
- ciphertext_page = mempool_alloc(f2fs_bounce_page_pool,
- GFP_NOFS);
- if (WARN_ON_ONCE(!ciphertext_page))
- ciphertext_page = mempool_alloc(f2fs_bounce_page_pool,
- GFP_NOFS | __GFP_WAIT);
- ctx->flags &= ~F2FS_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
- } else {
- ctx->flags |= F2FS_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+ err = -ENOMEM;
+ goto err_out;
}
+
ctx->flags |= F2FS_WRITE_PATH_FL;
ctx->w.bounce_page = ciphertext_page;
ctx->w.control_page = plaintext_page;
err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index,
plaintext_page, ciphertext_page);
- if (err) {
- f2fs_release_crypto_ctx(ctx);
- return ERR_PTR(err);
- }
+ if (err)
+ goto err_out;
+
SetPagePrivate(ciphertext_page);
set_page_private(ciphertext_page, (unsigned long)ctx);
lock_page(ciphertext_page);
return ciphertext_page;
+
+err_out:
+ f2fs_release_crypto_ctx(ctx);
+ return ERR_PTR(err);
}
/**
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
index be59d91..c2c1c2b 100644
--- a/fs/f2fs/f2fs_crypto.h
+++ b/fs/f2fs/f2fs_crypto.h
@@ -84,8 +84,7 @@ struct f2fs_crypt_info {
};
#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
-#define F2FS_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002
-#define F2FS_WRITE_PATH_FL 0x00000004
+#define F2FS_WRITE_PATH_FL 0x00000002
struct f2fs_crypto_ctx {
union {
--
2.1.1