This patch set implements secure delete for ext4. New
flags have been added to ext4_free_blocks and ext4_delete_entry
that cause the data blocks and directory entries to be
zeroed before they are released. Files that have the
secure remove attribute flag on will have their data and
directory entries zeroed when they are deleted.
Allison Henderson (2):
EXT4: Secure Delete: Zero out file data
EXT4: Secure Delete: Zero out files directory entry
fs/ext4/ext4.h | 6 ++++++
fs/ext4/extents.c | 3 +++
fs/ext4/inode.c | 3 +++
fs/ext4/mballoc.c | 8 ++++++++
fs/ext4/namei.c | 52 +++++++++++++++++++++++++++++++++++++++++-----------
fs/ext4/xattr.c | 12 ++++++++----
6 files changed, 69 insertions(+), 15 deletions(-)
The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
to ext4_free_blocks. This flag causes causes blocks to be
zerod before they are freed. Files that have the EXT4_SECRM_FL
attribute flag on will have their blocks zerod when
they are released. The EXT4_SECRM_FL attribute flag can
be enabled useing chattr +s
Signed-off-by: Allison Henderson <[email protected]>
---
v1->v2
Removed check for discard mount option and replaced with
check for secure discard and discard_zeroes_data
Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
v2->v3
Removed code for discard. A seperate patch will
be done to add that code in the block layer
:100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
:100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
:100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
:100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
:100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
fs/ext4/ext4.h | 1 +
fs/ext4/extents.c | 3 +++
fs/ext4/inode.c | 3 +++
fs/ext4/mballoc.c | 8 ++++++++
fs/ext4/xattr.c | 12 ++++++++----
5 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1921392..38a4d75 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -526,6 +526,7 @@ struct ext4_new_group_data {
#define EXT4_FREE_BLOCKS_METADATA 0x0001
#define EXT4_FREE_BLOCKS_FORGET 0x0002
#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
+#define EXT4_FREE_BLOCKS_ZERO 0x0008
/*
* ioctl commands
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f815cc8..cf178f3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
flags |= EXT4_FREE_BLOCKS_METADATA;
+
+ if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
+ flags |= EXT4_FREE_BLOCKS_ZERO;
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 62f86e7..8fdae7d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
for (p = first; p < last; p++)
*p = 0;
+ if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
+ flags |= EXT4_FREE_BLOCKS_ZERO;
+
ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
return 0;
out_err:
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6ed859d..94872b2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_debug("freeing block %llu\n", block);
trace_ext4_free_blocks(inode, block, count, flags);
+ if (flags & EXT4_FREE_BLOCKS_ZERO) {
+ err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
+ if (err < 0)
+ goto error_return;
+ else
+ err = 0;
+ }
+
if (flags & EXT4_FREE_BLOCKS_FORGET) {
struct buffer_head *tbh = bh;
int i;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc..1ff7532 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
struct mb_cache_entry *ce = NULL;
- int error = 0;
+ int free_blocks_flags, error = 0;
ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
error = ext4_journal_get_write_access(handle, bh);
@@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (ce)
mb_cache_entry_free(ce);
get_bh(bh);
- ext4_free_blocks(handle, inode, bh, 0, 1,
- EXT4_FREE_BLOCKS_METADATA |
- EXT4_FREE_BLOCKS_FORGET);
+ free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET;
+
+ if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
+ free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
+
+ ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
error = ext4_handle_dirty_metadata(handle, inode, bh);
--
1.7.1
This patch zeros out a files directory entry when a file
with the EXT4_SECRM_FL attribute flag is deleted. A new flag
parameter has been added to the ext4_delete_entry routine,
that will force the entry to be zeroed out and flushed to the disk.
Signed-off-by: Allison Henderson <[email protected]>
---
v1->v2
Removed new inode parameter in ext4_delete_entry and replaced
with a new flag for ext4_delete_entry
:100644 100644 38a4d75... 1e50f78... M fs/ext4/ext4.h
:100644 100644 b754b77... 9af19f4... M fs/ext4/namei.c
fs/ext4/ext4.h | 5 +++++
fs/ext4/namei.c | 52 +++++++++++++++++++++++++++++++++++++++++-----------
2 files changed, 46 insertions(+), 11 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 38a4d75..1e50f78 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -529,6 +529,11 @@ struct ext4_new_group_data {
#define EXT4_FREE_BLOCKS_ZERO 0x0008
/*
+ * Flags used by ext4_delete_entry
+ */
+#define EXT4_DEL_ENTRY_ZERO 0x0001
+
+/*
* ioctl commands
*/
#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b754b77..9af19f4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1643,10 +1643,12 @@ cleanup:
static int ext4_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh)
+ struct buffer_head *bh,
+ int flags)
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
+ struct ext4_super_block *es = EXT4_SB(dir->i_sb)->s_es;
int i, err;
i = 0;
@@ -1673,10 +1675,34 @@ static int ext4_delete_entry(handle_t *handle,
de->inode = 0;
dir->i_version++;
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, bh);
- if (unlikely(err)) {
- ext4_std_error(dir->i_sb, err);
- return err;
+
+ /*
+ * If the secure remove flag is on, zero
+ * out the entry and write it out to the
+ * disk
+ */
+ if (flags & EXT4_DEL_ENTRY_ZERO) {
+ memset(de->name, 0x00, de->name_len);
+ de->file_type = 0;
+
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh)) {
+ es->s_last_error_block =
+ cpu_to_le64(bh->b_blocknr);
+ ext4_error_inode(dir,
+ __func__, __LINE__, bh->b_blocknr,
+ "IO error syncing itable block");
+ return -EIO;
+ }
+ } else {
+ err = ext4_handle_dirty_metadata(handle,
+ dir, bh);
+
+ if (unlikely(err)) {
+ ext4_std_error(dir->i_sb, err);
+ return err;
+ }
}
return 0;
}
@@ -2162,7 +2188,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (!empty_dir(inode))
goto end_rmdir;
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, 0);
if (retval)
goto end_rmdir;
if (!EXT4_DIR_LINK_EMPTY(inode))
@@ -2190,7 +2216,7 @@ end_rmdir:
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
- int retval;
+ int retval, del_entry_flags;
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
@@ -2215,6 +2241,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
goto end_unlink;
inode = dentry->d_inode;
+ del_entry_flags = EXT4_I(inode)->i_flags & EXT4_SECRM_FL ?
+ EXT4_DEL_ENTRY_ZERO : 0;
retval = -EIO;
if (le32_to_cpu(de->inode) != inode->i_ino)
@@ -2226,7 +2254,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ino, inode->i_nlink);
inode->i_nlink = 1;
}
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, del_entry_flags);
if (retval)
goto end_unlink;
dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
@@ -2404,7 +2432,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
- int retval, force_da_alloc = 0;
+ int retval, del_entry_flags, force_da_alloc = 0;
dquot_initialize(old_dir);
dquot_initialize(new_dir);
@@ -2503,11 +2531,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
/*
* ok, that's it
*/
+ del_entry_flags = EXT4_I(old_inode)->i_flags & EXT4_SECRM_FL ?
+ EXT4_DEL_ENTRY_ZERO : 0;
if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
old_de->name_len != old_dentry->d_name.len ||
strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
(retval = ext4_delete_entry(handle, old_dir,
- old_de, old_bh)) == -ENOENT) {
+ old_de, old_bh, del_entry_flags)) == -ENOENT) {
/* old_de could have moved from under us during htree split, so
* make sure that we are deleting the right entry. We might
* also be pointing to a stale entry in the unused part of
@@ -2518,7 +2548,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
if (old_bh2) {
retval = ext4_delete_entry(handle, old_dir,
- old_de2, old_bh2);
+ old_de2, old_bh2, del_entry_flags);
brelse(old_bh2);
}
}
--
1.7.1
On 2011-06-30, at 3:22 PM, Allison Henderson wrote:
> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
> to ext4_free_blocks. This flag causes causes blocks to be
> zerod before they are freed. Files that have the EXT4_SECRM_FL
> attribute flag on will have their blocks zerod when
> they are released. The EXT4_SECRM_FL attribute flag can
> be enabled useing chattr +s
>
> Signed-off-by: Allison Henderson <[email protected]>
> ---
> v1->v2
> Removed check for discard mount option and replaced with
> check for secure discard and discard_zeroes_data
>
> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>
> v2->v3
> Removed code for discard. A seperate patch will
> be done to add that code in the block layer
>
> :100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
> :100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
> :100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
> :100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
> :100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
> fs/ext4/ext4.h | 1 +
> fs/ext4/extents.c | 3 +++
> fs/ext4/inode.c | 3 +++
> fs/ext4/mballoc.c | 8 ++++++++
> fs/ext4/xattr.c | 12 ++++++++----
> 5 files changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1921392..38a4d75 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
> #define EXT4_FREE_BLOCKS_METADATA 0x0001
> #define EXT4_FREE_BLOCKS_FORGET 0x0002
> #define EXT4_FREE_BLOCKS_VALIDATED 0x0004
> +#define EXT4_FREE_BLOCKS_ZERO 0x0008
>
> /*
> * ioctl commands
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index f815cc8..cf178f3 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>
> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> flags |= EXT4_FREE_BLOCKS_METADATA;
> +
> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + flags |= EXT4_FREE_BLOCKS_ZERO;
> #ifdef EXTENTS_STATS
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 62f86e7..8fdae7d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
> for (p = first; p < last; p++)
> *p = 0;
>
> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + flags |= EXT4_FREE_BLOCKS_ZERO;
> +
> ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
> return 0;
> out_err:
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 6ed859d..94872b2 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> ext4_debug("freeing block %llu\n", block);
> trace_ext4_free_blocks(inode, block, count, flags);
>
> + if (flags & EXT4_FREE_BLOCKS_ZERO) {
> + err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
Does sb_issue_zeroout() use the SCSI "write same" feature in the
background? That would avoid busying the CPU/controller/bus with
writing out zeroes, which might be expensive for a large file.
> + if (err < 0)
> + goto error_return;
> + else
> + err = 0;
> + }
> +
> if (flags & EXT4_FREE_BLOCKS_FORGET) {
> struct buffer_head *tbh = bh;
> int i;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index c757adc..1ff7532 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> struct buffer_head *bh)
> {
> struct mb_cache_entry *ce = NULL;
> - int error = 0;
> + int free_blocks_flags, error = 0;
>
> ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
> error = ext4_journal_get_write_access(handle, bh);
> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> if (ce)
> mb_cache_entry_free(ce);
> get_bh(bh);
> - ext4_free_blocks(handle, inode, bh, 0, 1,
> - EXT4_FREE_BLOCKS_METADATA |
> - EXT4_FREE_BLOCKS_FORGET);
> + free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET;
> +
> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
> +
> + ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
> } else {
> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
> error = ext4_handle_dirty_metadata(handle, inode, bh);
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Cheers, Andreas
On 06/30/2011 03:15 PM, Andreas Dilger wrote:
> On 2011-06-30, at 3:22 PM, Allison Henderson wrote:
>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>> to ext4_free_blocks. This flag causes causes blocks to be
>> zerod before they are freed. Files that have the EXT4_SECRM_FL
>> attribute flag on will have their blocks zerod when
>> they are released. The EXT4_SECRM_FL attribute flag can
>> be enabled useing chattr +s
>>
>> Signed-off-by: Allison Henderson<[email protected]>
>> ---
>> v1->v2
>> Removed check for discard mount option and replaced with
>> check for secure discard and discard_zeroes_data
>>
>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>
>> v2->v3
>> Removed code for discard. A seperate patch will
>> be done to add that code in the block layer
>>
>> :100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
>> :100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
>> :100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
>> :100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
>> :100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
>> fs/ext4/ext4.h | 1 +
>> fs/ext4/extents.c | 3 +++
>> fs/ext4/inode.c | 3 +++
>> fs/ext4/mballoc.c | 8 ++++++++
>> fs/ext4/xattr.c | 12 ++++++++----
>> 5 files changed, 23 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1921392..38a4d75 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>> #define EXT4_FREE_BLOCKS_METADATA 0x0001
>> #define EXT4_FREE_BLOCKS_FORGET 0x0002
>> #define EXT4_FREE_BLOCKS_VALIDATED 0x0004
>> +#define EXT4_FREE_BLOCKS_ZERO 0x0008
>>
>> /*
>> * ioctl commands
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index f815cc8..cf178f3 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>>
>> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>> flags |= EXT4_FREE_BLOCKS_METADATA;
>> +
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> #ifdef EXTENTS_STATS
>> {
>> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 62f86e7..8fdae7d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>> for (p = first; p< last; p++)
>> *p = 0;
>>
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
>> return 0;
>> out_err:
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 6ed859d..94872b2 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>> ext4_debug("freeing block %llu\n", block);
>> trace_ext4_free_blocks(inode, block, count, flags);
>>
>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>> + err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
>
> Does sb_issue_zeroout() use the SCSI "write same" feature in the
> background? That would avoid busying the CPU/controller/bus with
> writing out zeroes, which might be expensive for a large file.
>
Hmm, that's a good question, I will dig into it and see if I can find out.
>> + if (err< 0)
>> + goto error_return;
>> + else
>> + err = 0;
>> + }
>> +
>> if (flags& EXT4_FREE_BLOCKS_FORGET) {
>> struct buffer_head *tbh = bh;
>> int i;
>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>> index c757adc..1ff7532 100644
>> --- a/fs/ext4/xattr.c
>> +++ b/fs/ext4/xattr.c
>> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> struct buffer_head *bh)
>> {
>> struct mb_cache_entry *ce = NULL;
>> - int error = 0;
>> + int free_blocks_flags, error = 0;
>>
>> ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
>> error = ext4_journal_get_write_access(handle, bh);
>> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> if (ce)
>> mb_cache_entry_free(ce);
>> get_bh(bh);
>> - ext4_free_blocks(handle, inode, bh, 0, 1,
>> - EXT4_FREE_BLOCKS_METADATA |
>> - EXT4_FREE_BLOCKS_FORGET);
>> + free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
>> + EXT4_FREE_BLOCKS_FORGET;
>> +
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> + ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
>> } else {
>> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
>> error = ext4_handle_dirty_metadata(handle, inode, bh);
>> --
>> 1.7.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
> Cheers, Andreas
>
>
>
>
>
>>>>> "Allison" == Allison Henderson <[email protected]> writes:
>> Does sb_issue_zeroout() use the SCSI "write same" feature in the
>> background? That would avoid busying the CPU/controller/bus with
>> writing out zeroes, which might be expensive for a large file.
>>
Allison> Hmm, that's a good question, I will dig into it and see if I
Allison> can find out.
This is a bit of an ongoing project.
Unfortunately WRITE SAME is quirk central as many drives only implement
block ranges corresponding to what RAID vendors told them they needed.
Many of the drives I tested have internal caps at 16 or 32MB and will
fail in interesting (i.e. not necessarily graceful) ways if given bigger
ranges.
I've been lobbying for a way for devices to report their WRITE SAME
limit for a while. That feature finally made it into the latest SBC3
draft. The changes required to support it went into the SCSI layer
during the last merge window. What remains is wiring this up to
blkdev_issue_zeroout(). I have some patches sitting in my queue that I
hope to get polished and submitted soon.
Anyway. From a filesystem perspective sb_issue_zeroout() interface is
definitely the way to go. WRITE SAME will eventually be called if the
device supports it.
--
Martin K. Petersen Oracle Linux Engineering
On 06/30/2011 06:18 PM, Martin K. Petersen wrote:
>>>>>> "Allison" == Allison Henderson<[email protected]> writes:
>
>>> Does sb_issue_zeroout() use the SCSI "write same" feature in the
>>> background? That would avoid busying the CPU/controller/bus with
>>> writing out zeroes, which might be expensive for a large file.
>>>
> Allison> Hmm, that's a good question, I will dig into it and see if I
> Allison> can find out.
>
> This is a bit of an ongoing project.
>
> Unfortunately WRITE SAME is quirk central as many drives only implement
> block ranges corresponding to what RAID vendors told them they needed.
> Many of the drives I tested have internal caps at 16 or 32MB and will
> fail in interesting (i.e. not necessarily graceful) ways if given bigger
> ranges.
>
> I've been lobbying for a way for devices to report their WRITE SAME
> limit for a while. That feature finally made it into the latest SBC3
> draft. The changes required to support it went into the SCSI layer
> during the last merge window. What remains is wiring this up to
> blkdev_issue_zeroout(). I have some patches sitting in my queue that I
> hope to get polished and submitted soon.
>
> Anyway. From a filesystem perspective sb_issue_zeroout() interface is
> definitely the way to go. WRITE SAME will eventually be called if the
> device supports it.
>
Ah, I see, well that answers that question. Thx for the through
explanation. :)
On Thu, 30 Jun 2011, Allison Henderson wrote:
> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
> to ext4_free_blocks. This flag causes causes blocks to be
> zerod before they are freed. Files that have the EXT4_SECRM_FL
> attribute flag on will have their blocks zerod when
> they are released. The EXT4_SECRM_FL attribute flag can
> be enabled useing chattr +s
Hi Allison,
the patch looks good, however I have one little nitpick bellow:)
>
> Signed-off-by: Allison Henderson <[email protected]>
> ---
> v1->v2
> Removed check for discard mount option and replaced with
> check for secure discard and discard_zeroes_data
>
> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>
> v2->v3
> Removed code for discard. A seperate patch will
> be done to add that code in the block layer
>
> :100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
> :100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
> :100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
> :100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
> :100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
> fs/ext4/ext4.h | 1 +
> fs/ext4/extents.c | 3 +++
> fs/ext4/inode.c | 3 +++
> fs/ext4/mballoc.c | 8 ++++++++
> fs/ext4/xattr.c | 12 ++++++++----
> 5 files changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1921392..38a4d75 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
> #define EXT4_FREE_BLOCKS_METADATA 0x0001
> #define EXT4_FREE_BLOCKS_FORGET 0x0002
> #define EXT4_FREE_BLOCKS_VALIDATED 0x0004
> +#define EXT4_FREE_BLOCKS_ZERO 0x0008
>
> /*
> * ioctl commands
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index f815cc8..cf178f3 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>
> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> flags |= EXT4_FREE_BLOCKS_METADATA;
> +
> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + flags |= EXT4_FREE_BLOCKS_ZERO;
> #ifdef EXTENTS_STATS
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 62f86e7..8fdae7d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
> for (p = first; p < last; p++)
> *p = 0;
>
> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + flags |= EXT4_FREE_BLOCKS_ZERO;
> +
> ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
> return 0;
> out_err:
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 6ed859d..94872b2 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> ext4_debug("freeing block %llu\n", block);
> trace_ext4_free_blocks(inode, block, count, flags);
>
> + if (flags & EXT4_FREE_BLOCKS_ZERO) {
> + err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
> + if (err < 0)
> + goto error_return;
> + else
> + err = 0;
No need to set err=0. sb_issue_zeroout() will return zero on success.
Thanks!
-Lukas
> + }
> +
> if (flags & EXT4_FREE_BLOCKS_FORGET) {
> struct buffer_head *tbh = bh;
> int i;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index c757adc..1ff7532 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> struct buffer_head *bh)
> {
> struct mb_cache_entry *ce = NULL;
> - int error = 0;
> + int free_blocks_flags, error = 0;
>
> ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
> error = ext4_journal_get_write_access(handle, bh);
> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> if (ce)
> mb_cache_entry_free(ce);
> get_bh(bh);
> - ext4_free_blocks(handle, inode, bh, 0, 1,
> - EXT4_FREE_BLOCKS_METADATA |
> - EXT4_FREE_BLOCKS_FORGET);
> + free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET;
> +
> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
> +
> + ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
> } else {
> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
> error = ext4_handle_dirty_metadata(handle, inode, bh);
>
--
On 07/01/2011 03:26 AM, Lukas Czerner wrote:
> On Thu, 30 Jun 2011, Allison Henderson wrote:
>
>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>> to ext4_free_blocks. This flag causes causes blocks to be
>> zerod before they are freed. Files that have the EXT4_SECRM_FL
>> attribute flag on will have their blocks zerod when
>> they are released. The EXT4_SECRM_FL attribute flag can
>> be enabled useing chattr +s
>
> Hi Allison,
>
> the patch looks good, however I have one little nitpick bellow:)
>>
>> Signed-off-by: Allison Henderson<[email protected]>
>> ---
>> v1->v2
>> Removed check for discard mount option and replaced with
>> check for secure discard and discard_zeroes_data
>>
>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>
>> v2->v3
>> Removed code for discard. A seperate patch will
>> be done to add that code in the block layer
>>
>> :100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
>> :100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
>> :100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
>> :100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
>> :100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
>> fs/ext4/ext4.h | 1 +
>> fs/ext4/extents.c | 3 +++
>> fs/ext4/inode.c | 3 +++
>> fs/ext4/mballoc.c | 8 ++++++++
>> fs/ext4/xattr.c | 12 ++++++++----
>> 5 files changed, 23 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1921392..38a4d75 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>> #define EXT4_FREE_BLOCKS_METADATA 0x0001
>> #define EXT4_FREE_BLOCKS_FORGET 0x0002
>> #define EXT4_FREE_BLOCKS_VALIDATED 0x0004
>> +#define EXT4_FREE_BLOCKS_ZERO 0x0008
>>
>> /*
>> * ioctl commands
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index f815cc8..cf178f3 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>>
>> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>> flags |= EXT4_FREE_BLOCKS_METADATA;
>> +
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> #ifdef EXTENTS_STATS
>> {
>> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 62f86e7..8fdae7d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>> for (p = first; p< last; p++)
>> *p = 0;
>>
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
>> return 0;
>> out_err:
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 6ed859d..94872b2 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>> ext4_debug("freeing block %llu\n", block);
>> trace_ext4_free_blocks(inode, block, count, flags);
>>
>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>> + err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
>> + if (err< 0)
>> + goto error_return;
>> + else
>> + err = 0;
>
> No need to set err=0. sb_issue_zeroout() will return zero on success.
>
> Thanks!
> -Lukas
Ah, alrighty, I will pull that out. Thx! :)
Allison
>
>> + }
>> +
>> if (flags& EXT4_FREE_BLOCKS_FORGET) {
>> struct buffer_head *tbh = bh;
>> int i;
>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>> index c757adc..1ff7532 100644
>> --- a/fs/ext4/xattr.c
>> +++ b/fs/ext4/xattr.c
>> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> struct buffer_head *bh)
>> {
>> struct mb_cache_entry *ce = NULL;
>> - int error = 0;
>> + int free_blocks_flags, error = 0;
>>
>> ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
>> error = ext4_journal_get_write_access(handle, bh);
>> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> if (ce)
>> mb_cache_entry_free(ce);
>> get_bh(bh);
>> - ext4_free_blocks(handle, inode, bh, 0, 1,
>> - EXT4_FREE_BLOCKS_METADATA |
>> - EXT4_FREE_BLOCKS_FORGET);
>> + free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
>> + EXT4_FREE_BLOCKS_FORGET;
>> +
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> + ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
>> } else {
>> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
>> error = ext4_handle_dirty_metadata(handle, inode, bh);
>>
>
On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
<[email protected]> wrote:
> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
> to ext4_free_blocks. This flag causes causes blocks to be
> zerod before they are freed. ?Files that have the EXT4_SECRM_FL
> attribute flag on will have their blocks zerod when
> they are released. ?The EXT4_SECRM_FL attribute flag can
> be enabled useing chattr +s
>
> Signed-off-by: Allison Henderson <[email protected]>
> ---
> v1->v2
> Removed check for discard mount option and replaced with
> check for secure discard and discard_zeroes_data
>
> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>
> v2->v3
> Removed code for discard. ?A seperate patch will
> be done to add that code in the block layer
>
> :100644 100644 1921392... 38a4d75... M ?fs/ext4/ext4.h
> :100644 100644 f815cc8... cf178f3... M ?fs/ext4/extents.c
> :100644 100644 62f86e7... 8fdae7d... M ?fs/ext4/inode.c
> :100644 100644 6ed859d... 94872b2... M ?fs/ext4/mballoc.c
> :100644 100644 c757adc... 1ff7532... M ?fs/ext4/xattr.c
> ?fs/ext4/ext4.h ? ?| ? ?1 +
> ?fs/ext4/extents.c | ? ?3 +++
> ?fs/ext4/inode.c ? | ? ?3 +++
> ?fs/ext4/mballoc.c | ? ?8 ++++++++
> ?fs/ext4/xattr.c ? | ? 12 ++++++++----
> ?5 files changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1921392..38a4d75 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
> ?#define EXT4_FREE_BLOCKS_METADATA ? ? ?0x0001
> ?#define EXT4_FREE_BLOCKS_FORGET ? ? ? ? ? ? ? ?0x0002
> ?#define EXT4_FREE_BLOCKS_VALIDATED ? ? 0x0004
> +#define EXT4_FREE_BLOCKS_ZERO ? ? ? ? ?0x0008
>
> ?/*
> ?* ioctl commands
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index f815cc8..cf178f3 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>
> ? ? ? ?if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> ? ? ? ? ? ? ? ?flags |= EXT4_FREE_BLOCKS_METADATA;
> +
> + ? ? ? if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
> ?#ifdef EXTENTS_STATS
> ? ? ? ?{
> ? ? ? ? ? ? ? ?struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 62f86e7..8fdae7d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
> ? ? ? ?for (p = first; p < last; p++)
> ? ? ? ? ? ? ? ?*p = 0;
>
> + ? ? ? if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
> +
> ? ? ? ?ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
> ? ? ? ?return 0;
> ?out_err:
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 6ed859d..94872b2 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> ? ? ? ?ext4_debug("freeing block %llu\n", block);
> ? ? ? ?trace_ext4_free_blocks(inode, block, count, flags);
>
> + ? ? ? if (flags & EXT4_FREE_BLOCKS_ZERO) {
> + ? ? ? ? ? ? ? err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
But the delete of these blocks in not yet committed,
so after reboot, you can end up with a non-deleted but zeroed file data.
Is that acceptable? I should think not.
One way around this is a 2-phase unlink/truncate.
Phase 1: add to orphan list and register a callback on commit
Phase 2: issue zeroout and free the blocks
This won't work for punch hole, but then again, for punch hole
it's probably OK to end up with zeroed data, but non-deleted blocks.
Right?
Amir.
> + ? ? ? ? ? ? ? if (err < 0)
> + ? ? ? ? ? ? ? ? ? ? ? goto error_return;
> + ? ? ? ? ? ? ? else
> + ? ? ? ? ? ? ? ? ? ? ? err = 0;
> + ? ? ? }
> +
> ? ? ? ?if (flags & EXT4_FREE_BLOCKS_FORGET) {
> ? ? ? ? ? ? ? ?struct buffer_head *tbh = bh;
> ? ? ? ? ? ? ? ?int i;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index c757adc..1ff7532 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> ? ? ? ? ? ? ? ? ? ? ? ? struct buffer_head *bh)
> ?{
> ? ? ? ?struct mb_cache_entry *ce = NULL;
> - ? ? ? int error = 0;
> + ? ? ? int free_blocks_flags, error = 0;
>
> ? ? ? ?ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
> ? ? ? ?error = ext4_journal_get_write_access(handle, bh);
> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> ? ? ? ? ? ? ? ?if (ce)
> ? ? ? ? ? ? ? ? ? ? ? ?mb_cache_entry_free(ce);
> ? ? ? ? ? ? ? ?get_bh(bh);
> - ? ? ? ? ? ? ? ext4_free_blocks(handle, inode, bh, 0, 1,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT4_FREE_BLOCKS_METADATA |
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT4_FREE_BLOCKS_FORGET);
> + ? ? ? ? ? ? ? free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? EXT4_FREE_BLOCKS_FORGET;
> +
> + ? ? ? ? ? ? ? if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
> + ? ? ? ? ? ? ? ? ? ? ? free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
> +
> + ? ? ? ? ? ? ? ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
> ? ? ? ?} else {
> ? ? ? ? ? ? ? ?le32_add_cpu(&BHDR(bh)->h_refcount, -1);
> ? ? ? ? ? ? ? ?error = ext4_handle_dirty_metadata(handle, inode, bh);
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
>
On 2011-07-02, at 3:33 AM, Amir Goldstein <[email protected]> wrote:
> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
> <[email protected]> wrote:
>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>> to ext4_free_blocks. This flag causes causes blocks to be
>> zerod before they are freed. Files that have the EXT4_SECRM_FL
>> attribute flag on will have their blocks zerod when
>> they are released. The EXT4_SECRM_FL attribute flag can
>> be enabled useing chattr +s
>>
>> Signed-off-by: Allison Henderson <[email protected]>
>> ---
>> v1->v2
>> Removed check for discard mount option and replaced with
>> check for secure discard and discard_zeroes_data
>>
>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>
>> v2->v3
>> Removed code for discard. A seperate patch will
>> be done to add that code in the block layer
>>
>> :100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
>> :100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
>> :100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
>> :100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
>> :100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
>> fs/ext4/ext4.h | 1 +
>> fs/ext4/extents.c | 3 +++
>> fs/ext4/inode.c | 3 +++
>> fs/ext4/mballoc.c | 8 ++++++++
>> fs/ext4/xattr.c | 12 ++++++++----
>> 5 files changed, 23 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1921392..38a4d75 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>> #define EXT4_FREE_BLOCKS_METADATA 0x0001
>> #define EXT4_FREE_BLOCKS_FORGET 0x0002
>> #define EXT4_FREE_BLOCKS_VALIDATED 0x0004
>> +#define EXT4_FREE_BLOCKS_ZERO 0x0008
>>
>> /*
>> * ioctl commands
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index f815cc8..cf178f3 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>>
>> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>> flags |= EXT4_FREE_BLOCKS_METADATA;
>> +
>> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> #ifdef EXTENTS_STATS
>> {
>> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 62f86e7..8fdae7d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>> for (p = first; p < last; p++)
>> *p = 0;
>>
>> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
>> return 0;
>> out_err:
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 6ed859d..94872b2 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>> ext4_debug("freeing block %llu\n", block);
>> trace_ext4_free_blocks(inode, block, count, flags);
>>
>> + if (flags & EXT4_FREE_BLOCKS_ZERO) {
>> + err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
>
> But the delete of these blocks in not yet committed,
> so after reboot, you can end up with a non-deleted but zeroed file data.
> Is that acceptable? I should think not.
>
> One way around this is a 2-phase unlink/truncate.
> Phase 1: add to orphan list and register a callback on commit
> Phase 2: issue zeroout and free the blocks
I didn't look at this very closely, but I thought the place this was being done was in the mballoc commit callback, so that it only zeroes the blocks after they are really freed?
The danger then would be that you want to complete the zeroing of the blocks before they are reallocated.
> This won't work for punch hole, but then again, for punch hole
> it's probably OK to end up with zeroed data, but non-deleted blocks.
> Right?
>
> Amir.
>
>> + if (err < 0)
>> + goto error_return;
>> + else
>> + err = 0;
>> + }
>> +
>> if (flags & EXT4_FREE_BLOCKS_FORGET) {
>> struct buffer_head *tbh = bh;
>> int i;
>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>> index c757adc..1ff7532 100644
>> --- a/fs/ext4/xattr.c
>> +++ b/fs/ext4/xattr.c
>> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> struct buffer_head *bh)
>> {
>> struct mb_cache_entry *ce = NULL;
>> - int error = 0;
>> + int free_blocks_flags, error = 0;
>>
>> ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
>> error = ext4_journal_get_write_access(handle, bh);
>> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> if (ce)
>> mb_cache_entry_free(ce);
>> get_bh(bh);
>> - ext4_free_blocks(handle, inode, bh, 0, 1,
>> - EXT4_FREE_BLOCKS_METADATA |
>> - EXT4_FREE_BLOCKS_FORGET);
>> + free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
>> + EXT4_FREE_BLOCKS_FORGET;
>> +
>> + if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
>> + free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> + ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
>> } else {
>> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
>> error = ext4_handle_dirty_metadata(handle, inode, bh);
>> --
>> 1.7.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jul 3, 2011 at 10:00 AM, Andreas Dilger <[email protected]> wrote:
> On 2011-07-02, at 3:33 AM, Amir Goldstein <[email protected]> wrote:
>
>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>> <[email protected]> wrote:
>>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>>> to ext4_free_blocks. This flag causes causes blocks to be
>>> zerod before they are freed. ?Files that have the EXT4_SECRM_FL
>>> attribute flag on will have their blocks zerod when
>>> they are released. ?The EXT4_SECRM_FL attribute flag can
>>> be enabled useing chattr +s
>>>
>>> Signed-off-by: Allison Henderson <[email protected]>
>>> ---
>>> v1->v2
>>> Removed check for discard mount option and replaced with
>>> check for secure discard and discard_zeroes_data
>>>
>>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>>
>>> v2->v3
>>> Removed code for discard. ?A seperate patch will
>>> be done to add that code in the block layer
>>>
>>> :100644 100644 1921392... 38a4d75... M ?fs/ext4/ext4.h
>>> :100644 100644 f815cc8... cf178f3... M ?fs/ext4/extents.c
>>> :100644 100644 62f86e7... 8fdae7d... M ?fs/ext4/inode.c
>>> :100644 100644 6ed859d... 94872b2... M ?fs/ext4/mballoc.c
>>> :100644 100644 c757adc... 1ff7532... M ?fs/ext4/xattr.c
>>> ?fs/ext4/ext4.h ? ?| ? ?1 +
>>> ?fs/ext4/extents.c | ? ?3 +++
>>> ?fs/ext4/inode.c ? | ? ?3 +++
>>> ?fs/ext4/mballoc.c | ? ?8 ++++++++
>>> ?fs/ext4/xattr.c ? | ? 12 ++++++++----
>>> ?5 files changed, 23 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>> index 1921392..38a4d75 100644
>>> --- a/fs/ext4/ext4.h
>>> +++ b/fs/ext4/ext4.h
>>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>>> ?#define EXT4_FREE_BLOCKS_METADATA 0x0001
>>> ?#define EXT4_FREE_BLOCKS_FORGET ? ? ? ? ? ? ? ?0x0002
>>> ?#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
>>> +#define EXT4_FREE_BLOCKS_ZERO ? ? ? ? ?0x0008
>>>
>>> ?/*
>>> ?* ioctl commands
>>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>>> index f815cc8..cf178f3 100644
>>> --- a/fs/ext4/extents.c
>>> +++ b/fs/ext4/extents.c
>>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>>>
>>> ? ? ? ?if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>>> ? ? ? ? ? ? ? ?flags |= EXT4_FREE_BLOCKS_METADATA;
>>> +
>>> + ? ? ? if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
>>> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
>>> ?#ifdef EXTENTS_STATS
>>> ? ? ? ?{
>>> ? ? ? ? ? ? ? ?struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>> index 62f86e7..8fdae7d 100644
>>> --- a/fs/ext4/inode.c
>>> +++ b/fs/ext4/inode.c
>>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>>> ? ? ? ?for (p = first; p < last; p++)
>>> ? ? ? ? ? ? ? ?*p = 0;
>>>
>>> + ? ? ? if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
>>> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
>>> +
>>> ? ? ? ?ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
>>> ? ? ? ?return 0;
>>> ?out_err:
>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>> index 6ed859d..94872b2 100644
>>> --- a/fs/ext4/mballoc.c
>>> +++ b/fs/ext4/mballoc.c
>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>>> ? ? ? ?ext4_debug("freeing block %llu\n", block);
>>> ? ? ? ?trace_ext4_free_blocks(inode, block, count, flags);
>>>
>>> + ? ? ? if (flags & EXT4_FREE_BLOCKS_ZERO) {
>>> + ? ? ? ? ? ? ? err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
>>
>> But the delete of these blocks in not yet committed,
>> so after reboot, you can end up with a non-deleted but zeroed file data.
>> Is that acceptable? I should think not.
>>
>> One way around this is a 2-phase unlink/truncate.
>> Phase 1: add to orphan list and register a callback on commit
>> Phase 2: issue zeroout and free the blocks
>
> I didn't look at this very closely, but I thought the place this was being done was in the mballoc commit callback, so that it only zeroes the blocks after they are really freed?
Nope, the zeroout is in the beginning of ext4_free_blocks().
I also thought it was a mistake until I realized the purpose of secure
delete was security,
so deleting non-zeroed blocks is not an option.
>
> The danger then would be that you want to complete the zeroing of the blocks before they are reallocated.
>
Actually, I think the secure delete requirement is stronger - that
zeroing is completed *before* blocks are freed.
Otherwise, after blocks are freed, the power may go down and the disks
may be mounted on a different
system where the deleted blocks can be reallocated.
>> This won't work for punch hole, but then again, for punch hole
>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>> Right?
>>
>> Amir.
>>
>>> + ? ? ? ? ? ? ? if (err < 0)
>>> + ? ? ? ? ? ? ? ? ? ? ? goto error_return;
>>> + ? ? ? ? ? ? ? else
>>> + ? ? ? ? ? ? ? ? ? ? ? err = 0;
>>> + ? ? ? }
>>> +
>>> ? ? ? ?if (flags & EXT4_FREE_BLOCKS_FORGET) {
>>> ? ? ? ? ? ? ? ?struct buffer_head *tbh = bh;
>>> ? ? ? ? ? ? ? ?int i;
>>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>>> index c757adc..1ff7532 100644
>>> --- a/fs/ext4/xattr.c
>>> +++ b/fs/ext4/xattr.c
>>> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>>> ? ? ? ? ? ? ? ? ? ? ? ? struct buffer_head *bh)
>>> ?{
>>> ? ? ? ?struct mb_cache_entry *ce = NULL;
>>> - ? ? ? int error = 0;
>>> + ? ? ? int free_blocks_flags, error = 0;
>>>
>>> ? ? ? ?ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
>>> ? ? ? ?error = ext4_journal_get_write_access(handle, bh);
>>> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>>> ? ? ? ? ? ? ? ?if (ce)
>>> ? ? ? ? ? ? ? ? ? ? ? ?mb_cache_entry_free(ce);
>>> ? ? ? ? ? ? ? ?get_bh(bh);
>>> - ? ? ? ? ? ? ? ext4_free_blocks(handle, inode, bh, 0, 1,
>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT4_FREE_BLOCKS_METADATA |
>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT4_FREE_BLOCKS_FORGET);
>>> + ? ? ? ? ? ? ? free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? EXT4_FREE_BLOCKS_FORGET;
>>> +
>>> + ? ? ? ? ? ? ? if (EXT4_I(inode)->i_flags & EXT4_SECRM_FL)
>>> + ? ? ? ? ? ? ? ? ? ? ? free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
>>> +
>>> + ? ? ? ? ? ? ? ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
>>> ? ? ? ?} else {
>>> ? ? ? ? ? ? ? ?le32_add_cpu(&BHDR(bh)->h_refcount, -1);
>>> ? ? ? ? ? ? ? ?error = ext4_handle_dirty_metadata(handle, inode, bh);
>>> --
>>> 1.7.1
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>>> the body of a message to [email protected]
>>> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
>
On 07/03/2011 12:37 AM, Amir Goldstein wrote:
> On Sun, Jul 3, 2011 at 10:00 AM, Andreas Dilger<[email protected]> wrote:
>> On 2011-07-02, at 3:33 AM, Amir Goldstein<[email protected]> wrote:
>>
>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>> <[email protected]> wrote:
>>>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>>>> to ext4_free_blocks. This flag causes causes blocks to be
>>>> zerod before they are freed. Files that have the EXT4_SECRM_FL
>>>> attribute flag on will have their blocks zerod when
>>>> they are released. The EXT4_SECRM_FL attribute flag can
>>>> be enabled useing chattr +s
>>>>
>>>> Signed-off-by: Allison Henderson<[email protected]>
>>>> ---
>>>> v1->v2
>>>> Removed check for discard mount option and replaced with
>>>> check for secure discard and discard_zeroes_data
>>>>
>>>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>>>
>>>> v2->v3
>>>> Removed code for discard. A seperate patch will
>>>> be done to add that code in the block layer
>>>>
>>>> :100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
>>>> :100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
>>>> :100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
>>>> :100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
>>>> :100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
>>>> fs/ext4/ext4.h | 1 +
>>>> fs/ext4/extents.c | 3 +++
>>>> fs/ext4/inode.c | 3 +++
>>>> fs/ext4/mballoc.c | 8 ++++++++
>>>> fs/ext4/xattr.c | 12 ++++++++----
>>>> 5 files changed, 23 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>> index 1921392..38a4d75 100644
>>>> --- a/fs/ext4/ext4.h
>>>> +++ b/fs/ext4/ext4.h
>>>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>>>> #define EXT4_FREE_BLOCKS_METADATA 0x0001
>>>> #define EXT4_FREE_BLOCKS_FORGET 0x0002
>>>> #define EXT4_FREE_BLOCKS_VALIDATED 0x0004
>>>> +#define EXT4_FREE_BLOCKS_ZERO 0x0008
>>>>
>>>> /*
>>>> * ioctl commands
>>>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>>>> index f815cc8..cf178f3 100644
>>>> --- a/fs/ext4/extents.c
>>>> +++ b/fs/ext4/extents.c
>>>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>>>>
>>>> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>>>> flags |= EXT4_FREE_BLOCKS_METADATA;
>>>> +
>>>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>>>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>>>> #ifdef EXTENTS_STATS
>>>> {
>>>> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>> index 62f86e7..8fdae7d 100644
>>>> --- a/fs/ext4/inode.c
>>>> +++ b/fs/ext4/inode.c
>>>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>>>> for (p = first; p< last; p++)
>>>> *p = 0;
>>>>
>>>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>>>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>>>> +
>>>> ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
>>>> return 0;
>>>> out_err:
>>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>>> index 6ed859d..94872b2 100644
>>>> --- a/fs/ext4/mballoc.c
>>>> +++ b/fs/ext4/mballoc.c
>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>>>> ext4_debug("freeing block %llu\n", block);
>>>> trace_ext4_free_blocks(inode, block, count, flags);
>>>>
>>>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>>>> + err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
>>>
>>> But the delete of these blocks in not yet committed,
>>> so after reboot, you can end up with a non-deleted but zeroed file data.
>>> Is that acceptable? I should think not.
>>>
>>> One way around this is a 2-phase unlink/truncate.
>>> Phase 1: add to orphan list and register a callback on commit
>>> Phase 2: issue zeroout and free the blocks
>>
>> I didn't look at this very closely, but I thought the place this was being done was in the mballoc commit callback, so that it only zeroes the blocks after they are really freed?
>
> Nope, the zeroout is in the beginning of ext4_free_blocks().
> I also thought it was a mistake until I realized the purpose of secure
> delete was security,
> so deleting non-zeroed blocks is not an option.
>
>>
>> The danger then would be that you want to complete the zeroing of the blocks before they are reallocated.
>>
>
> Actually, I think the secure delete requirement is stronger - that
> zeroing is completed *before* blocks are freed.
> Otherwise, after blocks are freed, the power may go down and the disks
> may be mounted on a different
> system where the deleted blocks can be reallocated.
Thx all for the reviews! It sounds like the zero out code is in the
right spot then. We are thinking about adding an optimization too,
where we use use secure discard instead of the sb_issue_zeroout, but
only if the device supports it. I was thinking about putting that code
some where in the commit call back because that is where the existing
discard code is, but maybe that's not such a good place to put it then?
What does everyone think? Thx!
Allison
>
>>> This won't work for punch hole, but then again, for punch hole
>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>>> Right?
>>>
>>> Amir.
>>>
>>>> + if (err< 0)
>>>> + goto error_return;
>>>> + else
>>>> + err = 0;
>>>> + }
>>>> +
>>>> if (flags& EXT4_FREE_BLOCKS_FORGET) {
>>>> struct buffer_head *tbh = bh;
>>>> int i;
>>>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>>>> index c757adc..1ff7532 100644
>>>> --- a/fs/ext4/xattr.c
>>>> +++ b/fs/ext4/xattr.c
>>>> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>>>> struct buffer_head *bh)
>>>> {
>>>> struct mb_cache_entry *ce = NULL;
>>>> - int error = 0;
>>>> + int free_blocks_flags, error = 0;
>>>>
>>>> ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
>>>> error = ext4_journal_get_write_access(handle, bh);
>>>> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>>>> if (ce)
>>>> mb_cache_entry_free(ce);
>>>> get_bh(bh);
>>>> - ext4_free_blocks(handle, inode, bh, 0, 1,
>>>> - EXT4_FREE_BLOCKS_METADATA |
>>>> - EXT4_FREE_BLOCKS_FORGET);
>>>> + free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
>>>> + EXT4_FREE_BLOCKS_FORGET;
>>>> +
>>>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>>>> + free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
>>>> +
>>>> + ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
>>>> } else {
>>>> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
>>>> error = ext4_handle_dirty_metadata(handle, inode, bh);
>>>> --
>>>> 1.7.1
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>>>> the body of a message to [email protected]
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>>> the body of a message to [email protected]
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
On Mon, Jul 4, 2011 at 8:19 PM, Allison Henderson
<[email protected]> wrote:
> On 07/03/2011 12:37 AM, Amir Goldstein wrote:
>>
>> On Sun, Jul 3, 2011 at 10:00 AM, Andreas Dilger<[email protected]> ?wrote:
>>>
>>> On 2011-07-02, at 3:33 AM, Amir Goldstein<[email protected]> ?wrote:
>>>
>>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>>> <[email protected]> ?wrote:
>>>>>
>>>>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>>>>> to ext4_free_blocks. This flag causes causes blocks to be
>>>>> zerod before they are freed. ?Files that have the EXT4_SECRM_FL
>>>>> attribute flag on will have their blocks zerod when
>>>>> they are released. ?The EXT4_SECRM_FL attribute flag can
>>>>> be enabled useing chattr +s
>>>>>
>>>>> Signed-off-by: Allison Henderson<[email protected]>
>>>>> ---
>>>>> v1->v2
>>>>> Removed check for discard mount option and replaced with
>>>>> check for secure discard and discard_zeroes_data
>>>>>
>>>>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>>>>
>>>>> v2->v3
>>>>> Removed code for discard. ?A seperate patch will
>>>>> be done to add that code in the block layer
>>>>>
>>>>> :100644 100644 1921392... 38a4d75... M ?fs/ext4/ext4.h
>>>>> :100644 100644 f815cc8... cf178f3... M ?fs/ext4/extents.c
>>>>> :100644 100644 62f86e7... 8fdae7d... M ?fs/ext4/inode.c
>>>>> :100644 100644 6ed859d... 94872b2... M ?fs/ext4/mballoc.c
>>>>> :100644 100644 c757adc... 1ff7532... M ?fs/ext4/xattr.c
>>>>> ?fs/ext4/ext4.h ? ?| ? ?1 +
>>>>> ?fs/ext4/extents.c | ? ?3 +++
>>>>> ?fs/ext4/inode.c ? | ? ?3 +++
>>>>> ?fs/ext4/mballoc.c | ? ?8 ++++++++
>>>>> ?fs/ext4/xattr.c ? | ? 12 ++++++++----
>>>>> ?5 files changed, 23 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>>> index 1921392..38a4d75 100644
>>>>> --- a/fs/ext4/ext4.h
>>>>> +++ b/fs/ext4/ext4.h
>>>>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>>>>> ?#define EXT4_FREE_BLOCKS_METADATA 0x0001
>>>>> ?#define EXT4_FREE_BLOCKS_FORGET ? ? ? ? ? ? ? ?0x0002
>>>>> ?#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
>>>>> +#define EXT4_FREE_BLOCKS_ZERO ? ? ? ? ?0x0008
>>>>>
>>>>> ?/*
>>>>> ?* ioctl commands
>>>>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>>>>> index f815cc8..cf178f3 100644
>>>>> --- a/fs/ext4/extents.c
>>>>> +++ b/fs/ext4/extents.c
>>>>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle,
>>>>> struct inode *inode,
>>>>>
>>>>> ? ? ? ?if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>>>>> ? ? ? ? ? ? ? ?flags |= EXT4_FREE_BLOCKS_METADATA;
>>>>> +
>>>>> + ? ? ? if (EXT4_I(inode)->i_flags& ?EXT4_SECRM_FL)
>>>>> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
>>>>> ?#ifdef EXTENTS_STATS
>>>>> ? ? ? ?{
>>>>> ? ? ? ? ? ? ? ?struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>>> index 62f86e7..8fdae7d 100644
>>>>> --- a/fs/ext4/inode.c
>>>>> +++ b/fs/ext4/inode.c
>>>>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle,
>>>>> struct inode *inode,
>>>>> ? ? ? ?for (p = first; p< ?last; p++)
>>>>> ? ? ? ? ? ? ? ?*p = 0;
>>>>>
>>>>> + ? ? ? if (EXT4_I(inode)->i_flags& ?EXT4_SECRM_FL)
>>>>> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
>>>>> +
>>>>> ? ? ? ?ext4_free_blocks(handle, inode, NULL, block_to_free, count,
>>>>> flags);
>>>>> ? ? ? ?return 0;
>>>>> ?out_err:
>>>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>>>> index 6ed859d..94872b2 100644
>>>>> --- a/fs/ext4/mballoc.c
>>>>> +++ b/fs/ext4/mballoc.c
>>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct
>>>>> inode *inode,
>>>>> ? ? ? ?ext4_debug("freeing block %llu\n", block);
>>>>> ? ? ? ?trace_ext4_free_blocks(inode, block, count, flags);
>>>>>
>>>>> + ? ? ? if (flags& ?EXT4_FREE_BLOCKS_ZERO) {
>>>>> + ? ? ? ? ? ? ? err = sb_issue_zeroout(inode->i_sb, block, count,
>>>>> GFP_NOFS);
>>>>
>>>> But the delete of these blocks in not yet committed,
>>>> so after reboot, you can end up with a non-deleted but zeroed file data.
>>>> Is that acceptable? I should think not.
>>>>
>>>> One way around this is a 2-phase unlink/truncate.
>>>> Phase 1: add to orphan list and register a callback on commit
>>>> Phase 2: issue zeroout and free the blocks
>>>
>>> I didn't look at this very closely, but I thought the place this was
>>> being done was in the mballoc commit callback, so that it only zeroes the
>>> blocks after they are really freed?
>>
>> Nope, the zeroout is in the beginning of ext4_free_blocks().
>> I also thought it was a mistake until I realized the purpose of secure
>> delete was security,
>> so deleting non-zeroed blocks is not an option.
>>
>>>
>>> The danger then would be that you want to complete the zeroing of the
>>> blocks before they are reallocated.
>>>
>>
>> Actually, I think the secure delete requirement is stronger - that
>> zeroing is completed *before* blocks are freed.
>> Otherwise, after blocks are freed, the power may go down and the disks
>> may be mounted on a different
>> system where the deleted blocks can be reallocated.
>
> Thx all for the reviews! ?It sounds like the zero out code is in the right
> spot then. ?We are thinking about adding an optimization too, where we use
> use secure discard instead of the sb_issue_zeroout, but only if the device
> supports it. ?I was thinking about putting that code some where in the
> commit call back because that is where the existing discard code is, but
> maybe that's not such a good place to put it then? ?What does everyone
> think? ?Thx!
>
> Allison
>
I already stated my opinion about the need for 2-phase secure delete.
If you have to choose between security (zeroout pre commit) and the
atomicity of the unlink() command (zeroout post commit), then it's
a question of policy.
Is there any other FS (or OS) that implements secure delete?
Perhaps we could follow its semantics.
Cheers,
Amir.
On 2011-07-04, at 11:44 AM, Amir Goldstein wrote:
> On Mon, Jul 4, 2011 at 8:19 PM, Allison Henderson
> <[email protected]> wrote:
>>
>> Thx all for the reviews! It sounds like the zero out code is in the right
>> spot then. We are thinking about adding an optimization too, where we use
>> use secure discard instead of the sb_issue_zeroout, but only if the device
>> supports it. I was thinking about putting that code some where in the
>> commit call back because that is where the existing discard code is, but
>> maybe that's not such a good place to put it then? What does everyone
>> think? Thx!
>
> I already stated my opinion about the need for 2-phase secure delete.
> If you have to choose between security (zeroout pre commit) and the
> atomicity of the unlink() command (zeroout post commit), then it's
> a question of policy.
> Is there any other FS (or OS) that implements secure delete?
> Perhaps we could follow its semantics.
One thing we did ages ago, before extent-mapped files made unlink so
fast, was to move the blocks from unlinked files and truncated-to-zero
files to a delete queue in the main transaction, and then do the unlink
via a separate thread.
This facility could be resurrected (a version of the patch was posted to
linux-ext4 at http://www.spinics.net/lists/linux-ext4/msg06178.html) to
do the block zeroing/discard in the context of the unlink thread. It
could be structured so that sync/fsync on the file waits for background
zeroing to complete, so that apps doing secure delete + fsync will be
sure that the file is safely erased. The fsync would be needed for this
in any case, otherwise even an inline async zero-fill could fail if the
system crashes before the blocks are actually flushed to disk.
Cheers, Andreas
On 07/04/2011 11:19 AM, Andreas Dilger wrote:
> On 2011-07-04, at 11:44 AM, Amir Goldstein wrote:
>> On Mon, Jul 4, 2011 at 8:19 PM, Allison Henderson
>> <[email protected]> wrote:
>>>
>>> Thx all for the reviews! It sounds like the zero out code is in the right
>>> spot then. We are thinking about adding an optimization too, where we use
>>> use secure discard instead of the sb_issue_zeroout, but only if the device
>>> supports it. I was thinking about putting that code some where in the
>>> commit call back because that is where the existing discard code is, but
>>> maybe that's not such a good place to put it then? What does everyone
>>> think? Thx!
>>
>> I already stated my opinion about the need for 2-phase secure delete.
>> If you have to choose between security (zeroout pre commit) and the
>> atomicity of the unlink() command (zeroout post commit), then it's
>> a question of policy.
>> Is there any other FS (or OS) that implements secure delete?
>> Perhaps we could follow its semantics.
>
> One thing we did ages ago, before extent-mapped files made unlink so
> fast, was to move the blocks from unlinked files and truncated-to-zero
> files to a delete queue in the main transaction, and then do the unlink
> via a separate thread.
>
> This facility could be resurrected (a version of the patch was posted to
> linux-ext4 at http://www.spinics.net/lists/linux-ext4/msg06178.html) to
> do the block zeroing/discard in the context of the unlink thread. It
> could be structured so that sync/fsync on the file waits for background
> zeroing to complete, so that apps doing secure delete + fsync will be
> sure that the file is safely erased. The fsync would be needed for this
> in any case, otherwise even an inline async zero-fill could fail if the
> system crashes before the blocks are actually flushed to disk.
>
> Cheers, Andreas
>
>
Oh alrighty, I will look into that then. Thx all for your input! :)
Allison
On 07/02/2011 02:33 AM, Amir Goldstein wrote:
> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
> <[email protected]> wrote:
>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>> to ext4_free_blocks. This flag causes causes blocks to be
>> zerod before they are freed. Files that have the EXT4_SECRM_FL
>> attribute flag on will have their blocks zerod when
>> they are released. The EXT4_SECRM_FL attribute flag can
>> be enabled useing chattr +s
>>
>> Signed-off-by: Allison Henderson<[email protected]>
>> ---
>> v1->v2
>> Removed check for discard mount option and replaced with
>> check for secure discard and discard_zeroes_data
>>
>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>
>> v2->v3
>> Removed code for discard. A seperate patch will
>> be done to add that code in the block layer
>>
>> :100644 100644 1921392... 38a4d75... M fs/ext4/ext4.h
>> :100644 100644 f815cc8... cf178f3... M fs/ext4/extents.c
>> :100644 100644 62f86e7... 8fdae7d... M fs/ext4/inode.c
>> :100644 100644 6ed859d... 94872b2... M fs/ext4/mballoc.c
>> :100644 100644 c757adc... 1ff7532... M fs/ext4/xattr.c
>> fs/ext4/ext4.h | 1 +
>> fs/ext4/extents.c | 3 +++
>> fs/ext4/inode.c | 3 +++
>> fs/ext4/mballoc.c | 8 ++++++++
>> fs/ext4/xattr.c | 12 ++++++++----
>> 5 files changed, 23 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1921392..38a4d75 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>> #define EXT4_FREE_BLOCKS_METADATA 0x0001
>> #define EXT4_FREE_BLOCKS_FORGET 0x0002
>> #define EXT4_FREE_BLOCKS_VALIDATED 0x0004
>> +#define EXT4_FREE_BLOCKS_ZERO 0x0008
>>
>> /*
>> * ioctl commands
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index f815cc8..cf178f3 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
>>
>> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>> flags |= EXT4_FREE_BLOCKS_METADATA;
>> +
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> #ifdef EXTENTS_STATS
>> {
>> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 62f86e7..8fdae7d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>> for (p = first; p< last; p++)
>> *p = 0;
>>
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
>> return 0;
>> out_err:
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 6ed859d..94872b2 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>> ext4_debug("freeing block %llu\n", block);
>> trace_ext4_free_blocks(inode, block, count, flags);
>>
>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>> + err = sb_issue_zeroout(inode->i_sb, block, count, GFP_NOFS);
>
> But the delete of these blocks in not yet committed,
> so after reboot, you can end up with a non-deleted but zeroed file data.
> Is that acceptable? I should think not.
>
> One way around this is a 2-phase unlink/truncate.
> Phase 1: add to orphan list and register a callback on commit
> Phase 2: issue zeroout and free the blocks
>
> This won't work for punch hole, but then again, for punch hole
> it's probably OK to end up with zeroed data, but non-deleted blocks.
> Right?
>
> Amir.
Hi, I had a quick question about the orphan list. I notice that
ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
ext4_orphan_add that happens really early before any calls to free
blocks. Does this address your earlier concerns, or is there another
reason I missed? Thx!
Allison Henderson
>
>> + if (err< 0)
>> + goto error_return;
>> + else
>> + err = 0;
>> + }
>> +
>> if (flags& EXT4_FREE_BLOCKS_FORGET) {
>> struct buffer_head *tbh = bh;
>> int i;
>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>> index c757adc..1ff7532 100644
>> --- a/fs/ext4/xattr.c
>> +++ b/fs/ext4/xattr.c
>> @@ -471,7 +471,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> struct buffer_head *bh)
>> {
>> struct mb_cache_entry *ce = NULL;
>> - int error = 0;
>> + int free_blocks_flags, error = 0;
>>
>> ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
>> error = ext4_journal_get_write_access(handle, bh);
>> @@ -484,9 +484,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> if (ce)
>> mb_cache_entry_free(ce);
>> get_bh(bh);
>> - ext4_free_blocks(handle, inode, bh, 0, 1,
>> - EXT4_FREE_BLOCKS_METADATA |
>> - EXT4_FREE_BLOCKS_FORGET);
>> + free_blocks_flags = EXT4_FREE_BLOCKS_METADATA |
>> + EXT4_FREE_BLOCKS_FORGET;
>> +
>> + if (EXT4_I(inode)->i_flags& EXT4_SECRM_FL)
>> + free_blocks_flags |= EXT4_FREE_BLOCKS_ZERO;
>> +
>> + ext4_free_blocks(handle, inode, bh, 0, 1, free_blocks_flags);
>> } else {
>> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
>> error = ext4_handle_dirty_metadata(handle, inode, bh);
>> --
>> 1.7.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
<[email protected]> wrote:
> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
>>
>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>> <[email protected]> ?wrote:
>>>
>>> The first patch adds a new flag, EXT4_FREE_BLOCKS_ZERO,
>>> to ext4_free_blocks. This flag causes causes blocks to be
>>> zerod before they are freed. ?Files that have the EXT4_SECRM_FL
>>> attribute flag on will have their blocks zerod when
>>> they are released. ?The EXT4_SECRM_FL attribute flag can
>>> be enabled useing chattr +s
>>>
>>> Signed-off-by: Allison Henderson<[email protected]>
>>> ---
>>> v1->v2
>>> Removed check for discard mount option and replaced with
>>> check for secure discard and discard_zeroes_data
>>>
>>> Added BLKDEV_DISCARD_SECURE to the sb_issue_discard call
>>>
>>> v2->v3
>>> Removed code for discard. ?A seperate patch will
>>> be done to add that code in the block layer
>>>
>>> :100644 100644 1921392... 38a4d75... M ?fs/ext4/ext4.h
>>> :100644 100644 f815cc8... cf178f3... M ?fs/ext4/extents.c
>>> :100644 100644 62f86e7... 8fdae7d... M ?fs/ext4/inode.c
>>> :100644 100644 6ed859d... 94872b2... M ?fs/ext4/mballoc.c
>>> :100644 100644 c757adc... 1ff7532... M ?fs/ext4/xattr.c
>>> ?fs/ext4/ext4.h ? ?| ? ?1 +
>>> ?fs/ext4/extents.c | ? ?3 +++
>>> ?fs/ext4/inode.c ? | ? ?3 +++
>>> ?fs/ext4/mballoc.c | ? ?8 ++++++++
>>> ?fs/ext4/xattr.c ? | ? 12 ++++++++----
>>> ?5 files changed, 23 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>> index 1921392..38a4d75 100644
>>> --- a/fs/ext4/ext4.h
>>> +++ b/fs/ext4/ext4.h
>>> @@ -526,6 +526,7 @@ struct ext4_new_group_data {
>>> ?#define EXT4_FREE_BLOCKS_METADATA ? ? ?0x0001
>>> ?#define EXT4_FREE_BLOCKS_FORGET ? ? ? ? ? ? ? ?0x0002
>>> ?#define EXT4_FREE_BLOCKS_VALIDATED ? ? 0x0004
>>> +#define EXT4_FREE_BLOCKS_ZERO ? ? ? ? ?0x0008
>>>
>>> ?/*
>>> ?* ioctl commands
>>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>>> index f815cc8..cf178f3 100644
>>> --- a/fs/ext4/extents.c
>>> +++ b/fs/ext4/extents.c
>>> @@ -2231,6 +2231,9 @@ static int ext4_remove_blocks(handle_t *handle,
>>> struct inode *inode,
>>>
>>> ? ? ? ?if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>>> ? ? ? ? ? ? ? ?flags |= EXT4_FREE_BLOCKS_METADATA;
>>> +
>>> + ? ? ? if (EXT4_I(inode)->i_flags& ?EXT4_SECRM_FL)
>>> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
>>> ?#ifdef EXTENTS_STATS
>>> ? ? ? ?{
>>> ? ? ? ? ? ? ? ?struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>> index 62f86e7..8fdae7d 100644
>>> --- a/fs/ext4/inode.c
>>> +++ b/fs/ext4/inode.c
>>> @@ -4182,6 +4182,9 @@ static int ext4_clear_blocks(handle_t *handle,
>>> struct inode *inode,
>>> ? ? ? ?for (p = first; p< ?last; p++)
>>> ? ? ? ? ? ? ? ?*p = 0;
>>>
>>> + ? ? ? if (EXT4_I(inode)->i_flags& ?EXT4_SECRM_FL)
>>> + ? ? ? ? ? ? ? flags |= EXT4_FREE_BLOCKS_ZERO;
>>> +
>>> ? ? ? ?ext4_free_blocks(handle, inode, NULL, block_to_free, count,
>>> flags);
>>> ? ? ? ?return 0;
>>> ?out_err:
>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>> index 6ed859d..94872b2 100644
>>> --- a/fs/ext4/mballoc.c
>>> +++ b/fs/ext4/mballoc.c
>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct
>>> inode *inode,
>>> ? ? ? ?ext4_debug("freeing block %llu\n", block);
>>> ? ? ? ?trace_ext4_free_blocks(inode, block, count, flags);
>>>
>>> + ? ? ? if (flags& ?EXT4_FREE_BLOCKS_ZERO) {
>>> + ? ? ? ? ? ? ? err = sb_issue_zeroout(inode->i_sb, block, count,
>>> GFP_NOFS);
>>
>> But the delete of these blocks in not yet committed,
>> so after reboot, you can end up with a non-deleted but zeroed file data.
>> Is that acceptable? I should think not.
>>
>> One way around this is a 2-phase unlink/truncate.
>> Phase 1: add to orphan list and register a callback on commit
>> Phase 2: issue zeroout and free the blocks
>>
>> This won't work for punch hole, but then again, for punch hole
>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>> Right?
>>
>> Amir.
>
> Hi, I had a quick question about the orphan list. ?I notice that
> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
> ext4_orphan_add that happens really early before any calls to free blocks.
> ?Does this address your earlier concerns, or is there another reason I
> missed? ?Thx!
>
It doesn't address the concerns of getting a non-deleted file with zeroed data
after crash, because the existence of the inode on the orphan list after crash
depends on the transaction that added it to the list being committed.
And your patch zeroes the blocks before that transaction is committed.
However, the orphan list gives you a very good framework to implement
deferred delete (by a kernel thread) as Andreas suggested.
Unlink should be simple, because freeing unlinked inode blocks it is anyway
deferred till the inode refcount drops to zero.
Truncate is more tricky, because of the truncate shrink/extend requirement
(that all data is zeroes after extending the inode's size via truncate
system call),
so a shrinking-deferred truncate would have to mark all the
to-be-deleted extents
uninitialized.
Amir.
On 2011-07-07, at 1:05 AM, Amir Goldstein wrote:
> On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
> <[email protected]> wrote:
>> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>> <[email protected]> wrote:
>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct
>>>> inode *inode,
>>>> ext4_debug("freeing block %llu\n", block);
>>>> trace_ext4_free_blocks(inode, block, count, flags);
>>>>
>>>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>>>> + err = sb_issue_zeroout(inode->i_sb, block, count,
>>>> GFP_NOFS);
>>>
>>> But the delete of these blocks in not yet committed,
>>> so after reboot, you can end up with a non-deleted but zeroed file data.
>>> Is that acceptable? I should think not.
>>>
>>> One way around this is a 2-phase unlink/truncate.
>>> Phase 1: add to orphan list and register a callback on commit
>>> Phase 2: issue zeroout and free the blocks
>>>
>>> This won't work for punch hole, but then again, for punch hole
>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>>> Right?
>>
>> Hi, I had a quick question about the orphan list. I notice that
>> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
>> ext4_orphan_add that happens really early before any calls to free blocks.
>> Does this address your earlier concerns, or is there another reason I
>> missed? Thx!
>
> It doesn't address the concerns of getting a non-deleted file with zeroed data
> after crash, because the existence of the inode on the orphan list after crash
> depends on the transaction that added it to the list being committed.
> And your patch zeroes the blocks before that transaction is committed.
>
> However, the orphan list gives you a very good framework to implement
> deferred delete (by a kernel thread) as Andreas suggested.
> Unlink should be simple, because freeing unlinked inode blocks it is anyway
> deferred till the inode refcount drops to zero.
Right. The patch that I referenced moved all of the blocks from unlink
and truncate-to-zero from the current inode to a new temporary inode on the
orphan list (simply copying the i_blocks field + i_block and i_size, IIRC,
and zeroing them on the original inode).
> Truncate is more tricky, because of the truncate shrink/extend requirement
> (that all data is zeroes after extending the inode's size via truncate
> system call), so a shrinking-deferred truncate would have to mark all the
> to-be-deleted extents uninitialized.
It would be possible to do this for partial truncate/punch as well, to
move whole blocks over to a new inode on the orphan list and zeroing only
the 1 or 2 partial blocks inline.
It should even be possible to leverage the "block migrate" facility used
by defrag, so that we don't duplicate this code. That would mean just
allocating a temp "unlink" inode in the kernel and putting it on the orphan
list (like an open-unlinked file), migrate the selected range of blocks,
and then zeroing the blocks in the background before unlinking the inode.
I don't think that just marking the deleted extents as uninitialized is
enough, since it would still leave "private" data on disk that could be
read afterward. This would also only work for extent-mapped filesystems.
There may need to be some work to enable the migrate code on block-mapped
files, if you want to allow secure-delete on those files, but that is good
IMHO since it also means that we could defrag block-mapped files.
Cheers, Andreas
On 07/07/2011 12:52 PM, Andreas Dilger wrote:
> On 2011-07-07, at 1:05 AM, Amir Goldstein wrote:
>> On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
>> <[email protected]> wrote:
>>> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
>>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>>> <[email protected]> wrote:
>>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct
>>>>> inode *inode,
>>>>> ext4_debug("freeing block %llu\n", block);
>>>>> trace_ext4_free_blocks(inode, block, count, flags);
>>>>>
>>>>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>>>>> + err = sb_issue_zeroout(inode->i_sb, block, count,
>>>>> GFP_NOFS);
>>>>
>>>> But the delete of these blocks in not yet committed,
>>>> so after reboot, you can end up with a non-deleted but zeroed file data.
>>>> Is that acceptable? I should think not.
>>>>
>>>> One way around this is a 2-phase unlink/truncate.
>>>> Phase 1: add to orphan list and register a callback on commit
>>>> Phase 2: issue zeroout and free the blocks
>>>>
>>>> This won't work for punch hole, but then again, for punch hole
>>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>>>> Right?
>>>
>>> Hi, I had a quick question about the orphan list. I notice that
>>> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
>>> ext4_orphan_add that happens really early before any calls to free blocks.
>>> Does this address your earlier concerns, or is there another reason I
>>> missed? Thx!
>>
>> It doesn't address the concerns of getting a non-deleted file with zeroed data
>> after crash, because the existence of the inode on the orphan list after crash
>> depends on the transaction that added it to the list being committed.
>> And your patch zeroes the blocks before that transaction is committed.
>>
>> However, the orphan list gives you a very good framework to implement
>> deferred delete (by a kernel thread) as Andreas suggested.
>> Unlink should be simple, because freeing unlinked inode blocks it is anyway
>> deferred till the inode refcount drops to zero.
>
> Right. The patch that I referenced moved all of the blocks from unlink
> and truncate-to-zero from the current inode to a new temporary inode on the
> orphan list (simply copying the i_blocks field + i_block and i_size, IIRC,
> and zeroing them on the original inode).
>
>> Truncate is more tricky, because of the truncate shrink/extend requirement
>> (that all data is zeroes after extending the inode's size via truncate
>> system call), so a shrinking-deferred truncate would have to mark all the
>> to-be-deleted extents uninitialized.
>
> It would be possible to do this for partial truncate/punch as well, to
> move whole blocks over to a new inode on the orphan list and zeroing only
> the 1 or 2 partial blocks inline.
>
> It should even be possible to leverage the "block migrate" facility used
> by defrag, so that we don't duplicate this code. That would mean just
> allocating a temp "unlink" inode in the kernel and putting it on the orphan
> list (like an open-unlinked file), migrate the selected range of blocks,
> and then zeroing the blocks in the background before unlinking the inode.
>
> I don't think that just marking the deleted extents as uninitialized is
> enough, since it would still leave "private" data on disk that could be
> read afterward. This would also only work for extent-mapped filesystems.
>
> There may need to be some work to enable the migrate code on block-mapped
> files, if you want to allow secure-delete on those files, but that is good
> IMHO since it also means that we could defrag block-mapped files.
>
> Cheers, Andreas
>
Ah, ok then. Yes, part of the requirements was to make secure delete
work for partial truncates, punch hole, and also indexed files. So that
will save me some time if I can get the migrate routines work. Thx for
the pointers all!
Allison Henderson
On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
<[email protected]> wrote:
> On 07/07/2011 12:52 PM, Andreas Dilger wrote:
>>
>> On 2011-07-07, at 1:05 AM, Amir Goldstein wrote:
>>>
>>> On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
>>> <[email protected]> ?wrote:
>>>>
>>>> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
>>>>>
>>>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>>>> <[email protected]> ? wrote:
>>>>>>
>>>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct
>>>>>> inode *inode,
>>>>>> ? ? ? ?ext4_debug("freeing block %llu\n", block);
>>>>>> ? ? ? ?trace_ext4_free_blocks(inode, block, count, flags);
>>>>>>
>>>>>> + ? ? ? if (flags& ? EXT4_FREE_BLOCKS_ZERO) {
>>>>>> + ? ? ? ? ? ? ? err = sb_issue_zeroout(inode->i_sb, block, count,
>>>>>> GFP_NOFS);
>>>>>
>>>>> But the delete of these blocks in not yet committed,
>>>>> so after reboot, you can end up with a non-deleted but zeroed file
>>>>> data.
>>>>> Is that acceptable? I should think not.
>>>>>
>>>>> One way around this is a 2-phase unlink/truncate.
>>>>> Phase 1: add to orphan list and register a callback on commit
>>>>> Phase 2: issue zeroout and free the blocks
>>>>>
>>>>> This won't work for punch hole, but then again, for punch hole
>>>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>>>>> Right?
>>>>
>>>> Hi, I had a quick question about the orphan list. ?I notice that
>>>> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
>>>> ext4_orphan_add that happens really early before any calls to free
>>>> blocks.
>>>> ?Does this address your earlier concerns, or is there another reason I
>>>> missed? ?Thx!
>>>
>>> It doesn't address the concerns of getting a non-deleted file with zeroed
>>> data
>>> after crash, because the existence of the inode on the orphan list after
>>> crash
>>> depends on the transaction that added it to the list being committed.
>>> And your patch zeroes the blocks before that transaction is committed.
>>>
>>> However, the orphan list gives you a very good framework to implement
>>> deferred delete (by a kernel thread) as Andreas suggested.
>>> Unlink should be simple, because freeing unlinked inode blocks it is
>>> anyway
>>> deferred till the inode refcount drops to zero.
>>
>> Right. ?The patch that I referenced moved all of the blocks from unlink
>> and truncate-to-zero from the current inode to a new temporary inode on
>> the
>> orphan list (simply copying the i_blocks field + i_block and i_size, IIRC,
>> and zeroing them on the original inode).
>>
>>> Truncate is more tricky, because of the truncate shrink/extend
>>> requirement
>>> (that all data is zeroes after extending the inode's size via truncate
>>> system call), so a shrinking-deferred truncate would have to mark all the
>>> to-be-deleted extents uninitialized.
>>
>> It would be possible to do this for partial truncate/punch as well, to
>> move whole blocks over to a new inode on the orphan list and zeroing only
>> the 1 or 2 partial blocks inline.
>>
>> It should even be possible to leverage the "block migrate" facility used
>> by defrag, so that we don't duplicate this code. ?That would mean just
>> allocating a temp "unlink" inode in the kernel and putting it on the
>> orphan
>> list (like an open-unlinked file), migrate the selected range of blocks,
>> and then zeroing the blocks in the background before unlinking the inode.
>>
>> I don't think that just marking the deleted extents as uninitialized is
>> enough, since it would still leave "private" data on disk that could be
>> read afterward. ?This would also only work for extent-mapped filesystems.
>>
>> There may need to be some work to enable the migrate code on block-mapped
>> files, if you want to allow secure-delete on those files, but that is good
>> IMHO since it also means that we could defrag block-mapped files.
>>
>> Cheers, Andreas
>>
>
> Ah, ok then. ?Yes, part of the requirements was to make secure delete work
> for partial truncates, punch hole, and also indexed files. ?So that will
> save me some time if I can get the migrate routines work. ?Thx for the
> pointers all!
>
I realized that there is a basic flaw in the concept of deferred-secure-delete.
From a security point of view, after a crash during a secure-delete,
if the file is not there, all its data should have been wiped.
Orphan cleanup on the next mount may be done on a system that
doesn't respect secure delete.
So for real security, the unlink/truncate command cannot return before
all data is wiped.
The unlink/truncate metadata changes must not even be committed
before all data is wiped (or at least part of the data with partial truncate).
Amir.
On 07/07/2011 05:09 PM, Amir Goldstein wrote:
> On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
> <[email protected]> wrote:
>> On 07/07/2011 12:52 PM, Andreas Dilger wrote:
>>>
>>> On 2011-07-07, at 1:05 AM, Amir Goldstein wrote:
>>>>
>>>> On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
>>>> <[email protected]> wrote:
>>>>>
>>>>> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
>>>>>>
>>>>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>>>>> <[email protected]> wrote:
>>>>>>>
>>>>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct
>>>>>>> inode *inode,
>>>>>>> ext4_debug("freeing block %llu\n", block);
>>>>>>> trace_ext4_free_blocks(inode, block, count, flags);
>>>>>>>
>>>>>>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>>>>>>> + err = sb_issue_zeroout(inode->i_sb, block, count,
>>>>>>> GFP_NOFS);
>>>>>>
>>>>>> But the delete of these blocks in not yet committed,
>>>>>> so after reboot, you can end up with a non-deleted but zeroed file
>>>>>> data.
>>>>>> Is that acceptable? I should think not.
>>>>>>
>>>>>> One way around this is a 2-phase unlink/truncate.
>>>>>> Phase 1: add to orphan list and register a callback on commit
>>>>>> Phase 2: issue zeroout and free the blocks
>>>>>>
>>>>>> This won't work for punch hole, but then again, for punch hole
>>>>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>>>>>> Right?
>>>>>
>>>>> Hi, I had a quick question about the orphan list. I notice that
>>>>> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
>>>>> ext4_orphan_add that happens really early before any calls to free
>>>>> blocks.
>>>>> Does this address your earlier concerns, or is there another reason I
>>>>> missed? Thx!
>>>>
>>>> It doesn't address the concerns of getting a non-deleted file with zeroed
>>>> data
>>>> after crash, because the existence of the inode on the orphan list after
>>>> crash
>>>> depends on the transaction that added it to the list being committed.
>>>> And your patch zeroes the blocks before that transaction is committed.
>>>>
>>>> However, the orphan list gives you a very good framework to implement
>>>> deferred delete (by a kernel thread) as Andreas suggested.
>>>> Unlink should be simple, because freeing unlinked inode blocks it is
>>>> anyway
>>>> deferred till the inode refcount drops to zero.
>>>
>>> Right. The patch that I referenced moved all of the blocks from unlink
>>> and truncate-to-zero from the current inode to a new temporary inode on
>>> the
>>> orphan list (simply copying the i_blocks field + i_block and i_size, IIRC,
>>> and zeroing them on the original inode).
>>>
>>>> Truncate is more tricky, because of the truncate shrink/extend
>>>> requirement
>>>> (that all data is zeroes after extending the inode's size via truncate
>>>> system call), so a shrinking-deferred truncate would have to mark all the
>>>> to-be-deleted extents uninitialized.
>>>
>>> It would be possible to do this for partial truncate/punch as well, to
>>> move whole blocks over to a new inode on the orphan list and zeroing only
>>> the 1 or 2 partial blocks inline.
>>>
>>> It should even be possible to leverage the "block migrate" facility used
>>> by defrag, so that we don't duplicate this code. That would mean just
>>> allocating a temp "unlink" inode in the kernel and putting it on the
>>> orphan
>>> list (like an open-unlinked file), migrate the selected range of blocks,
>>> and then zeroing the blocks in the background before unlinking the inode.
>>>
>>> I don't think that just marking the deleted extents as uninitialized is
>>> enough, since it would still leave "private" data on disk that could be
>>> read afterward. This would also only work for extent-mapped filesystems.
>>>
>>> There may need to be some work to enable the migrate code on block-mapped
>>> files, if you want to allow secure-delete on those files, but that is good
>>> IMHO since it also means that we could defrag block-mapped files.
>>>
>>> Cheers, Andreas
>>>
>>
>> Ah, ok then. Yes, part of the requirements was to make secure delete work
>> for partial truncates, punch hole, and also indexed files. So that will
>> save me some time if I can get the migrate routines work. Thx for the
>> pointers all!
>>
>
> I realized that there is a basic flaw in the concept of deferred-secure-delete.
> From a security point of view, after a crash during a secure-delete,
> if the file is not there, all its data should have been wiped.
> Orphan cleanup on the next mount may be done on a system that
> doesn't respect secure delete.
> So for real security, the unlink/truncate command cannot return before
> all data is wiped.
> The unlink/truncate metadata changes must not even be committed
> before all data is wiped (or at least part of the data with partial truncate).
>
> Amir.
I see, so then it sounds like using a background thread to do the
zeroing would not help us if we have to wait for it complete anyway.
Going back to the 2 phase approach, this means that we need to do the
zero out and then the free before we do the orphan list and commit?
Just trying to make sure I understand things correctly :)
Allison Henderson
On 2011-07-07, at 6:09 PM, Amir Goldstein wrote:
> On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
> <[email protected]> wrote:
>>
>> Ah, ok then. Yes, part of the requirements was to make secure delete work
>> for partial truncates, punch hole, and also indexed files. So that will
>> save me some time if I can get the migrate routines work. Thx for the
>> pointers all!
>
> I realized that there is a basic flaw in the concept of deferred-secure-delete.
> From a security point of view, after a crash during a secure-delete,
> if the file is not there, all its data should have been wiped.
> Orphan cleanup on the next mount may be done on a system that
> doesn't respect secure delete.
> So for real security, the unlink/truncate command cannot return before
> all data is wiped.
I don't necessarily agree. People who are using secure delete don't
necessarily want their performance to go into the toilet, which is what
would happen if each unlink/zero happened sequentially. It is far more
efficient to submit a large batch of unlink/zero operations and then do
an sync() or fsync() at the end. This allows multiple journal operations
to be coalesced (e.g. unlinks from directory) and block zero requests to
be merged.
> The unlink/truncate metadata changes must not even be committed
> before all data is wiped (or at least part of the data with partial truncate).
That depends on the user, I think. If someone does "rm -r {dir}" it may
be better that the files are removed from the namespace more quickly, and
secure deleted (in a batch) more quickly, than having "rm" wait for each
file to be unlinked and maybe leave files in the namespace that didn't
even get a chance to be unlinked.
Cheers, Andreas
On 07/08/2011 03:46 AM, Andreas Dilger wrote:
> On 2011-07-07, at 6:09 PM, Amir Goldstein wrote:
>> On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
>> <[email protected]> wrote:
>>> Ah, ok then. Yes, part of the requirements was to make secure delete work
>>> for partial truncates, punch hole, and also indexed files. So that will
>>> save me some time if I can get the migrate routines work. Thx for the
>>> pointers all!
>> I realized that there is a basic flaw in the concept of deferred-secure-delete.
>> From a security point of view, after a crash during a secure-delete,
>> if the file is not there, all its data should have been wiped.
>> Orphan cleanup on the next mount may be done on a system that
>> doesn't respect secure delete.
>> So for real security, the unlink/truncate command cannot return before
>> all data is wiped.
> I don't necessarily agree. People who are using secure delete don't
> necessarily want their performance to go into the toilet, which is what
> would happen if each unlink/zero happened sequentially. It is far more
> efficient to submit a large batch of unlink/zero operations and then do
> an sync() or fsync() at the end. This allows multiple journal operations
> to be coalesced (e.g. unlinks from directory) and block zero requests to
> be merged.
>
I think that secure delete should be synchronous, slow and 100% crash proof.
When you unlink a "secure delete" file, you should not be able to access the raw
disk and find the data anywhere that used to be there.
This is an unusual operation and *most* users do not need it or care. Definitely
not something that should happen by default.
In my experience having helped provide this feature in the past, people usually
ended up doing something in user space (i.e., move the file to a directory where
it will be shredded and then unlinked). This is similar to the scheme you
outlined for a kernel implementation, but has the advantage that you could still
see the original file name somewhere if you wanted to check to see if it was
gone....
Ric
>> The unlink/truncate metadata changes must not even be committed
>> before all data is wiped (or at least part of the data with partial truncate).
> That depends on the user, I think. If someone does "rm -r {dir}" it may
> be better that the files are removed from the namespace more quickly, and
> secure deleted (in a batch) more quickly, than having "rm" wait for each
> file to be unlinked and maybe leave files in the namespace that didn't
> even get a chance to be unlinked.
>
> Cheers, Andreas
>
>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jul 8, 2011 at 5:46 AM, Andreas Dilger <[email protected]> wrote:
> On 2011-07-07, at 6:09 PM, Amir Goldstein wrote:
>> On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
>> <[email protected]> wrote:
>>>
>>> Ah, ok then. ?Yes, part of the requirements was to make secure delete work
>>> for partial truncates, punch hole, and also indexed files. ?So that will
>>> save me some time if I can get the migrate routines work. ?Thx for the
>>> pointers all!
>>
>> I realized that there is a basic flaw in the concept of deferred-secure-delete.
>> From a security point of view, after a crash during a secure-delete,
>> if the file is not there, all its data should have been wiped.
>> Orphan cleanup on the next mount may be done on a system that
>> doesn't respect secure delete.
>> So for real security, the unlink/truncate command cannot return before
>> all data is wiped.
>
> I don't necessarily agree. ?People who are using secure delete don't
> necessarily want their performance to go into the toilet, which is what
> would happen if each unlink/zero happened sequentially. ?It is far more
> efficient to submit a large batch of unlink/zero operations and then do
> an sync() or fsync() at the end. ?This allows multiple journal operations
> to be coalesced (e.g. unlinks from directory) and block zero requests to
> be merged.
>
>> The unlink/truncate metadata changes must not even be committed
>> before all data is wiped (or at least part of the data with partial truncate).
>
> That depends on the user, I think. ?If someone does "rm -r {dir}" it may
> be better that the files are removed from the namespace more quickly, and
> secure deleted (in a batch) more quickly, than having "rm" wait for each
> file to be unlinked and maybe leave files in the namespace that didn't
> even get a chance to be unlinked.
>
if my system crashes in the middle of "rm -rf my_darkest_secrets/"
I want to be able to see if there are leftovers after reboot, therefore
the directory and files cannot be removed from namespace before
I get the secured delete guaranty.
Amir.
On Fri, Jul 8, 2011 at 4:55 AM, Allison Henderson
<[email protected]> wrote:
> On 07/07/2011 05:09 PM, Amir Goldstein wrote:
>>
>> On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
>> <[email protected]> ?wrote:
>>>
>>> On 07/07/2011 12:52 PM, Andreas Dilger wrote:
>>>>
>>>> On 2011-07-07, at 1:05 AM, Amir Goldstein wrote:
>>>>>
>>>>> On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
>>>>> <[email protected]> ? ?wrote:
>>>>>>
>>>>>> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
>>>>>>>
>>>>>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>>>>>> <[email protected]> ? ? wrote:
>>>>>>>>
>>>>>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle,
>>>>>>>> struct
>>>>>>>> inode *inode,
>>>>>>>> ? ? ? ?ext4_debug("freeing block %llu\n", block);
>>>>>>>> ? ? ? ?trace_ext4_free_blocks(inode, block, count, flags);
>>>>>>>>
>>>>>>>> + ? ? ? if (flags& ? ? EXT4_FREE_BLOCKS_ZERO) {
>>>>>>>> + ? ? ? ? ? ? ? err = sb_issue_zeroout(inode->i_sb, block, count,
>>>>>>>> GFP_NOFS);
>>>>>>>
>>>>>>> But the delete of these blocks in not yet committed,
>>>>>>> so after reboot, you can end up with a non-deleted but zeroed file
>>>>>>> data.
>>>>>>> Is that acceptable? I should think not.
>>>>>>>
>>>>>>> One way around this is a 2-phase unlink/truncate.
>>>>>>> Phase 1: add to orphan list and register a callback on commit
>>>>>>> Phase 2: issue zeroout and free the blocks
>>>>>>>
>>>>>>> This won't work for punch hole, but then again, for punch hole
>>>>>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>>>>>>> Right?
>>>>>>
>>>>>> Hi, I had a quick question about the orphan list. ?I notice that
>>>>>> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
>>>>>> ext4_orphan_add that happens really early before any calls to free
>>>>>> blocks.
>>>>>> ?Does this address your earlier concerns, or is there another reason I
>>>>>> missed? ?Thx!
>>>>>
>>>>> It doesn't address the concerns of getting a non-deleted file with
>>>>> zeroed
>>>>> data
>>>>> after crash, because the existence of the inode on the orphan list
>>>>> after
>>>>> crash
>>>>> depends on the transaction that added it to the list being committed.
>>>>> And your patch zeroes the blocks before that transaction is committed.
>>>>>
>>>>> However, the orphan list gives you a very good framework to implement
>>>>> deferred delete (by a kernel thread) as Andreas suggested.
>>>>> Unlink should be simple, because freeing unlinked inode blocks it is
>>>>> anyway
>>>>> deferred till the inode refcount drops to zero.
>>>>
>>>> Right. ?The patch that I referenced moved all of the blocks from unlink
>>>> and truncate-to-zero from the current inode to a new temporary inode on
>>>> the
>>>> orphan list (simply copying the i_blocks field + i_block and i_size,
>>>> IIRC,
>>>> and zeroing them on the original inode).
>>>>
>>>>> Truncate is more tricky, because of the truncate shrink/extend
>>>>> requirement
>>>>> (that all data is zeroes after extending the inode's size via truncate
>>>>> system call), so a shrinking-deferred truncate would have to mark all
>>>>> the
>>>>> to-be-deleted extents uninitialized.
>>>>
>>>> It would be possible to do this for partial truncate/punch as well, to
>>>> move whole blocks over to a new inode on the orphan list and zeroing
>>>> only
>>>> the 1 or 2 partial blocks inline.
>>>>
>>>> It should even be possible to leverage the "block migrate" facility used
>>>> by defrag, so that we don't duplicate this code. ?That would mean just
>>>> allocating a temp "unlink" inode in the kernel and putting it on the
>>>> orphan
>>>> list (like an open-unlinked file), migrate the selected range of blocks,
>>>> and then zeroing the blocks in the background before unlinking the
>>>> inode.
>>>>
>>>> I don't think that just marking the deleted extents as uninitialized is
>>>> enough, since it would still leave "private" data on disk that could be
>>>> read afterward. ?This would also only work for extent-mapped
>>>> filesystems.
>>>>
>>>> There may need to be some work to enable the migrate code on
>>>> block-mapped
>>>> files, if you want to allow secure-delete on those files, but that is
>>>> good
>>>> IMHO since it also means that we could defrag block-mapped files.
>>>>
>>>> Cheers, Andreas
>>>>
>>>
>>> Ah, ok then. ?Yes, part of the requirements was to make secure delete
>>> work
>>> for partial truncates, punch hole, and also indexed files. ?So that will
>>> save me some time if I can get the migrate routines work. ?Thx for the
>>> pointers all!
>>>
>>
>> I realized that there is a basic flaw in the concept of
>> deferred-secure-delete.
>> ?From a security point of view, after a crash during a secure-delete,
>> if the file is not there, all its data should have been wiped.
>> Orphan cleanup on the next mount may be done on a system that
>> doesn't respect secure delete.
>> So for real security, the unlink/truncate command cannot return before
>> all data is wiped.
>> The unlink/truncate metadata changes must not even be committed
>> before all data is wiped (or at least part of the data with partial
>> truncate).
>>
>> Amir.
>
>
> I see, so then it sounds like using a background thread to do the zeroing
> would not help us if we have to wait for it complete anyway. Going back to
> the 2 phase approach, this means that we need to do the zero out and then
> the free before we do the orphan list and commit? Just trying to make sure I
> understand things correctly :)
>
Well, that really depends of the precise definition of "secure delete".
If you agree with the "100% secure" interpretation, then your current patch
is "almost" correct.
I can see 2 things that are missing:
1. ext4_unlink() will have to invoke ext4_truncate(0) directly just
like truncate system call does.
This is to prevent an attacker from keeping the protected file open
and preventing freeing
of it's data when the file is removed from the name space.
2. ext4_truncate() currently changes i_disksize first (and adds inode
to orphan list)
and then frees the blocks. for 100% secure delete, you cannot change i_disksize
before zeroing the blocks, so it has to be:
- zeroout range
- change i_disksize and add to orphan list
- free blocks
I don't see how it could be done any other way, but maybe someone else can...
Amir.
On Fri, 2011-07-08 at 03:09 +0300, Amir Goldstein wrote:
> On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
> <[email protected]> wrote:
> > On 07/07/2011 12:52 PM, Andreas Dilger wrote:
> >>
> >> On 2011-07-07, at 1:05 AM, Amir Goldstein wrote:
> >>>
> >>> On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
> >>> <[email protected]> wrote:
> >>>>
> >>>> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
> >>>>>
> >>>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
> >>>>> <[email protected]> wrote:
> >>>>>>
> >>>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle, struct
> >>>>>> inode *inode,
> >>>>>> ext4_debug("freeing block %llu\n", block);
> >>>>>> trace_ext4_free_blocks(inode, block, count, flags);
> >>>>>>
> >>>>>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
> >>>>>> + err = sb_issue_zeroout(inode->i_sb, block, count,
> >>>>>> GFP_NOFS);
> >>>>>
> >>>>> But the delete of these blocks in not yet committed,
> >>>>> so after reboot, you can end up with a non-deleted but zeroed file
> >>>>> data.
> >>>>> Is that acceptable? I should think not.
> >>>>>
> >>>>> One way around this is a 2-phase unlink/truncate.
> >>>>> Phase 1: add to orphan list and register a callback on commit
> >>>>> Phase 2: issue zeroout and free the blocks
> >>>>>
> >>>>> This won't work for punch hole, but then again, for punch hole
> >>>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
> >>>>> Right?
> >>>>
> >>>> Hi, I had a quick question about the orphan list. I notice that
> >>>> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
> >>>> ext4_orphan_add that happens really early before any calls to free
> >>>> blocks.
> >>>> Does this address your earlier concerns, or is there another reason I
> >>>> missed? Thx!
> >>>
> >>> It doesn't address the concerns of getting a non-deleted file with zeroed
> >>> data
> >>> after crash, because the existence of the inode on the orphan list after
> >>> crash
> >>> depends on the transaction that added it to the list being committed.
> >>> And your patch zeroes the blocks before that transaction is committed.
> >>>
> >>> However, the orphan list gives you a very good framework to implement
> >>> deferred delete (by a kernel thread) as Andreas suggested.
> >>> Unlink should be simple, because freeing unlinked inode blocks it is
> >>> anyway
> >>> deferred till the inode refcount drops to zero.
> >>
> >> Right. The patch that I referenced moved all of the blocks from unlink
> >> and truncate-to-zero from the current inode to a new temporary inode on
> >> the
> >> orphan list (simply copying the i_blocks field + i_block and i_size, IIRC,
> >> and zeroing them on the original inode).
> >>
> >>> Truncate is more tricky, because of the truncate shrink/extend
> >>> requirement
> >>> (that all data is zeroes after extending the inode's size via truncate
> >>> system call), so a shrinking-deferred truncate would have to mark all the
> >>> to-be-deleted extents uninitialized.
> >>
> >> It would be possible to do this for partial truncate/punch as well, to
> >> move whole blocks over to a new inode on the orphan list and zeroing only
> >> the 1 or 2 partial blocks inline.
> >>
> >> It should even be possible to leverage the "block migrate" facility used
> >> by defrag, so that we don't duplicate this code. That would mean just
> >> allocating a temp "unlink" inode in the kernel and putting it on the
> >> orphan
> >> list (like an open-unlinked file), migrate the selected range of blocks,
> >> and then zeroing the blocks in the background before unlinking the inode.
> >>
> >> I don't think that just marking the deleted extents as uninitialized is
> >> enough, since it would still leave "private" data on disk that could be
> >> read afterward. This would also only work for extent-mapped filesystems.
> >>
> >> There may need to be some work to enable the migrate code on block-mapped
> >> files, if you want to allow secure-delete on those files, but that is good
> >> IMHO since it also means that we could defrag block-mapped files.
> >>
> >> Cheers, Andreas
> >>
> >
> > Ah, ok then. Yes, part of the requirements was to make secure delete work
> > for partial truncates, punch hole, and also indexed files. So that will
> > save me some time if I can get the migrate routines work. Thx for the
> > pointers all!
> >
>
> I realized that there is a basic flaw in the concept of deferred-secure-delete.
> From a security point of view, after a crash during a secure-delete,
> if the file is not there, all its data should have been wiped.
> Orphan cleanup on the next mount may be done on a system that
> doesn't respect secure delete.
> So for real security, the unlink/truncate command cannot return before
> all data is wiped.
I agree. I think the user who expect secure delete will be expecting the
data being completely wiped off from disk, instead of wondering when the
OS/fs will really get rid of the data on the hidden inode by background
thread. Secure delete should be synchronous.
> The unlink/truncate metadata changes must not even be committed
> before all data is wiped (or at least part of the data with partial truncate).
>
> Amir.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07/07/2011 11:29 PM, Amir Goldstein wrote:
> On Fri, Jul 8, 2011 at 4:55 AM, Allison Henderson
> <[email protected]> wrote:
>> On 07/07/2011 05:09 PM, Amir Goldstein wrote:
>>>
>>> On Thu, Jul 7, 2011 at 11:19 PM, Allison Henderson
>>> <[email protected]> wrote:
>>>>
>>>> On 07/07/2011 12:52 PM, Andreas Dilger wrote:
>>>>>
>>>>> On 2011-07-07, at 1:05 AM, Amir Goldstein wrote:
>>>>>>
>>>>>> On Thu, Jul 7, 2011 at 12:05 AM, Allison Henderson
>>>>>> <[email protected]> wrote:
>>>>>>>
>>>>>>> On 07/02/2011 02:33 AM, Amir Goldstein wrote:
>>>>>>>>
>>>>>>>> On Fri, Jul 1, 2011 at 12:22 AM, Allison Henderson
>>>>>>>> <[email protected]> wrote:
>>>>>>>>>
>>>>>>>>> @@ -4485,6 +4485,14 @@ void ext4_free_blocks(handle_t *handle,
>>>>>>>>> struct
>>>>>>>>> inode *inode,
>>>>>>>>> ext4_debug("freeing block %llu\n", block);
>>>>>>>>> trace_ext4_free_blocks(inode, block, count, flags);
>>>>>>>>>
>>>>>>>>> + if (flags& EXT4_FREE_BLOCKS_ZERO) {
>>>>>>>>> + err = sb_issue_zeroout(inode->i_sb, block, count,
>>>>>>>>> GFP_NOFS);
>>>>>>>>
>>>>>>>> But the delete of these blocks in not yet committed,
>>>>>>>> so after reboot, you can end up with a non-deleted but zeroed file
>>>>>>>> data.
>>>>>>>> Is that acceptable? I should think not.
>>>>>>>>
>>>>>>>> One way around this is a 2-phase unlink/truncate.
>>>>>>>> Phase 1: add to orphan list and register a callback on commit
>>>>>>>> Phase 2: issue zeroout and free the blocks
>>>>>>>>
>>>>>>>> This won't work for punch hole, but then again, for punch hole
>>>>>>>> it's probably OK to end up with zeroed data, but non-deleted blocks.
>>>>>>>> Right?
>>>>>>>
>>>>>>> Hi, I had a quick question about the orphan list. I notice that
>>>>>>> ext4_ext_truncate and also ext4_ext_punch_hole already have a call to
>>>>>>> ext4_orphan_add that happens really early before any calls to free
>>>>>>> blocks.
>>>>>>> Does this address your earlier concerns, or is there another reason I
>>>>>>> missed? Thx!
>>>>>>
>>>>>> It doesn't address the concerns of getting a non-deleted file with
>>>>>> zeroed
>>>>>> data
>>>>>> after crash, because the existence of the inode on the orphan list
>>>>>> after
>>>>>> crash
>>>>>> depends on the transaction that added it to the list being committed.
>>>>>> And your patch zeroes the blocks before that transaction is committed.
>>>>>>
>>>>>> However, the orphan list gives you a very good framework to implement
>>>>>> deferred delete (by a kernel thread) as Andreas suggested.
>>>>>> Unlink should be simple, because freeing unlinked inode blocks it is
>>>>>> anyway
>>>>>> deferred till the inode refcount drops to zero.
>>>>>
>>>>> Right. The patch that I referenced moved all of the blocks from unlink
>>>>> and truncate-to-zero from the current inode to a new temporary inode on
>>>>> the
>>>>> orphan list (simply copying the i_blocks field + i_block and i_size,
>>>>> IIRC,
>>>>> and zeroing them on the original inode).
>>>>>
>>>>>> Truncate is more tricky, because of the truncate shrink/extend
>>>>>> requirement
>>>>>> (that all data is zeroes after extending the inode's size via truncate
>>>>>> system call), so a shrinking-deferred truncate would have to mark all
>>>>>> the
>>>>>> to-be-deleted extents uninitialized.
>>>>>
>>>>> It would be possible to do this for partial truncate/punch as well, to
>>>>> move whole blocks over to a new inode on the orphan list and zeroing
>>>>> only
>>>>> the 1 or 2 partial blocks inline.
>>>>>
>>>>> It should even be possible to leverage the "block migrate" facility used
>>>>> by defrag, so that we don't duplicate this code. That would mean just
>>>>> allocating a temp "unlink" inode in the kernel and putting it on the
>>>>> orphan
>>>>> list (like an open-unlinked file), migrate the selected range of blocks,
>>>>> and then zeroing the blocks in the background before unlinking the
>>>>> inode.
>>>>>
>>>>> I don't think that just marking the deleted extents as uninitialized is
>>>>> enough, since it would still leave "private" data on disk that could be
>>>>> read afterward. This would also only work for extent-mapped
>>>>> filesystems.
>>>>>
>>>>> There may need to be some work to enable the migrate code on
>>>>> block-mapped
>>>>> files, if you want to allow secure-delete on those files, but that is
>>>>> good
>>>>> IMHO since it also means that we could defrag block-mapped files.
>>>>>
>>>>> Cheers, Andreas
>>>>>
>>>>
>>>> Ah, ok then. Yes, part of the requirements was to make secure delete
>>>> work
>>>> for partial truncates, punch hole, and also indexed files. So that will
>>>> save me some time if I can get the migrate routines work. Thx for the
>>>> pointers all!
>>>>
>>>
>>> I realized that there is a basic flaw in the concept of
>>> deferred-secure-delete.
>>> From a security point of view, after a crash during a secure-delete,
>>> if the file is not there, all its data should have been wiped.
>>> Orphan cleanup on the next mount may be done on a system that
>>> doesn't respect secure delete.
>>> So for real security, the unlink/truncate command cannot return before
>>> all data is wiped.
>>> The unlink/truncate metadata changes must not even be committed
>>> before all data is wiped (or at least part of the data with partial
>>> truncate).
>>>
>>> Amir.
>>
>>
>> I see, so then it sounds like using a background thread to do the zeroing
>> would not help us if we have to wait for it complete anyway. Going back to
>> the 2 phase approach, this means that we need to do the zero out and then
>> the free before we do the orphan list and commit? Just trying to make sure I
>> understand things correctly :)
>>
>
> Well, that really depends of the precise definition of "secure delete".
> If you agree with the "100% secure" interpretation, then your current patch
> is "almost" correct.
> I can see 2 things that are missing:
> 1. ext4_unlink() will have to invoke ext4_truncate(0) directly just
> like truncate system call does.
> This is to prevent an attacker from keeping the protected file open
> and preventing freeing
> of it's data when the file is removed from the name space.
> 2. ext4_truncate() currently changes i_disksize first (and adds inode
> to orphan list)
> and then frees the blocks. for 100% secure delete, you cannot change i_disksize
> before zeroing the blocks, so it has to be:
> - zeroout range
> - change i_disksize and add to orphan list
> - free blocks
>
> I don't see how it could be done any other way, but maybe someone else can...
>
> Amir.
Ok, I seem to have a lot voices that are interested in the synchronous
implementation, and I think it is a reasonable design too. So I am
going to move forward with this, but I will keep an eye on this thread.
I suppose if we get more voices that are interested in the
asynchronous approach we could always add it as an optimization later.
Thx all for your feed back!! :)
Allison Henderson
On 2011-07-08, at 12:20 PM, Mingming Cao wrote:
> On Fri, 2011-07-08 at 03:09 +0300, Amir Goldstein wrote:
>> I realized that there is a basic flaw in the concept of deferred-secure-delete.
>> From a security point of view, after a crash during a secure-delete,
>> if the file is not there, all its data should have been wiped.
>> Orphan cleanup on the next mount may be done on a system that
>> doesn't respect secure delete.
>> So for real security, the unlink/truncate command cannot return before
>> all data is wiped.
>
> I agree. I think the user who expect secure delete will be expecting the
> data being completely wiped off from disk, instead of wondering when the
> OS/fs will really get rid of the data on the hidden inode by background
> thread. Secure delete should be synchronous.
I'm not going to argue further for async secure delete, but just wanted
to point out that userspace can determine when the "shred" is safely done
on disk (or any other operation for that matter) by doing a sync afterward.
It wouldn't have to "wonder" about anything.
My original proposal for using the delete thread included having sync()
block until all of the background secure unlink/overwrite operations were
finished. That would allow deleting many files at one time, and then
sending all of the requests to the disk more efficiently.
I am just imagining some Enron accountant sweating for hours as his sync
secure-delete is running at 50-100 files/sec (seek limit if there are
1 or 2 seeks/file) on a filesystem with 1M files in it, instead of being
able to delete 50000 files/sec asynchronously and wait a few minutes at
the end as the sync completes. ;-)
A better solution is to just encrypt the data with a per-inode key and
then just overwrite the inode securely when it is unlinked, so when the
key is erased the data is unrecoverable. I imagine that ecryptfs or
similar might do something like that (I don't know much about it, honestly).
Note that this should probably get an EXT4_FEATURE_COMPAT_SECDEL flag, so
e2fsck knows to also wipe secure-delete files when they are unlinked due
to inode corruption, or similar.
This reminds me I also have an e2fsck patch that we've been carrying for a
few years for shared block handling that allows e2fsck to optionally move
inodes to lost+found, delete them, or wipe the shared blocks option. This
is necessary to avoid data leakage between users in case there is some
corruption that links one user's inode to another user's blocks. I'll send
that in another email.
Cheers, Andreas
On 07/09/2011 12:49 AM, Andreas Dilger wrote:
> On 2011-07-08, at 12:20 PM, Mingming Cao wrote:
>> On Fri, 2011-07-08 at 03:09 +0300, Amir Goldstein wrote:
>>> I realized that there is a basic flaw in the concept of deferred-secure-delete.
>>> From a security point of view, after a crash during a secure-delete,
>>> if the file is not there, all its data should have been wiped.
>>> Orphan cleanup on the next mount may be done on a system that
>>> doesn't respect secure delete.
>>> So for real security, the unlink/truncate command cannot return before
>>> all data is wiped.
>> I agree. I think the user who expect secure delete will be expecting the
>> data being completely wiped off from disk, instead of wondering when the
>> OS/fs will really get rid of the data on the hidden inode by background
>> thread. Secure delete should be synchronous.
> I'm not going to argue further for async secure delete, but just wanted
> to point out that userspace can determine when the "shred" is safely done
> on disk (or any other operation for that matter) by doing a sync afterward.
> It wouldn't have to "wonder" about anything.
>
> My original proposal for using the delete thread included having sync()
> block until all of the background secure unlink/overwrite operations were
> finished. That would allow deleting many files at one time, and then
> sending all of the requests to the disk more efficiently.
>
>
> I am just imagining some Enron accountant sweating for hours as his sync
> secure-delete is running at 50-100 files/sec (seek limit if there are
> 1 or 2 seeks/file) on a filesystem with 1M files in it, instead of being
> able to delete 50000 files/sec asynchronously and wait a few minutes at
> the end as the sync completes. ;-)
>
> A better solution is to just encrypt the data with a per-inode key and
> then just overwrite the inode securely when it is unlinked, so when the
> key is erased the data is unrecoverable. I imagine that ecryptfs or
> similar might do something like that (I don't know much about it, honestly).
>
>
> Note that this should probably get an EXT4_FEATURE_COMPAT_SECDEL flag, so
> e2fsck knows to also wipe secure-delete files when they are unlinked due
> to inode corruption, or similar.
>
> This reminds me I also have an e2fsck patch that we've been carrying for a
> few years for shared block handling that allows e2fsck to optionally move
> inodes to lost+found, delete them, or wipe the shared blocks option. This
> is necessary to avoid data leakage between users in case there is some
> corruption that links one user's inode to another user's blocks. I'll send
> that in another email.
>
> Cheers, Andreas
>
Just to wrap up this thread, I will throw out some of the use cases that I have
seen:
(1) In some parts of the world, an employer has a hard requirement to eliminate
records in so many days after termination for any reason (the EU iirc)
(2) legal or data retention requirements. Sarbanes/Oxley (aka SOX) in the US
requires firms to retain data on trades for a specified amount of days. In a
similar way, email is often required to be retained during any legal proceedings
(3) "Scrubbing" a whole disk of data before an upgrade/return to the vendor
(usually done by wiping disks, not removing/scrubbing individual files).
As you point out, performance is a critical aspect of several of these use
cases. Imagine the shifty trader watching the clock to see when they can start
deleting electronic evidence of shady trades :) That said, the promise has to
be that there is no shadow of data left on disk (even after a crash or recovery).
I think that the synchronous method is the easy and obvious way to do this for
most use cases, but as you say, you can always use "shred" from user space to do
something a bit different....
Ric
On Fri, Jul 08, 2011 at 09:29:16AM +0300, Amir Goldstein wrote:
>
> Well, that really depends of the precise definition of "secure delete".
> If you agree with the "100% secure" interpretation, then your current patch
> is "almost" correct.
> I can see 2 things that are missing:
> 1. ext4_unlink() will have to invoke ext4_truncate(0) directly just
> like truncate system call does.
> This is to prevent an attacker from keeping the protected file open
> and preventing freeing
> of it's data when the file is removed from the name space.
Um, no. This breaks Unix semantics, which is going to cause any kind
of headaches, especially if someone sets the secure delete flag on a
directory, and then a hapless application creates a file in said
directory expecting standard Posix semantics.
If the attacker has a handle on the file, he can just simply copy out
the data to another file; if you're worried about someone who manages
to open the file just 1 millisecond before you do the delete (as
opposed 100 milliseconds, when he would have copy out the file), I
don't think it's worth it.
> 2. ext4_truncate() currently changes i_disksize first (and adds inode
> to orphan list)
> and then frees the blocks. for 100% secure delete, you cannot change i_disksize
> before zeroing the blocks, so it has to be:
> - zeroout range
> - change i_disksize and add to orphan list
> - free blocks
What are you worried about here?
- Ted
On Sun, Jul 10, 2011 at 09:19:58AM +0100, Ric Wheeler wrote:
> Just to wrap up this thread, I will throw out some of the use cases
> that I have seen....
Unless we clearly articulate what use case we are hoping to address, I
have to admit I'm a little dubious about whether it's worth it to add
"secure delete". There are plenty of other solutions, including
user-space shred, destruction of an encryption key, etc. All of these
solutions have tradeoffs between performance and security.
So if we're going to implement something, we should think very
carefully about what problem we are hoping to solve, and what sort of
adversaries/threat environment where we'd think this would be useful.
I'll observe that in many cases, where you have the sweating Enron
executive trying to destroy evidence, they're going to be thwarted by
automatic backup policies. This is also true BTW if you're worried
about employment records --- and pawing through several terabytes of
backup tapes to delete (only) the employee records for L?o Apotheker
Platner after he resigned from SAG AG would really be unpleasant. :-)
And of course, if you are using devices such as SSD's or
thin-provisioned devices, file-system level erasure may not really do
a lot of your anyway, even if you are using discard.
So --- does anyone have some thoughts about how this would actually
used by potential customers? If not, my vote would be to keep things
as simple as possible, and if it's too complicated, to think carefully
about whether it's worth it to (re)-add this feature.
- Ted
On 07/11/2011 12:33 AM, Ted Ts'o wrote:
> On Sun, Jul 10, 2011 at 09:19:58AM +0100, Ric Wheeler wrote:
>> Just to wrap up this thread, I will throw out some of the use cases
>> that I have seen....
> Unless we clearly articulate what use case we are hoping to address, I
> have to admit I'm a little dubious about whether it's worth it to add
> "secure delete". There are plenty of other solutions, including
> user-space shred, destruction of an encryption key, etc. All of these
> solutions have tradeoffs between performance and security.
>
> So if we're going to implement something, we should think very
> carefully about what problem we are hoping to solve, and what sort of
> adversaries/threat environment where we'd think this would be useful.
>
> I'll observe that in many cases, where you have the sweating Enron
> executive trying to destroy evidence, they're going to be thwarted by
> automatic backup policies. This is also true BTW if you're worried
> about employment records --- and pawing through several terabytes of
> backup tapes to delete (only) the employee records for L?o Apotheker
> Platner after he resigned from SAG AG would really be unpleasant. :-)
>
> And of course, if you are using devices such as SSD's or
> thin-provisioned devices, file-system level erasure may not really do
> a lot of your anyway, even if you are using discard.
>
> So --- does anyone have some thoughts about how this would actually
> used by potential customers? If not, my vote would be to keep things
> as simple as possible, and if it's too complicated, to think carefully
> about whether it's worth it to (re)-add this feature.
>
> - Ted
I do think that the synchronous secure delete is useful to have, even if slow.
That said, as you point out, there are lots of ways that this will fail
potentially, including:
* you might have copied a file or had blocks paged out that leave a "ghost" trace
* a simple secure delete that overwrites with zeros is not "sufficient" to erase
tracks for some users (look at the multi-pass options shred does for example)
* ssd's or other devices do wear levelling and move data around internally so
you might be able to rip the device apart and look at the raw flash and recover data
That said, for all normal users, I do think that the zero out is still useful
and reasonable. The simple goal is that once securely deleting, your sys admin
cannot use recovery tools or scan a block device and see your deleted file's
data blocks.
Ric
On Mon, 11 Jul 2011, Ric Wheeler wrote:
> On 07/11/2011 12:33 AM, Ted Ts'o wrote:
> > On Sun, Jul 10, 2011 at 09:19:58AM +0100, Ric Wheeler wrote:
> > > Just to wrap up this thread, I will throw out some of the use cases
> > > that I have seen....
> > Unless we clearly articulate what use case we are hoping to address, I
> > have to admit I'm a little dubious about whether it's worth it to add
> > "secure delete". There are plenty of other solutions, including
> > user-space shred, destruction of an encryption key, etc. All of these
> > solutions have tradeoffs between performance and security.
> >
> > So if we're going to implement something, we should think very
> > carefully about what problem we are hoping to solve, and what sort of
> > adversaries/threat environment where we'd think this would be useful.
> >
> > I'll observe that in many cases, where you have the sweating Enron
> > executive trying to destroy evidence, they're going to be thwarted by
> > automatic backup policies. This is also true BTW if you're worried
> > about employment records --- and pawing through several terabytes of
> > backup tapes to delete (only) the employee records for L?o Apotheker
> > Platner after he resigned from SAG AG would really be unpleasant. :-)
> >
> > And of course, if you are using devices such as SSD's or
> > thin-provisioned devices, file-system level erasure may not really do
> > a lot of your anyway, even if you are using discard.
> >
> > So --- does anyone have some thoughts about how this would actually
> > used by potential customers? If not, my vote would be to keep things
> > as simple as possible, and if it's too complicated, to think carefully
> > about whether it's worth it to (re)-add this feature.
> >
> > - Ted
>
> I do think that the synchronous secure delete is useful to have, even if slow.
>
> That said, as you point out, there are lots of ways that this will fail
> potentially, including:
>
> * you might have copied a file or had blocks paged out that leave a "ghost"
> trace
>
> * a simple secure delete that overwrites with zeros is not "sufficient" to
> erase tracks for some users (look at the multi-pass options shred does for
> example)
>
> * ssd's or other devices do wear levelling and move data around internally so
> you might be able to rip the device apart and look at the raw flash and
> recover data
This is my concern as well. Allison I think that if you are going to do
next spin of the patches it would be nice to detect whether the device
supports secure discard or at least regular discard is zeroing the data,
because otherwise just plain overwrite with zeroes is not going to help
at all. SSD will use some other blocks and the data will be still there,
so there is no point in doing overwrite.
However if you do use discard (not sure what secure discard is actually
doing) then you're still not guaranteed to have everything wiped out,
because there might still be some parts of the file in other places of
the device due the previous rewrites. You'll have to use fitrim to get
rid of the other parts, but still there might be some data left in the
user restricted area of the device which the fw uses for better wear-
leveling.
Thanks!
-Lukas
>
> That said, for all normal users, I do think that the zero out is still useful
> and reasonable. The simple goal is that once securely deleting, your sys
> admin cannot use recovery tools or scan a block device and see your deleted
> file's data blocks.
>
> Ric
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
On Mon, Jul 11, 2011 at 2:13 AM, Ted Ts'o <[email protected]> wrote:
> On Fri, Jul 08, 2011 at 09:29:16AM +0300, Amir Goldstein wrote:
>>
>> Well, that really depends of the precise definition of "secure delete".
>> If you agree with the "100% secure" interpretation, then your current patch
>> is "almost" correct.
>> I can see 2 things that are missing:
>> 1. ext4_unlink() will have to invoke ext4_truncate(0) directly just
>> like truncate system call does.
>> This is to prevent an attacker from keeping the protected file open
>> and preventing freeing
>> of it's data when the file is removed from the name space.
>
> Um, no. ?This breaks Unix semantics, which is going to cause any kind
> of headaches, especially if someone sets the secure delete flag on a
> directory, and then a hapless application creates a file in said
> directory expecting standard Posix semantics.
>
> If the attacker has a handle on the file, he can just simply copy out
> the data to another file; if you're worried about someone who manages
> to open the file just 1 millisecond before you do the delete (as
> opposed 100 milliseconds, when he would have copy out the file), I
> don't think it's worth it.
Maybe it's an attacker and maybe the user just doesn't know if an application
has a loose handle for this file.
In both cases, "truncate -s 0 mysecret && rm mysecret" would be more
full proof than "rm mysecret".
I do agree it makes more sense to do it with userspace tools in the first place.
>
>> 2. ext4_truncate() currently changes i_disksize first (and adds inode
>> to orphan list)
>> and then frees the blocks. for 100% secure delete, you cannot change i_disksize
>> before zeroing the blocks, so it has to be:
>> - zeroout range
>> - change i_disksize and add to orphan list
>> - free blocks
>
> What are you worried about here?
I am worried (but it doesn't keep me up at night) from orphan cleanup
after crash in the
middle of secure truncate.
Sure, if you boot with the same kernel, orphan cleanup will securely
truncate the rest
of the file, but another kernel may also non-securely truncate the
file and the accountant
will not know what hit him on judgement day.
Anyway, I don't care much for security nor for accountants, so no need
to convince me
one way or the other.
Amir.
On 07/11/2011 01:20 AM, Lukas Czerner wrote:
> On Mon, 11 Jul 2011, Ric Wheeler wrote:
>
>> On 07/11/2011 12:33 AM, Ted Ts'o wrote:
>>> On Sun, Jul 10, 2011 at 09:19:58AM +0100, Ric Wheeler wrote:
>>>> Just to wrap up this thread, I will throw out some of the use cases
>>>> that I have seen....
>>> Unless we clearly articulate what use case we are hoping to address, I
>>> have to admit I'm a little dubious about whether it's worth it to add
>>> "secure delete". There are plenty of other solutions, including
>>> user-space shred, destruction of an encryption key, etc. All of these
>>> solutions have tradeoffs between performance and security.
>>>
>>> So if we're going to implement something, we should think very
>>> carefully about what problem we are hoping to solve, and what sort of
>>> adversaries/threat environment where we'd think this would be useful.
>>>
>>> I'll observe that in many cases, where you have the sweating Enron
>>> executive trying to destroy evidence, they're going to be thwarted by
>>> automatic backup policies. This is also true BTW if you're worried
>>> about employment records --- and pawing through several terabytes of
>>> backup tapes to delete (only) the employee records for L?o Apotheker
>>> Platner after he resigned from SAG AG would really be unpleasant. :-)
>>>
>>> And of course, if you are using devices such as SSD's or
>>> thin-provisioned devices, file-system level erasure may not really do
>>> a lot of your anyway, even if you are using discard.
>>>
>>> So --- does anyone have some thoughts about how this would actually
>>> used by potential customers? If not, my vote would be to keep things
>>> as simple as possible, and if it's too complicated, to think carefully
>>> about whether it's worth it to (re)-add this feature.
>>>
>>> - Ted
>>
>> I do think that the synchronous secure delete is useful to have, even if slow.
>>
>> That said, as you point out, there are lots of ways that this will fail
>> potentially, including:
>>
>> * you might have copied a file or had blocks paged out that leave a "ghost"
>> trace
>>
>> * a simple secure delete that overwrites with zeros is not "sufficient" to
>> erase tracks for some users (look at the multi-pass options shred does for
>> example)
>>
>> * ssd's or other devices do wear levelling and move data around internally so
>> you might be able to rip the device apart and look at the raw flash and
>> recover data
>
> This is my concern as well. Allison I think that if you are going to do
> next spin of the patches it would be nice to detect whether the device
> supports secure discard or at least regular discard is zeroing the data,
> because otherwise just plain overwrite with zeroes is not going to help
> at all. SSD will use some other blocks and the data will be still there,
> so there is no point in doing overwrite.
Alrighty, initially the patches did have the discard, but we weren't
sure where the appropriate place to put it was, so we decided to leave
it out and add it as an optimization later. We were thinking it needs
to be where ever the existing code for discard is (which is in the
callback commit), but it sounds now like the discard needs to be where
ever the zero out is happening, so I will add that back in in the next set.
>
> However if you do use discard (not sure what secure discard is actually
> doing) then you're still not guaranteed to have everything wiped out,
> because there might still be some parts of the file in other places of
> the device due the previous rewrites. You'll have to use fitrim to get
> rid of the other parts, but still there might be some data left in the
> user restricted area of the device which the fw uses for better wear-
> leveling.
>
> Thanks!
> -Lukas
Ok then, I will check into fitrim and also to see if there are any
solutions to the wear leveling problem. Thx!
Allison Henderson
>
>>
>> That said, for all normal users, I do think that the zero out is still useful
>> and reasonable. The simple goal is that once securely deleting, your sys
>> admin cannot use recovery tools or scan a block device and see your deleted
>> file's data blocks.
>>
>> Ric
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>
> --