2010-10-21 14:16:02

by Lukas Czerner

[permalink] [raw]
Subject: [PATCH] e2fsck: Discard free data and inode blocks.

In Pass 5 when we are checking block and inode bitmaps we have great
opportunity to discard free space and unused inodes on the device,
because bitmaps has just been verified as valid. This commit takes
advantage of this opportunity and discards both, all free space and
unused inodes.

I have added new option '-K' which when set, disables discard. Also when
the underlying device does not support discard, or BLKDISCARD ioctl
returns any kind of error, or when some errors occurred in bitmaps, the
discard is disabled.

As an addition, when there is any not-yet-zeroed inode table and
discard zeroes data, then inode table is marked as zeroed.

Signed-off-by: Lukas Czerner <[email protected]>
---
e2fsck/e2fsck.8.in | 7 +++
e2fsck/e2fsck.h | 1 +
e2fsck/pass5.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
e2fsck/unix.c | 10 ++++-
4 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/e2fsck/e2fsck.8.in b/e2fsck/e2fsck.8.in
index 3fb15e6..507f7b1 100644
--- a/e2fsck/e2fsck.8.in
+++ b/e2fsck/e2fsck.8.in
@@ -19,6 +19,9 @@ e2fsck \- check a Linux ext2/ext3/ext4 file system
.I blocksize
]
[
+.B \-K
+]
+[
.BR \-l | \-L
.I bad_blocks_file
]
@@ -204,6 +207,10 @@ time trials.
@JDEV@Set the pathname where the external-journal for this filesystem can be
@JDEV@found.
.TP
+.BI \-K
+Keep, do not attempt to discard free blocks and unused inodes (discarding
+blocks is useful on solid state devices and sparse / thin-provisioned storage).
+.TP
.BI \-k
When combined with the
.B \-c
diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h
index d4df5f3..a377246 100644
--- a/e2fsck/e2fsck.h
+++ b/e2fsck/e2fsck.h
@@ -155,6 +155,7 @@ struct resource_track {
#define E2F_OPT_WRITECHECK 0x0200
#define E2F_OPT_COMPRESS_DIRS 0x0400
#define E2F_OPT_FRAGCHECK 0x0800
+#define E2F_OPT_DISCARD 0x1000

/*
* E2fsck flags
diff --git a/e2fsck/pass5.c b/e2fsck/pass5.c
index cbc12f3..a1db499 100644
--- a/e2fsck/pass5.c
+++ b/e2fsck/pass5.c
@@ -10,9 +10,18 @@
*
*/

+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <errno.h>
+
#include "e2fsck.h"
#include "problem.h"

+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
static void check_block_bitmaps(e2fsck_t ctx);
static void check_inode_bitmaps(e2fsck_t ctx);
static void check_inode_end(e2fsck_t ctx);
@@ -64,6 +73,66 @@ void e2fsck_pass5(e2fsck_t ctx)
print_resource_track(ctx, _("Pass 5"), &rtrack, ctx->fs->io);
}

+#ifdef __linux__
+
+#ifndef BLKDISCARD
+#define BLKDISCARD _IO(0x12,119)
+#endif
+
+#ifndef BLKDISCARDZEROES
+#define BLKDISCARDZEROES _IO(0x12,124)
+#endif
+
+static void e2fsck_discard_blocks(e2fsck_t ctx, blk_t start,
+ blk_t count)
+{
+ int fd;
+ int ret = 0;
+ int blocksize;
+ ext2_filsys fs = ctx->fs;
+ __uint64_t range[2];
+
+ blocksize = EXT2_BLOCK_SIZE(fs->super);
+
+ range[0] = (__uint64_t)(start);
+ range[1] = (__uint64_t)(count);
+ range[0] *= (__uint64_t)(blocksize);
+ range[1] *= (__uint64_t)(blocksize);
+
+ fd = open64(fs->device_name, O_RDWR);
+ if (fd < 0) {
+ com_err("open", errno,
+ _("while opening %s for discarding"),
+ ctx->device_name);
+ fatal_error(ctx, 0);
+ }
+
+ ret = ioctl(fd, BLKDISCARD, &range);
+ if (ret)
+ ctx->options &= ~E2F_OPT_DISCARD;
+
+ close(fd);
+}
+
+static int e2fsck_discard_zeroes_data(ext2_filsys fs)
+{
+ int fd;
+ int ret;
+ int discard_zeroes_data = 0;
+
+ fd = open64(fs->device_name, O_RDWR);
+
+ if (fd > 0) {
+ ioctl(fd, BLKDISCARDZEROES, &discard_zeroes_data);
+ close(fd);
+ }
+ return discard_zeroes_data;
+}
+#else
+#define e2fsck_discard_blocks(fs, start, len)
+#define e2fsck_discard_zeroes_data(fs) 0
+#endif
+
#define NO_BLK ((blk64_t) -1)

static void print_bitmap_problem(e2fsck_t ctx, int problem,
@@ -108,6 +177,7 @@ static void check_block_bitmaps(e2fsck_t ctx)
int group = 0;
int blocks = 0;
blk64_t free_blocks = 0;
+ blk64_t first_free = ext2fs_blocks_count(fs->super);
int group_free = 0;
int actual, bitmap;
struct problem_context pctx;
@@ -280,11 +350,24 @@ redo_counts:
}
ctx->flags |= E2F_FLAG_PROG_SUPPRESS;
had_problem++;
+ /*
+ * If there a problem we should turn off the discard so we
+ * do not compromise the filesystem.
+ */
+ ctx->options &= ~E2F_OPT_DISCARD;

do_counts:
if (!bitmap && (!skip_group || csum_flag)) {
group_free++;
free_blocks++;
+ if (first_free > i)
+ first_free = i;
+ } else {
+ if ((i > first_free) &&
+ (ctx->options & E2F_OPT_DISCARD))
+ e2fsck_discard_blocks(ctx, first_free,
+ (i - first_free));
+ first_free = ext2fs_blocks_count(fs->super);
}
blocks ++;
if ((blocks == fs->super->s_blocks_per_group) ||
@@ -500,6 +583,11 @@ redo_counts:
}
ctx->flags |= E2F_FLAG_PROG_SUPPRESS;
had_problem++;
+ /*
+ * If there is a problem we should turn off the discard so we
+ * do not compromise the filesystem.
+ */
+ ctx->options &= ~E2F_OPT_DISCARD;

do_counts:
if (bitmap) {
@@ -509,11 +597,42 @@ do_counts:
group_free++;
free_inodes++;
}
+
inodes++;
if ((inodes == fs->super->s_inodes_per_group) ||
(i == fs->super->s_inodes_count)) {
+ blk64_t used_blks, blk, num;
+
free_array[group] = group_free;
dir_array[group] = dirs_count;
+
+ /* Discard inode table */
+ if (ctx->options & E2F_OPT_DISCARD) {
+ used_blks = DIV_ROUND_UP(
+ (EXT2_INODES_PER_GROUP(fs->super) -
+ group_free),
+ EXT2_INODES_PER_BLOCK(fs->super));
+
+ blk = ext2fs_inode_table_loc(fs, group) +
+ used_blks;
+ num = fs->inode_blocks_per_group -
+ used_blks;
+ e2fsck_discard_blocks(ctx, blk, num);
+ }
+
+ /*
+ * If discard zeroes data and the group inode table
+ * was not zeroed yet, set itable as zeroed
+ */
+ if ((ctx->options & E2F_OPT_DISCARD) &&
+ (e2fsck_discard_zeroes_data(fs)) &&
+ !(ext2fs_bg_flags_test(fs, group,
+ EXT2_BG_INODE_ZEROED))) {
+ ext2fs_bg_flags_set(fs, group,
+ EXT2_BG_INODE_ZEROED);
+ ext2fs_group_desc_csum_set(fs, group);
+ }
+
group ++;
inodes = 0;
skip_group = 0;
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index 7eb269c..15dd204 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -667,7 +667,11 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
ctx->program_name = *argv;
else
ctx->program_name = "e2fsck";
- while ((c = getopt (argc, argv, "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDk")) != EOF)
+
+ /* set defaults */
+ ctx->options |= E2F_OPT_DISCARD;
+
+ while ((c = getopt (argc, argv, "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDkK")) != EOF)
switch (c) {
case 'C':
ctx->progress = e2fsck_update_progress;
@@ -790,6 +794,9 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
case 'k':
keep_bad_blocks++;
break;
+ case 'K':
+ ctx->options &= ~E2F_OPT_DISCARD;
+ break;
default:
usage(ctx);
}
@@ -943,7 +950,6 @@ static errcode_t try_open_fs(e2fsck_t ctx, int flags, io_manager io_ptr,
return retval;
}



2010-10-21 18:07:23

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 2010-10-21, at 08:15, Lukas Czerner wrote:
> In Pass 5 when we are checking block and inode bitmaps we have great
> opportunity to discard free space and unused inodes on the device,
> because bitmaps has just been verified as valid. This commit takes
> advantage of this opportunity and discards both, all free space and
> unused inodes.
>
> I have added new option '-K' which when set, disables discard. Also when
> the underlying device does not support discard, or BLKDISCARD ioctl
> returns any kind of error, or when some errors occurred in bitmaps, the
> discard is disabled.

I'm always a bit nervous with patches like this, that will prevent data recovery after an e2fsck run (which seems like the opposite of what we want from e2fsck).

Two suggestions:
- it probably makes sense to disable this by default, and allow it to be
specified on the command-line and e2fsck.conf
- should we really have a short option, or a "-E discard" and "-E nodiscard"
options, which allow us to change the default easily at some later time
(which we can't do with a single -K flag)

> +static void e2fsck_discard_blocks(e2fsck_t ctx, blk_t start,
> + blk_t count)
> +{
> + fd = open64(fs->device_name, O_RDWR);
> + if (fd < 0) {
> + com_err("open", errno,
> + _("while opening %s for discarding"),
> + ctx->device_name);
> + fatal_error(ctx, 0);
> + }
> +
> + ret = ioctl(fd, BLKDISCARD, &range);
> + if (ret)
> + ctx->options &= ~E2F_OPT_DISCARD;
> +
> + close(fd);
> +}

If we are calling this ioctl for a lot of small block ranges, doing an open/close for each one could add significant overhead. The unix struct_io_manager already has an open file descriptor for this block device, maybe it is better to encapsulate this operation there? The ioctl also doesn't make sense for non-Linux platforms (though they may have a different ioctl that is equivalent) so that may be a better solution.

(defect) It makes sense to start with a blk64_t for this function, instead of a blk_t that needs to be fixed immediately for > 16TB filesystems, or the block number will be truncated and accidentally discard the wrong data. Oops.


Cheers, Andreas






2010-10-22 09:12:52

by Lukas Czerner

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On Thu, 21 Oct 2010, Andreas Dilger wrote:

> On 2010-10-21, at 08:15, Lukas Czerner wrote:
> > In Pass 5 when we are checking block and inode bitmaps we have great
> > opportunity to discard free space and unused inodes on the device,
> > because bitmaps has just been verified as valid. This commit takes
> > advantage of this opportunity and discards both, all free space and
> > unused inodes.
> >
> > I have added new option '-K' which when set, disables discard. Also when
> > the underlying device does not support discard, or BLKDISCARD ioctl
> > returns any kind of error, or when some errors occurred in bitmaps, the
> > discard is disabled.
>
> I'm always a bit nervous with patches like this, that will prevent data recovery after an e2fsck run (which seems like the opposite of what we want from e2fsck).
>
> Two suggestions:
> - it probably makes sense to disable this by default, and allow it to be
> specified on the command-line and e2fsck.conf
> - should we really have a short option, or a "-E discard" and "-E nodiscard"
> options, which allow us to change the default easily at some later time
> (which we can't do with a single -K flag)

Right, I agree it would be probably better to disable this by default.

>
> > +static void e2fsck_discard_blocks(e2fsck_t ctx, blk_t start,
> > + blk_t count)
> > +{
> > + fd = open64(fs->device_name, O_RDWR);
> > + if (fd < 0) {
> > + com_err("open", errno,
> > + _("while opening %s for discarding"),
> > + ctx->device_name);
> > + fatal_error(ctx, 0);
> > + }
> > +
> > + ret = ioctl(fd, BLKDISCARD, &range);
> > + if (ret)
> > + ctx->options &= ~E2F_OPT_DISCARD;
> > +
> > + close(fd);
> > +}
>
> If we are calling this ioctl for a lot of small block ranges, doing an open/close for each one could add significant overhead. The unix struct_io_manager already has an open file descriptor for this block device, maybe it is better to encapsulate this operation there? The ioctl also doesn't make sense for non-Linux platforms (though they may have a different ioctl that is equivalent) so that may be a better solution.

That is why the #ifdef __linux__ is there. I agree with using
struct_io_manager file descriptor.

>
> (defect) It makes sense to start with a blk64_t for this function, instead of a blk_t that needs to be fixed immediately for > 16TB filesystems, or the block number will be truncated and accidentally discard the wrong data. Oops.

Oh, I have missed that out.

>
>
> Cheers, Andreas

Thanks for review Andreas.

-Lukas

2010-10-22 11:29:33

by Ric Wheeler

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 10/22/2010 05:12 AM, Lukas Czerner wrote:
> On Thu, 21 Oct 2010, Andreas Dilger wrote:
>
>> On 2010-10-21, at 08:15, Lukas Czerner wrote:
>>> In Pass 5 when we are checking block and inode bitmaps we have great
>>> opportunity to discard free space and unused inodes on the device,
>>> because bitmaps has just been verified as valid. This commit takes
>>> advantage of this opportunity and discards both, all free space and
>>> unused inodes.
>>>
>>> I have added new option '-K' which when set, disables discard. Also when
>>> the underlying device does not support discard, or BLKDISCARD ioctl
>>> returns any kind of error, or when some errors occurred in bitmaps, the
>>> discard is disabled.
>> I'm always a bit nervous with patches like this, that will prevent data recovery after an e2fsck run (which seems like the opposite of what we want from e2fsck).
>>
>> Two suggestions:
>> - it probably makes sense to disable this by default, and allow it to be
>> specified on the command-line and e2fsck.conf
>> - should we really have a short option, or a "-E discard" and "-E nodiscard"
>> options, which allow us to change the default easily at some later time
>> (which we can't do with a single -K flag)
> Right, I agree it would be probably better to disable this by default.
>
>

If we do disable it by default, I think that we might also want to be consistent
and disable the discard support in mkfs by default as well?

thanks!

Ric


2010-10-22 11:43:27

by Lukas Czerner

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On Fri, 22 Oct 2010, Ric Wheeler wrote:

> On 10/22/2010 05:12 AM, Lukas Czerner wrote:
> > On Thu, 21 Oct 2010, Andreas Dilger wrote:
> >
> > > On 2010-10-21, at 08:15, Lukas Czerner wrote:
> > > > In Pass 5 when we are checking block and inode bitmaps we have great
> > > > opportunity to discard free space and unused inodes on the device,
> > > > because bitmaps has just been verified as valid. This commit takes
> > > > advantage of this opportunity and discards both, all free space and
> > > > unused inodes.
> > > >
> > > > I have added new option '-K' which when set, disables discard. Also when
> > > > the underlying device does not support discard, or BLKDISCARD ioctl
> > > > returns any kind of error, or when some errors occurred in bitmaps, the
> > > > discard is disabled.
> > > I'm always a bit nervous with patches like this, that will prevent data
> > > recovery after an e2fsck run (which seems like the opposite of what we
> > > want from e2fsck).
> > >
> > > Two suggestions:
> > > - it probably makes sense to disable this by default, and allow it to be
> > > specified on the command-line and e2fsck.conf
> > > - should we really have a short option, or a "-E discard" and "-E
> > > nodiscard"
> > > options, which allow us to change the default easily at some later time
> > > (which we can't do with a single -K flag)
> > Right, I agree it would be probably better to disable this by default.
> >
> >
>
> If we do disable it by default, I think that we might also want to be
> consistent and disable the discard support in mkfs by default as well?
>
> thanks!
>
> Ric
>

I think that this will not be necessary. There is a concern that it might
prevent data recovery after fsck because it might be already discarded
(some weird fs corruption?) in pass 5. However in my opinion this is a
very small window (if there even is any), because we have already passed
check 1-4 and we have just confirmed that group descriptors should be ok.
But when there is an even slight chance this might happen I would suggest
that we really disable it by default (at least for a while - we will see
then).

On the other hand there is nothing to be afraid of in the case of mkfs,
because we can not possibly lose any relevant data, because discard is
done before the filesystem gets created.

-Lukas

2010-10-22 14:12:21

by Ric Wheeler

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 10/22/2010 07:43 AM, Lukas Czerner wrote:
> On Fri, 22 Oct 2010, Ric Wheeler wrote:
>
>> On 10/22/2010 05:12 AM, Lukas Czerner wrote:
>>> On Thu, 21 Oct 2010, Andreas Dilger wrote:
>>>
>>>> On 2010-10-21, at 08:15, Lukas Czerner wrote:
>>>>> In Pass 5 when we are checking block and inode bitmaps we have great
>>>>> opportunity to discard free space and unused inodes on the device,
>>>>> because bitmaps has just been verified as valid. This commit takes
>>>>> advantage of this opportunity and discards both, all free space and
>>>>> unused inodes.
>>>>>
>>>>> I have added new option '-K' which when set, disables discard. Also when
>>>>> the underlying device does not support discard, or BLKDISCARD ioctl
>>>>> returns any kind of error, or when some errors occurred in bitmaps, the
>>>>> discard is disabled.
>>>> I'm always a bit nervous with patches like this, that will prevent data
>>>> recovery after an e2fsck run (which seems like the opposite of what we
>>>> want from e2fsck).
>>>>
>>>> Two suggestions:
>>>> - it probably makes sense to disable this by default, and allow it to be
>>>> specified on the command-line and e2fsck.conf
>>>> - should we really have a short option, or a "-E discard" and "-E
>>>> nodiscard"
>>>> options, which allow us to change the default easily at some later time
>>>> (which we can't do with a single -K flag)
>>> Right, I agree it would be probably better to disable this by default.
>>>
>>>
>> If we do disable it by default, I think that we might also want to be
>> consistent and disable the discard support in mkfs by default as well?
>>
>> thanks!
>>
>> Ric
>>
> I think that this will not be necessary. There is a concern that it might
> prevent data recovery after fsck because it might be already discarded
> (some weird fs corruption?) in pass 5. However in my opinion this is a
> very small window (if there even is any), because we have already passed
> check 1-4 and we have just confirmed that group descriptors should be ok.
> But when there is an even slight chance this might happen I would suggest
> that we really disable it by default (at least for a while - we will see
> then).
>
> On the other hand there is nothing to be afraid of in the case of mkfs,
> because we can not possibly lose any relevant data, because discard is
> done before the filesystem gets created.
>
> -Lukas

My concern with mkfs is that we have seen several devices which don't handle
this well.

We will be using this TRIM (or UNMAP, etc) on lots of old, creaky hardware with
old firmware, so having it try on all devices is almost certainly going to cause
breakages, hangs, etc in the field....

Ric


2010-10-22 14:32:59

by Lukas Czerner

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On Fri, 22 Oct 2010, Ric Wheeler wrote:

> On 10/22/2010 07:43 AM, Lukas Czerner wrote:
> > On Fri, 22 Oct 2010, Ric Wheeler wrote:
> >
> > > On 10/22/2010 05:12 AM, Lukas Czerner wrote:
> > > > On Thu, 21 Oct 2010, Andreas Dilger wrote:
> > > >
> > > > > On 2010-10-21, at 08:15, Lukas Czerner wrote:
> > > > > > In Pass 5 when we are checking block and inode bitmaps we have great
> > > > > > opportunity to discard free space and unused inodes on the device,
> > > > > > because bitmaps has just been verified as valid. This commit takes
> > > > > > advantage of this opportunity and discards both, all free space and
> > > > > > unused inodes.
> > > > > >
> > > > > > I have added new option '-K' which when set, disables discard. Also
> > > > > > when
> > > > > > the underlying device does not support discard, or BLKDISCARD ioctl
> > > > > > returns any kind of error, or when some errors occurred in bitmaps,
> > > > > > the
> > > > > > discard is disabled.
> > > > > I'm always a bit nervous with patches like this, that will prevent
> > > > > data
> > > > > recovery after an e2fsck run (which seems like the opposite of what we
> > > > > want from e2fsck).
> > > > >
> > > > > Two suggestions:
> > > > > - it probably makes sense to disable this by default, and allow it to
> > > > > be
> > > > > specified on the command-line and e2fsck.conf
> > > > > - should we really have a short option, or a "-E discard" and "-E
> > > > > nodiscard"
> > > > > options, which allow us to change the default easily at some later
> > > > > time
> > > > > (which we can't do with a single -K flag)
> > > > Right, I agree it would be probably better to disable this by default.
> > > >
> > > >
> > > If we do disable it by default, I think that we might also want to be
> > > consistent and disable the discard support in mkfs by default as well?
> > >
> > > thanks!
> > >
> > > Ric
> > >
> > I think that this will not be necessary. There is a concern that it might
> > prevent data recovery after fsck because it might be already discarded
> > (some weird fs corruption?) in pass 5. However in my opinion this is a
> > very small window (if there even is any), because we have already passed
> > check 1-4 and we have just confirmed that group descriptors should be ok.
> > But when there is an even slight chance this might happen I would suggest
> > that we really disable it by default (at least for a while - we will see
> > then).
> >
> > On the other hand there is nothing to be afraid of in the case of mkfs,
> > because we can not possibly lose any relevant data, because discard is
> > done before the filesystem gets created.
> >
> > -Lukas
>
> My concern with mkfs is that we have seen several devices which don't handle
> this well.
>
> We will be using this TRIM (or UNMAP, etc) on lots of old, creaky hardware
> with old firmware, so having it try on all devices is almost certainly going
> to cause breakages, hangs, etc in the field....
>
> Ric
>

Well, so far the only breakages I have seen was with lots of small TRIMs
(or UNMAPs, etc) issued in random pattern, never in case of mkfs which
is quite a opposite - big sequential ranges.

Hangs should be covered by those two patches:

http://marc.info/?l=linux-ext4&m=128774558623608&w=2
http://marc.info/?l=linux-ext4&m=128767099123375&w=2

if, of course, they get upstream. Also there is a big win, when discard
also zeroes data, because in that case we can just skip inode table
initialization (zeroing) without any need of in-kernel lazyinit code
enabled. And we get all this for free. It was introduced with Sandeens
patch:

http://marc.info/?l=linux-ext4&m=128234048208327&w=2

So, I would rather leave it on by default.

-Lukas

2010-10-22 14:46:13

by Ric Wheeler

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 10/22/2010 10:32 AM, Lukas Czerner wrote:
> On Fri, 22 Oct 2010, Ric Wheeler wrote:
>
>> On 10/22/2010 07:43 AM, Lukas Czerner wrote:
>>> On Fri, 22 Oct 2010, Ric Wheeler wrote:
>>>
>>>> On 10/22/2010 05:12 AM, Lukas Czerner wrote:
>>>>> On Thu, 21 Oct 2010, Andreas Dilger wrote:
>>>>>
>>>>>> On 2010-10-21, at 08:15, Lukas Czerner wrote:
>>>>>>> In Pass 5 when we are checking block and inode bitmaps we have great
>>>>>>> opportunity to discard free space and unused inodes on the device,
>>>>>>> because bitmaps has just been verified as valid. This commit takes
>>>>>>> advantage of this opportunity and discards both, all free space and
>>>>>>> unused inodes.
>>>>>>>
>>>>>>> I have added new option '-K' which when set, disables discard. Also
>>>>>>> when
>>>>>>> the underlying device does not support discard, or BLKDISCARD ioctl
>>>>>>> returns any kind of error, or when some errors occurred in bitmaps,
>>>>>>> the
>>>>>>> discard is disabled.
>>>>>> I'm always a bit nervous with patches like this, that will prevent
>>>>>> data
>>>>>> recovery after an e2fsck run (which seems like the opposite of what we
>>>>>> want from e2fsck).
>>>>>>
>>>>>> Two suggestions:
>>>>>> - it probably makes sense to disable this by default, and allow it to
>>>>>> be
>>>>>> specified on the command-line and e2fsck.conf
>>>>>> - should we really have a short option, or a "-E discard" and "-E
>>>>>> nodiscard"
>>>>>> options, which allow us to change the default easily at some later
>>>>>> time
>>>>>> (which we can't do with a single -K flag)
>>>>> Right, I agree it would be probably better to disable this by default.
>>>>>
>>>>>
>>>> If we do disable it by default, I think that we might also want to be
>>>> consistent and disable the discard support in mkfs by default as well?
>>>>
>>>> thanks!
>>>>
>>>> Ric
>>>>
>>> I think that this will not be necessary. There is a concern that it might
>>> prevent data recovery after fsck because it might be already discarded
>>> (some weird fs corruption?) in pass 5. However in my opinion this is a
>>> very small window (if there even is any), because we have already passed
>>> check 1-4 and we have just confirmed that group descriptors should be ok.
>>> But when there is an even slight chance this might happen I would suggest
>>> that we really disable it by default (at least for a while - we will see
>>> then).
>>>
>>> On the other hand there is nothing to be afraid of in the case of mkfs,
>>> because we can not possibly lose any relevant data, because discard is
>>> done before the filesystem gets created.
>>>
>>> -Lukas
>> My concern with mkfs is that we have seen several devices which don't handle
>> this well.
>>
>> We will be using this TRIM (or UNMAP, etc) on lots of old, creaky hardware
>> with old firmware, so having it try on all devices is almost certainly going
>> to cause breakages, hangs, etc in the field....
>>
>> Ric
>>
> Well, so far the only breakages I have seen was with lots of small TRIMs
> (or UNMAPs, etc) issued in random pattern, never in case of mkfs which
> is quite a opposite - big sequential ranges.
>
> Hangs should be covered by those two patches:
>
> http://marc.info/?l=linux-ext4&m=128774558623608&w=2
> http://marc.info/?l=linux-ext4&m=128767099123375&w=2
>
> if, of course, they get upstream. Also there is a big win, when discard
> also zeroes data, because in that case we can just skip inode table
> initialization (zeroing) without any need of in-kernel lazyinit code
> enabled. And we get all this for free. It was introduced with Sandeens
> patch:
>
> http://marc.info/?l=linux-ext4&m=128234048208327&w=2
>
> So, I would rather leave it on by default.
>
> -Lukas

You cannot 100% depend on discard zeroing blocks - that is not a universal
requirement of devices that support it. Specifically, for ATA devices, I think
that there are optional bits that specify how a device will behave when you read
from a trimmed region.

Ric


2010-10-22 15:38:01

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

Ric Wheeler wrote:

...

>> Well, so far the only breakages I have seen was with lots of small TRIMs
>> (or UNMAPs, etc) issued in random pattern, never in case of mkfs which
>> is quite a opposite - big sequential ranges.
>>
>> Hangs should be covered by those two patches:
>>
>> http://marc.info/?l=linux-ext4&m=128774558623608&w=2
>> http://marc.info/?l=linux-ext4&m=128767099123375&w=2
>>
>> if, of course, they get upstream. Also there is a big win, when discard
>> also zeroes data, because in that case we can just skip inode table
>> initialization (zeroing) without any need of in-kernel lazyinit code
>> enabled. And we get all this for free. It was introduced with Sandeens
>> patch:
>>
>> http://marc.info/?l=linux-ext4&m=128234048208327&w=2
>>
>> So, I would rather leave it on by default.
>>
>> -Lukas
>
> You cannot 100% depend on discard zeroing blocks - that is not a
> universal requirement of devices that support it. Specifically, for ATA
> devices, I think that there are optional bits that specify how a device
> will behave when you read from a trimmed region.

But don't we have the ability to test whether discard -does- zero blocks,
as advertised by the device? And honestly if the device mis-reports, that
sounds like a device vendor problem to fix.

The proposal wasn't to discard and assume zero, but to check for that
behavior:

http://kerneltrap.org/mailarchive/linux-ext4/2010/9/21/6885628/thread

+ if (!retval && mke2fs_discard_zeroes_data(fs)) {
+ if (verbose)
+ printf(_("Discard succeeded and will return 0s "
+ " - enabling lazy_itable_init\n"));
+ lazy_itable_init = 1;
+ lazy_itable_zeroed = 1;
+ }

so we're not depending on it zeroing blocks, we're just depending on it
advertising correctly whether or not it -does- zero.

-Eric

> Ric
>


2010-10-22 15:41:11

by Ric Wheeler

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 10/22/2010 11:37 AM, Eric Sandeen wrote:
> Ric Wheeler wrote:
>
> ...
>
>>> Well, so far the only breakages I have seen was with lots of small TRIMs
>>> (or UNMAPs, etc) issued in random pattern, never in case of mkfs which
>>> is quite a opposite - big sequential ranges.
>>>
>>> Hangs should be covered by those two patches:
>>>
>>> http://marc.info/?l=linux-ext4&m=128774558623608&w=2
>>> http://marc.info/?l=linux-ext4&m=128767099123375&w=2
>>>
>>> if, of course, they get upstream. Also there is a big win, when discard
>>> also zeroes data, because in that case we can just skip inode table
>>> initialization (zeroing) without any need of in-kernel lazyinit code
>>> enabled. And we get all this for free. It was introduced with Sandeens
>>> patch:
>>>
>>> http://marc.info/?l=linux-ext4&m=128234048208327&w=2
>>>
>>> So, I would rather leave it on by default.
>>>
>>> -Lukas
>> You cannot 100% depend on discard zeroing blocks - that is not a
>> universal requirement of devices that support it. Specifically, for ATA
>> devices, I think that there are optional bits that specify how a device
>> will behave when you read from a trimmed region.
> But don't we have the ability to test whether discard -does- zero blocks,
> as advertised by the device? And honestly if the device mis-reports, that
> sounds like a device vendor problem to fix.
>
> The proposal wasn't to discard and assume zero, but to check for that
> behavior:
>
> http://kerneltrap.org/mailarchive/linux-ext4/2010/9/21/6885628/thread
>
> + if (!retval&& mke2fs_discard_zeroes_data(fs)) {
> + if (verbose)
> + printf(_("Discard succeeded and will return 0s "
> + " - enabling lazy_itable_init\n"));
> + lazy_itable_init = 1;
> + lazy_itable_zeroed = 1;
> + }
>
> so we're not depending on it zeroing blocks, we're just depending on it
> advertising correctly whether or not it -does- zero.
>
> -Eric
>
>

I think that ATA devices have historically not done this correctly, but the T13
committee is working on it. The question is whether the bit we check and rely on
has the right semantics (and then if the device will reliably implement this).

Historically, array vendors did rely on SCSI commands like the old fashioned
"WRITE_SAME" to initialize storage for them, but that takes a *long* time to run :)

Ric



2010-10-22 17:04:25

by Martin K. Petersen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

>>>>> "Ric" == Ric Wheeler <[email protected]> writes:

>> so we're not depending on it zeroing blocks, we're just depending on
>> it advertising correctly whether or not it -does- zero.

If the relevant bits are set (ATA: DRAT and RZAT, SCSI: TPRZ) we'll set
the bdev's discard_zeroes_data flag.

[root@test ~]# lsscsi | grep SSD | awk '{ print $7 }'
/dev/sde
[root@test ~]# grep . /sys/block/sde/queue/discard_zeroes_data
1

The relevant ioctl is BLKDISCARDZEROES.


Ric> I think that ATA devices have historically not done this correctly,

I'm only aware of one drive that advertised RZAT and got it wrong. I
believe a firmware update fixed it.

Generally we assume that if the firmware writers go through the effort
of reporting things correctly then they have also implemented the
feature. We have quite a few sanity checks in place in libata so we
won't trigger if the firmware guys just put all ones in a word, for
instance. There are several things that need to line up for us to
actually set the discard flags.


PS. http://oss.oracle.com/~mkp/docs/linux-advanced-storage.pdf

--
Martin K. Petersen Oracle Linux Engineering

2010-10-22 17:14:19

by Ric Wheeler

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 10/22/2010 01:03 PM, Martin K. Petersen wrote:
>>>>>> "Ric" == Ric Wheeler<[email protected]> writes:
>>> so we're not depending on it zeroing blocks, we're just depending on
>>> it advertising correctly whether or not it -does- zero.
> If the relevant bits are set (ATA: DRAT and RZAT, SCSI: TPRZ) we'll set
> the bdev's discard_zeroes_data flag.
>
> [root@test ~]# lsscsi | grep SSD | awk '{ print $7 }'
> /dev/sde
> [root@test ~]# grep . /sys/block/sde/queue/discard_zeroes_data
> 1
>
> The relevant ioctl is BLKDISCARDZEROES.
>
>
> Ric> I think that ATA devices have historically not done this correctly,
>
> I'm only aware of one drive that advertised RZAT and got it wrong. I
> believe a firmware update fixed it.
>
> Generally we assume that if the firmware writers go through the effort
> of reporting things correctly then they have also implemented the
> feature. We have quite a few sanity checks in place in libata so we
> won't trigger if the firmware guys just put all ones in a word, for
> instance. There are several things that need to line up for us to
> actually set the discard flags.
>
>
> PS. http://oss.oracle.com/~mkp/docs/linux-advanced-storage.pdf
>

Hi Martin,

I think that the T13 people are still reworking (or have just finished?) the
spec. For example,

http://www.t13.org/documents/UploadedDocuments/docs2010/e09158r2-Trim_Clarifications.pdf

has more detail on how to interpret those bits. I certainly cannot claim to
have followed all of this recently, but do recall that early on the ATA devices
specifically had various behaviors that caused concern.

Ric


2010-10-22 17:31:43

by Martin K. Petersen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

>>>>> "Ric" == Ric Wheeler <[email protected]> writes:

Ric> I think that the T13 people are still reworking (or have just
Ric> finished?) the spec. For example,

ACS-2 is pretty much set in stone (it's in ballot). The trim bits
haven't changed for quite a while and we're compliant with the latest
draft.

--
Martin K. Petersen Oracle Linux Engineering

2010-10-22 17:50:43

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 2010-10-22, at 08:46, Ric Wheeler wrote:
> On 10/22/2010 10:32 AM, Lukas Czerner wrote:
>>
>> Also there is a big win, when discard
>> also zeroes data, because in that case we can just skip inode table
>> initialization (zeroing) without any need of in-kernel lazyinit code
>> enabled. And we get all this for free. It was introduced with Sandeens
>> patch:
>>
>> http://marc.info/?l=linux-ext4&m=128234048208327&w=2
>>
>> So, I would rather leave it on by default.
>
> You cannot 100% depend on discard zeroing blocks - that is not a universal requirement of devices that support it. Specifically, for ATA devices, I think that there are optional bits that specify how a device will behave when you read from a trimmed region.

That patch also checks for the zeroing feature. When this patch was first under discussion, I proposed that we validate that the device is actually zeroed by doing a write a non-zero block to the disk and then calling discard+zero for that region, and reading back the block and verifying it.

Eric wasn't convinced that was necessary, maybe you can convince him more...

Cheers, Andreas






2010-10-22 18:00:49

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 2010-10-22, at 08:32, Lukas Czerner wrote:
>>> There is a concern that discard might
>>> prevent data recovery after fsck because it might be already discarded
>>> (some weird fs corruption?) in pass 5. However in my opinion this is a
>>> very small window (if there even is any), because we have already passed
>>> check 1-4 and we have just confirmed that group descriptors should be ok.

I don't totally agree. When users have a serious filesystem problem, the first thing they normally do is run e2fsck to see if it is corrected (it may even be done automatically at boot after errors=panic causing a reboot.

After that, they may want to recover some more data (e.g. with ext3grep, or restore an e2image of the metadata, and re-run e2fsck). If e2fsck will discard all of the data then any data recovery will be impossible.

>>> On the other hand there is nothing to be afraid of in the case of mkfs,
>>> because we can not possibly lose any relevant data, because discard is
>>> done before the filesystem gets created.

Well, I've worked several times with users that have accidentally repartitioned and/or reformatted over top of their important data, and it is usually possible to recover some data from this. Again, with discard that would be impossible.

I agree that with mke2fs there is less often a need to do that, and the normal intent of mke2fs is to destroy the previously-existing data, which is why I don't totally object to allowing discard by default for it. The intent of e2fsck is different however, since it is usually used for recovery purposes.

I'm trying to think of a good heuristic for when discard could be chosen automatically, but have a hard time doing so:

- the current heuristic of "no block bitmap errors" is not strong enough...
- even a completely clean e2fsck is not sufficient, because sometimes/often
in the case of serious corruption a second e2fsck is run to ensure all
the problems have been fixed
- after "N" consecutive clean e2fsck runs might be enough, but we don't have
a counter for that today (but it isn't hard to add, if we are changing the
e2fsck code anyway). That said, e2fsck is run so rarely on some systems
that this would equate to "never" in some cases


Cheers, Andreas






2010-10-22 18:02:00

by Lukas Czerner

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On Fri, 22 Oct 2010, Andreas Dilger wrote:

> On 2010-10-22, at 08:46, Ric Wheeler wrote:
> > On 10/22/2010 10:32 AM, Lukas Czerner wrote:
> >>
> >> Also there is a big win, when discard
> >> also zeroes data, because in that case we can just skip inode table
> >> initialization (zeroing) without any need of in-kernel lazyinit code
> >> enabled. And we get all this for free. It was introduced with Sandeens
> >> patch:
> >>
> >> http://marc.info/?l=linux-ext4&m=128234048208327&w=2
> >>
> >> So, I would rather leave it on by default.
> >
> > You cannot 100% depend on discard zeroing blocks - that is not a universal requirement of devices that support it. Specifically, for ATA devices, I think that there are optional bits that specify how a device will behave when you read from a trimmed region.
>
> That patch also checks for the zeroing feature. When this patch was first under discussion, I proposed that we validate that the device is actually zeroed by doing a write a non-zero block to the disk and then calling discard+zero for that region, and reading back the block and verifying it.
>
> Eric wasn't convinced that was necessary, maybe you can convince him more...
>
> Cheers, Andreas

One of the counter arguments was, that some devices does not preserve
this behavior through power cycles. I think Ted was the one talking
about that.

-Lukas

2010-10-22 18:17:33

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 2010-10-22, at 12:01, Lukas Czerner wrote:
>> That patch also checks for the zeroing feature. When this patch was first under discussion, I proposed that we validate that the device is actually zeroed by doing a write a non-zero block to the disk and then calling discard+zero for that region, and reading back the block and verifying it.
>>
>> Eric wasn't convinced that was necessary, maybe you can convince him more...
>
> One of the counter arguments was, that some devices does not preserve
> this behavior through power cycles. I think Ted was the one talking
> about that.

Sure, I don't think we can handle every pathology, but doing a write/discard/read of a few blocks (when it has the potential to avoid many GB of writes for zeroing) is surely easy and worthwhile?

In any case, I thought that discussion was about a device that didn't report BLKDISCARDSZEROES=1, but only that a normal DISCARD would read back zero until the next restart? That prevents optimizations like "read until we see non-zero data, then start writing zeroes", which would still be faster for many RAID devices (or older kernels that don't have DISCARD/ZERO support at all).

Cheers, Andreas






2010-10-22 18:23:22

by Ric Wheeler

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On 10/22/2010 02:17 PM, Andreas Dilger wrote:
> On 2010-10-22, at 12:01, Lukas Czerner wrote:
>>> That patch also checks for the zeroing feature. When this patch was first under discussion, I proposed that we validate that the device is actually zeroed by doing a write a non-zero block to the disk and then calling discard+zero for that region, and reading back the block and verifying it.
>>>
>>> Eric wasn't convinced that was necessary, maybe you can convince him more...
>> One of the counter arguments was, that some devices does not preserve
>> this behavior through power cycles. I think Ted was the one talking
>> about that.
> Sure, I don't think we can handle every pathology, but doing a write/discard/read of a few blocks (when it has the potential to avoid many GB of writes for zeroing) is surely easy and worthwhile?
>
> In any case, I thought that discussion was about a device that didn't report BLKDISCARDSZEROES=1, but only that a normal DISCARD would read back zero until the next restart? That prevents optimizations like "read until we see non-zero data, then start writing zeroes", which would still be faster for many RAID devices (or older kernels that don't have DISCARD/ZERO support at all).
>
> Cheers, Andreas

Just to further confuse things, if we just want to zero a device, there is the
(relatively old) WRITE_SAME command that arrays use. Note that it is quite a bit
faster than doing this from the server since you only transfer over one block of
data and the disk firmware does the rest - no data transfer for each block once
you start.

It can certainly take a long, long time, but would be faster than zeroing a
drive with write() system calls :)

ric


2010-10-22 18:23:44

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

Martin K. Petersen wrote:
>>>>>> "Ric" == Ric Wheeler <[email protected]> writes:
>
>>> so we're not depending on it zeroing blocks, we're just depending on
>>> it advertising correctly whether or not it -does- zero.
>
> If the relevant bits are set (ATA: DRAT and RZAT, SCSI: TPRZ) we'll set
> the bdev's discard_zeroes_data flag.
>
> [root@test ~]# lsscsi | grep SSD | awk '{ print $7 }'
> /dev/sde
> [root@test ~]# grep . /sys/block/sde/queue/discard_zeroes_data
> 1
>
> The relevant ioctl is BLKDISCARDZEROES.

which is what that patch I pointed to uses, BTW...

I really think it's on firm footing here, we're not assuming
discard == zeroed data, we are checking whether the drive
reports exactly that behavior ...

-Eric

2010-10-22 18:27:26

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

Andreas Dilger wrote:
> On 2010-10-22, at 08:32, Lukas Czerner wrote:
>>>> There is a concern that discard might prevent data recovery
>>>> after fsck because it might be already discarded (some weird fs
>>>> corruption?) in pass 5. However in my opinion this is a very
>>>> small window (if there even is any), because we have already
>>>> passed check 1-4 and we have just confirmed that group
>>>> descriptors should be ok.
>
> I don't totally agree. When users have a serious filesystem problem,
> the first thing they normally do is run e2fsck to see if it is
> corrected (it may even be done automatically at boot after
> errors=panic causing a reboot.
>
> After that, they may want to recover some more data (e.g. with
> ext3grep, or restore an e2image of the metadata, and re-run e2fsck).
> If e2fsck will discard all of the data then any data recovery will be
> impossible.

Could set it to only issue discard when the check was clean....

It could still be an option of course, but that might be safer still.

-Eric

2010-10-22 18:29:47

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

Andreas Dilger wrote:
> On 2010-10-22, at 08:46, Ric Wheeler wrote:
>> On 10/22/2010 10:32 AM, Lukas Czerner wrote:
>>> Also there is a big win, when discard also zeroes data, because
>>> in that case we can just skip inode table initialization
>>> (zeroing) without any need of in-kernel lazyinit code enabled.
>>> And we get all this for free. It was introduced with Sandeens
>>> patch:
>>>
>>> http://marc.info/?l=linux-ext4&m=128234048208327&w=2
>>>
>>> So, I would rather leave it on by default.
>> You cannot 100% depend on discard zeroing blocks - that is not a
>> universal requirement of devices that support it. Specifically, for
>> ATA devices, I think that there are optional bits that specify how
>> a device will behave when you read from a trimmed region.
>
> That patch also checks for the zeroing feature. When this patch was
> first under discussion, I proposed that we validate that the device
> is actually zeroed by doing a write a non-zero block to the disk and
> then calling discard+zero for that region, and reading back the block
> and verifying it.
>
> Eric wasn't convinced that was necessary, maybe you can convince him
> more...

*grin* I wouldn't -veto- it, but I think we have to maintain a balance
between defensive programming and trying to handle every possible piece
of buggy hardware by doing too many extra gyrations in the code...

-Eric

> Cheers, Andreas


2010-10-22 18:31:53

by Lukas Czerner

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

On Fri, 22 Oct 2010, Eric Sandeen wrote:

> Andreas Dilger wrote:
> > On 2010-10-22, at 08:32, Lukas Czerner wrote:
> >>>> There is a concern that discard might prevent data recovery
> >>>> after fsck because it might be already discarded (some weird fs
> >>>> corruption?) in pass 5. However in my opinion this is a very
> >>>> small window (if there even is any), because we have already
> >>>> passed check 1-4 and we have just confirmed that group
> >>>> descriptors should be ok.
> >
> > I don't totally agree. When users have a serious filesystem problem,
> > the first thing they normally do is run e2fsck to see if it is
> > corrected (it may even be done automatically at boot after
> > errors=panic causing a reboot.
> >
> > After that, they may want to recover some more data (e.g. with
> > ext3grep, or restore an e2image of the metadata, and re-run e2fsck).
> > If e2fsck will discard all of the data then any data recovery will be
> > impossible.
>
> Could set it to only issue discard when the check was clean....
>
> It could still be an option of course, but that might be safer still.
>
> -Eric
>

Well, this is the way I am doing it now: in pass 5 (which is the last
one), after the group descriptors has been checked, so there is an
assumption that it should be clean. But I agree, that it should be off
by default in fsck.

-Lukas

2010-10-22 21:02:11

by Martin K. Petersen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

>>>>> "Andreas" == Andreas Dilger <[email protected]> writes:

Andreas> When this patch was first under discussion, I proposed that we
Andreas> validate that the device is actually zeroed by doing a write a
Andreas> non-zero block to the disk and then calling discard+zero for
Andreas> that region, and reading back the block and verifying it.

The problem is that the devices that get this wrong don't
deterministically return zeroes. So they may return zeroes for the
discarded test block but garbage for the stuff you actually depend on
being cleared.

--
Martin K. Petersen Oracle Linux Engineering

2010-10-22 21:20:54

by Martin K. Petersen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: Discard free data and inode blocks.

>>>>> "Ric" == Ric Wheeler <[email protected]> writes:

Ric> Just to further confuse things, if we just want to zero a device,
Ric> there is the (relatively old) WRITE_SAME command that arrays
Ric> use. Note that it is quite a bit faster than doing this from the
Ric> server since you only transfer over one block of data and the disk
Ric> firmware does the rest - no data transfer for each block once you
Ric> start.

Ric> It can certainly take a long, long time, but would be faster than
Ric> zeroing a drive with write() system calls :)

I took some stabs at this in the spring. And while it looked like a good
idea on paper it turned out not to be a huge win unless the FC link was
heavily congested due to traffic to other devices.

First of all many drives have a cap on the maximum number of blocks
that can be written using one WRITE SAME command. Typically you can only
write 16-32 megs at a time. So I needed to have a bunch of magic to
scale down and retry while attempting to find the sweet spot.

Fred tried to convince T10 that it would be nice to have a field in the
block limits VPD that would indicate the max WRITE SAME blocks a device
supported. But T10 thought that was a bad idea and the proposal was
rejected. Otherwise I would have wired that up and we could have handled
generic WRITE SAME like we do the discard case.

The other problem is that the WRITE SAME may take a looong time. And so
we need special timeouts in place to prevent regular error handling from
kicking in while the drive is busy wiping stuff.

I guess we could just pick a number (16 MB, maybe) and define that as
the max. Picking a low number also has the benefit of being less likely
to interfere with timeouts.

If there's interest I'll be happy to revisit my patches...

--
Martin K. Petersen Oracle Linux Engineering