2017-08-21 23:23:18

by Doug Porter

[permalink] [raw]
Subject: [PATCH] e2fsck: improve performance of region_allocate()

Use a red-black tree to track allocations instead of a linked list.
This provides a substantial performance boost when the number of
allocations in a region is large. This change resulted in a reduction
in runtime from 4821s to 6s on one filesystem.

Signed-off-by: Doug Porter <[email protected]>
---
e2fsck/Makefile.in | 4 +-
e2fsck/region.c | 109 ++++++++++++++++++++++++++++-------------------------
lib/support/dict.c | 6 ---
3 files changed, 60 insertions(+), 59 deletions(-)

diff --git a/e2fsck/Makefile.in b/e2fsck/Makefile.in
index e43d340..cf60082 100644
--- a/e2fsck/Makefile.in
+++ b/e2fsck/Makefile.in
@@ -148,7 +148,7 @@ tst_region: region.c $(DEPLIBCOM_ERR)
$(E) " LD $@"
$(Q) $(CC) -o tst_region $(srcdir)/region.c \
$(ALL_CFLAGS) $(ALL_LDFLAGS) -DTEST_PROGRAM \
- $(LIBCOM_ERR) $(SYSLIBS)
+ $(LIBCOM_ERR) $(LIBSUPPORT) $(SYSLIBS)

fullcheck check:: tst_refcount tst_region tst_problem
$(TESTENV) ./tst_refcount
@@ -499,7 +499,7 @@ region.o: $(srcdir)/region.c $(top_builddir)/lib/config.h \
$(top_srcdir)/lib/ext2fs/ext2_ext_attr.h $(top_srcdir)/lib/ext2fs/bitops.h \
$(top_srcdir)/lib/support/profile.h $(top_builddir)/lib/support/prof_err.h \
$(top_srcdir)/lib/support/quotaio.h $(top_srcdir)/lib/support/dqblk_v2.h \
- $(top_srcdir)/lib/support/quotaio_tree.h
+ $(top_srcdir)/lib/support/quotaio_tree.h $(top_srcdir)/lib/support/dict.h
sigcatcher.o: $(srcdir)/sigcatcher.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/dirpaths.h $(srcdir)/e2fsck.h \
$(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h \
diff --git a/e2fsck/region.c b/e2fsck/region.c
index e32f89d..41bdc9f 100644
--- a/e2fsck/region.c
+++ b/e2fsck/region.c
@@ -19,19 +19,37 @@
#undef ENABLE_NLS
#endif
#include "e2fsck.h"
+#include "support/dict.h"

struct region_el {
region_addr_t start;
region_addr_t end;
- struct region_el *next;
};

struct region_struct {
region_addr_t min;
region_addr_t max;
- struct region_el *allocated;
+ dict_t dict;
};

+static int region_dict_cmp(const void *a, const void *b)
+{
+ if (*(region_addr_t *)a > *(region_addr_t *)b)
+ return 1;
+ if (*(region_addr_t *)a < *(region_addr_t *)b)
+ return -1;
+ return 0;
+}
+
+static void region_dnode_free(dnode_t *node, void *context)
+{
+ void *data;
+
+ data = dnode_get(node);
+ free(data);
+ free(node);
+}
+
region_t region_create(region_addr_t min, region_addr_t max)
{
region_t region;
@@ -42,24 +60,23 @@ region_t region_create(region_addr_t min, region_addr_t max)
memset(region, 0, sizeof(struct region_struct));
region->min = min;
region->max = max;
+
+ dict_init(&region->dict, DICTCOUNT_T_MAX, region_dict_cmp);
+ dict_set_allocator(&region->dict, NULL, region_dnode_free, NULL);
+
return region;
}

void region_free(region_t region)
{
- struct region_el *r, *next;
-
- for (r = region->allocated; r; r = next) {
- next = r->next;
- free(r);
- }
- memset(region, 0, sizeof(struct region_struct));
+ dict_free_nodes(&region->dict);
free(region);
}

int region_allocate(region_t region, region_addr_t start, int n)
{
- struct region_el *r, *new_region, *prev, *next;
+ struct dnode_t *lower_node = NULL, *upper_node = NULL;
+ struct region_el *new_region, *lower = NULL, *upper = NULL;
region_addr_t end;

end = start+n;
@@ -68,52 +85,38 @@ int region_allocate(region_t region, region_addr_t start, int n)
if (n == 0)
return 1;

- /*
- * Search through the linked list. If we find that it
- * conflicts witih something that's already allocated, return
- * 1; if we can find an existing region which we can grow, do
- * so. Otherwise, stop when we find the appropriate place
- * insert a new region element into the linked list.
- */
- for (r = region->allocated, prev=NULL; r; prev = r, r = r->next) {
- if (((start >= r->start) && (start < r->end)) ||
- ((end > r->start) && (end <= r->end)) ||
- ((start <= r->start) && (end >= r->end)))
+ /* Return 1 if we conflict with something that's already allocated. */
+ lower_node = dict_upper_bound(&region->dict, &start);
+ if (lower_node) {
+ lower = dnode_get(lower_node);
+ if (start < lower->end)
return 1;
- if (end == r->start) {
- r->start = start;
- return 0;
- }
- if (start == r->end) {
- if ((next = r->next)) {
- if (end > next->start)
- return 1;
- if (end == next->start) {
- r->end = next->end;
- r->next = next->next;
- free(next);
- return 0;
- }
- }
- r->end = end;
- return 0;
- }
- if (start < r->start)
- break;
}
- /*
- * Insert a new region element structure into the linked list
- */
+ upper_node = dict_lower_bound(&region->dict, &start);
+ if (upper_node) {
+ upper = dnode_get(upper_node);
+ if (end > upper->start)
+ return 1;
+ }
+
+ /* Consolidate continguous allocations. */
+ if (lower && start == lower->end) {
+ start = lower->start;
+ dict_delete_free(&region->dict, lower_node);
+ }
+ if (upper && end == upper->start) {
+ end = upper->end;
+ dict_delete_free(&region->dict, upper_node);
+ }
+
+ /* Add new allocation. */
new_region = malloc(sizeof(struct region_el));
if (!new_region)
return -1;
new_region->start = start;
- new_region->end = start + n;
- new_region->next = r;
- if (prev)
- prev->next = new_region;
- else
- region->allocated = new_region;
+ new_region->end = end;
+ if (!dict_alloc_insert(&region->dict, &new_region->start, new_region))
+ return -1;
return 0;
}

@@ -156,15 +159,19 @@ int bcode_program[] = {

void region_print(region_t region, FILE *f)
{
+ dnode_t *node = NULL;
struct region_el *r;
int i = 0;

fprintf(f, "Printing region (min=%llu. max=%llu)\n\t", region->min,
region->max);
- for (r = region->allocated; r; r = r->next) {
+ node = dict_first(&region->dict);
+ while (node) {
+ r = dnode_get(node);
fprintf(f, "(%llu, %llu) ", r->start, r->end);
if (++i >= 8)
fprintf(f, "\n\t");
+ node = dict_next(&region->dict, node);
}
fprintf(f, "\n");
}
diff --git a/lib/support/dict.c b/lib/support/dict.c
index 6a5c15c..e3b2972 100644
--- a/lib/support/dict.c
+++ b/lib/support/dict.c
@@ -490,7 +490,6 @@ dnode_t *dict_lookup(dict_t *dict, const void *key)
return NULL;
}

-#ifdef E2FSCK_NOTUSED
/*
* Look for the node corresponding to the lowest key that is equal to or
* greater than the given key. If there is no such node, return null.
@@ -554,7 +553,6 @@ dnode_t *dict_upper_bound(dict_t *dict, const void *key)

return tentative;
}
-#endif

/*
* Insert a node into the dictionary. The node should have been
@@ -655,7 +653,6 @@ void dict_insert(dict_t *dict, dnode_t *node, const void *key)
dict_assert (dict_verify(dict));
}

-#ifdef E2FSCK_NOTUSED
/*
* Delete the given node from the dictionary. If the given node does not belong
* to the given dictionary, undefined behavior results. A pointer to the
@@ -830,7 +827,6 @@ dnode_t *dict_delete(dict_t *dict, dnode_t *delete)

return delete;
}
-#endif /* E2FSCK_NOTUSED */

/*
* Allocate a node using the dictionary's allocator routine, give it
@@ -849,13 +845,11 @@ int dict_alloc_insert(dict_t *dict, const void *key, void *data)
return 0;
}

-#ifdef E2FSCK_NOTUSED
void dict_delete_free(dict_t *dict, dnode_t *node)
{
dict_delete(dict, node);
dict->freenode(node, dict->context);
}
-#endif

/*
* Return the node with the lowest (leftmost) key. If the dictionary is empty
--
2.9.5


2017-08-22 02:29:56

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: improve performance of region_allocate()

On Fri, Aug 18, 2017 at 06:16:35PM -0700, Doug Porter wrote:
> Use a red-black tree to track allocations instead of a linked list.
> This provides a substantial performance boost when the number of
> allocations in a region is large. This change resulted in a reduction
> in runtime from 4821s to 6s on one filesystem.
>
> Signed-off-by: Doug Porter <[email protected]>

Hi Doug, as it turns out, Jaco Kroon and I had been debugging the same
problem as oen you were working on. We came up with a different way
of solving this problem (see below). It works by observing that
unless the extent tree is terribly corrupted, region_allocate() will
always be appending to the very end of the list.

However, it has since occurred to me that since we are doing an
breadth-first traversal of the extent tree, the starting lba for each
leaf node *must* always be monotonically increasing, and we already
check to make sure this is true within an extent tree block. So I
think it might be possible to dispense with using any kind of data
structure, whether it's an rbtree or a linked list, and instead just
simply make sure we enforce the start+len of the last entry in an
extent tree block is < the starting lba of the first entry in the next
extent tree block.

We are already checking all of the necessary other conditions in
scan_extent_node, so with this additional check, I believe that using
the region tracking code in scan_extent_node (which was originally
written to make sure that extended attribute block did not have any
parts of a string shared between more than one EA key or value pair)
can be made entirely unnecessary for scan_extent_node().

I haven't had a chance to code this alternate fix, but I believe it
should be superior to either your patch or the one which I had drafted
below. Does this make sense to you?

- Ted

commit 8a48ce07a5923242fecc5dc04d6e30dd59a8f07d
Author: Theodore Ts'o <[email protected]>
Date: Mon Aug 14 19:52:39 2017 -0400

e2fsck: add optimization for large, fragmented sparse files

The code which checks for overlapping logical blocks in an extent tree
is O(h*e) in time, where h is the number of holes in the file, and e
is the number of extents in the file. So a file with a large number
of holes can take e2fsck a long time process. Optimize this taking
advantage of the fact the vast majority of the time, region_allocate()
is called with increasing logical block numbers, so we are almost
always append onto the end of the region list.

Signed-off-by: Theodore Ts'o <[email protected]>

diff --git a/e2fsck/region.c b/e2fsck/region.c
index e32f89db0..95d23be4f 100644
--- a/e2fsck/region.c
+++ b/e2fsck/region.c
@@ -30,6 +30,7 @@ struct region_struct {
region_addr_t min;
region_addr_t max;
struct region_el *allocated;
+ struct region_el *last;
};

region_t region_create(region_addr_t min, region_addr_t max)
@@ -42,6 +43,7 @@ region_t region_create(region_addr_t min, region_addr_t max)
memset(region, 0, sizeof(struct region_struct));
region->min = min;
region->max = max;
+ region->last = NULL;
return region;
}

@@ -68,6 +70,18 @@ int region_allocate(region_t region, region_addr_t start, int n)
if (n == 0)
return 1;

+ if (region->last && region->last->end == start &&
+ !region->last->next) {
+ region->last->end = end;
+ return 0;
+ }
+ if (region->last && start > region->last->end &&
+ !region->last->next) {
+ r = NULL;
+ prev = region->last;
+ goto append_to_list;
+ }
+
/*
* Search through the linked list. If we find that it
* conflicts witih something that's already allocated, return
@@ -92,6 +106,8 @@ int region_allocate(region_t region, region_addr_t start, int n)
r->end = next->end;
r->next = next->next;
free(next);
+ if (!r->next)
+ region->last = r;
return 0;
}
}
@@ -104,12 +120,15 @@ int region_allocate(region_t region, region_addr_t start, int n)
/*
* Insert a new region element structure into the linked list
*/
+append_to_list:
new_region = malloc(sizeof(struct region_el));
if (!new_region)
return -1;
new_region->start = start;
new_region->end = start + n;
new_region->next = r;
+ if (!new_region->next)
+ region->last = new_region;
if (prev)
prev->next = new_region;
else

2017-08-22 09:52:46

by Jaco Kroon

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: improve performance of region_allocate()

Hi Ted, Doug,

Ted, I already emailed you the patch for the latter discussion here,
including my rudimentary benchmarks.

Unfortunately I'm having trouble with inline formatting of the patch, so
I have attached it instead of providing inline (sorry - I know that
makes discussion difficult). Was originally emailed to you as a series
of three independent patches, with the below as 0001. We still need to
discuss the other optimization.

The attached improves CPU performance from O(e*h) to O(e) and memory
from O(h) to O(1). The patch below does similar for CPU but nothing for
memory (In my case it took fsck down by a significant margin).

Previously my fsck got stuck on 0.5% (we both know it got stuck on a
single 340GB file with numerous holes in it, of which I have multiple
such files on my filesystem) and stayed there for a few hours. With
this (and the memory map link-count for pass2) I managed to finish a
fsck on a 40TB filesystem in 508 minutes.

Ted - the provided patch by Doug may still improve performance for the
other uses of region.c as well, but the impact will probably not be as
severe since (as far as I know) there are usually not a great many
number of EAs for each file.

Kind Regards,
Jaco


On 22/08/2017 04:29, Theodore Ts'o wrote:
> On Fri, Aug 18, 2017 at 06:16:35PM -0700, Doug Porter wrote:
>> Use a red-black tree to track allocations instead of a linked list.
>> This provides a substantial performance boost when the number of
>> allocations in a region is large. This change resulted in a reduction
>> in runtime from 4821s to 6s on one filesystem.
>>
>> Signed-off-by: Doug Porter <[email protected]>
> Hi Doug, as it turns out, Jaco Kroon and I had been debugging the same
> problem as oen you were working on. We came up with a different way
> of solving this problem (see below). It works by observing that
> unless the extent tree is terribly corrupted, region_allocate() will
> always be appending to the very end of the list.
>
> However, it has since occurred to me that since we are doing an
> breadth-first traversal of the extent tree, the starting lba for each
> leaf node *must* always be monotonically increasing, and we already
> check to make sure this is true within an extent tree block. So I
> think it might be possible to dispense with using any kind of data
> structure, whether it's an rbtree or a linked list, and instead just
> simply make sure we enforce the start+len of the last entry in an
> extent tree block is < the starting lba of the first entry in the next
> extent tree block.
>
> We are already checking all of the necessary other conditions in
> scan_extent_node, so with this additional check, I believe that using
> the region tracking code in scan_extent_node (which was originally
> written to make sure that extended attribute block did not have any
> parts of a string shared between more than one EA key or value pair)
> can be made entirely unnecessary for scan_extent_node().
>
> I haven't had a chance to code this alternate fix, but I believe it
> should be superior to either your patch or the one which I had drafted
> below. Does this make sense to you?
>
> - Ted
>
> commit 8a48ce07a5923242fecc5dc04d6e30dd59a8f07d
> Author: Theodore Ts'o <[email protected]>
> Date: Mon Aug 14 19:52:39 2017 -0400
>
> e2fsck: add optimization for large, fragmented sparse files
>
> The code which checks for overlapping logical blocks in an extent tree
> is O(h*e) in time, where h is the number of holes in the file, and e
> is the number of extents in the file. So a file with a large number
> of holes can take e2fsck a long time process. Optimize this taking
> advantage of the fact the vast majority of the time, region_allocate()
> is called with increasing logical block numbers, so we are almost
> always append onto the end of the region list.
>
> Signed-off-by: Theodore Ts'o <[email protected]>
>
> diff --git a/e2fsck/region.c b/e2fsck/region.c
> index e32f89db0..95d23be4f 100644
> --- a/e2fsck/region.c
> +++ b/e2fsck/region.c
> @@ -30,6 +30,7 @@ struct region_struct {
> region_addr_t min;
> region_addr_t max;
> struct region_el *allocated;
> + struct region_el *last;
> };
>
> region_t region_create(region_addr_t min, region_addr_t max)
> @@ -42,6 +43,7 @@ region_t region_create(region_addr_t min, region_addr_t max)
> memset(region, 0, sizeof(struct region_struct));
> region->min = min;
> region->max = max;
> + region->last = NULL;
> return region;
> }
>
> @@ -68,6 +70,18 @@ int region_allocate(region_t region, region_addr_t start, int n)
> if (n == 0)
> return 1;
>
> + if (region->last && region->last->end == start &&
> + !region->last->next) {
> + region->last->end = end;
> + return 0;
> + }
> + if (region->last && start > region->last->end &&
> + !region->last->next) {
> + r = NULL;
> + prev = region->last;
> + goto append_to_list;
> + }
> +
> /*
> * Search through the linked list. If we find that it
> * conflicts witih something that's already allocated, return
> @@ -92,6 +106,8 @@ int region_allocate(region_t region, region_addr_t start, int n)
> r->end = next->end;
> r->next = next->next;
> free(next);
> + if (!r->next)
> + region->last = r;
> return 0;
> }
> }
> @@ -104,12 +120,15 @@ int region_allocate(region_t region, region_addr_t start, int n)
> /*
> * Insert a new region element structure into the linked list
> */
> +append_to_list:
> new_region = malloc(sizeof(struct region_el));
> if (!new_region)
> return -1;
> new_region->start = start;
> new_region->end = start + n;
> new_region->next = r;
> + if (!new_region->next)
> + region->last = new_region;
> if (prev)
> prev->next = new_region;
> else


Attachments:
0003-e2fsk-Optimize-out-the-use-of-region.c-for-logical-b.patch (2.52 kB)

2017-08-22 12:45:11

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: improve performance of region_allocate()

On Tue, Aug 22, 2017 at 11:36:54AM +0200, Jaco Kroon wrote:
>
> Unfortunately I'm having trouble with inline formatting of the patch, so I
> have attached it instead of providing inline (sorry - I know that makes
> discussion difficult). Was originally emailed to you as a series of three
> independent patches, with the below as 0001. We still need to discuss the
> other optimization.

Yeah, sorry for not getting back to you. I took a long weekend
vacation and this week has been super busy at work, so I haven't had a
chance to pursue the approach I've outlined in an earlier message ---
that is, instead of optimizing the region code (your patch and my
patch, where your patch is more general, but mine is simpler and only
optimizes the one case that is important for optimizing CPU
performance) or replacing it with an rbtree (Doug's patch), but
instead removing it altogether and replacing it with some better check
that avoids needing the memory usage altogether.

> The attached improves CPU performance from O(e*h) to O(e) and memory from
> O(h) to O(1). The patch below does similar for CPU but nothing for memory
> (In my case it took fsck down by a significant margin).

I thought you were saying you had some false positives (where it was
causing e2fsck to complain about some valid extent trees in your file
system)? That's why I've not acted on your proposed patch until I had
time to study the code more closely to understand what was going on
with the errors I thought you had reported.

> Ted - the provided patch by Doug may still improve performance for the other
> uses of region.c as well, but the impact will probably not be as severe
> since (as far as I know) there are usually not a great many number of EAs
> for each file.

Correct; we are limited to at most 64k if the new (experimental)
ea_inode feature is enabled; and without that feature, we are limited
to the 4k block size. So I'm not particularly worried about the xattr
region use case.

Indeed, the region code was originally designed and implemented for
the xattr use case, and the use of a linked list approach was
deliberately chosen for simplicity because normally there are only a
handful of xattrs, and how they are laid out (keys contiguously
growing from one direction, values contiguously grown from the other
direction) is very friendly for that particular use case. Hence, in
the common, non-error case, there are only going to be two entries on
the linked list when handling xattrs.


As far as the other (icount) optimization is concerned, that's on my
todo list but I'm trying to understand how much of a win it's going to
be on a densly hard linked file system, and whether the complexity due
to the handling the potential increased memory usage is worth it. If
we leave to be something that has to be manually enabled using
/etc/e2fsck.conf, very few people will use it. If we need to somehow
figure out how it's safe / necessary to activate the icount
optimization, especially if there are potentially multiple fsck's
running in parallel, this starts to get really complicated. So
perhaps all we can do is leave it as a manually enabled option, but I
still need to think about that a bit.

Cheers,

- Ted

2017-08-22 13:47:49

by Jaco Kroon

[permalink] [raw]
Subject: Re: *** SPAM *** Re: [PATCH] e2fsck: improve performance of region_allocate()

Hi Ted,

On 22/08/2017 14:45, Theodore Ts'o wrote:
>> The attached improves CPU performance from O(e*h) to O(e) and memory from
>> O(h) to O(1). The patch below does similar for CPU but nothing for memory
>> (In my case it took fsck down by a significant margin).
> I thought you were saying you had some false positives (where it was
> causing e2fsck to complain about some valid extent trees in your file
> system)? That's why I've not acted on your proposed patch until I had
> time to study the code more closely to understand what was going on
> with the errors I thought you had reported.
I did in fact manage to clear it, but raised it so that you could
confirm. With all three patches I sent you directly applied:

338 tests succeeded 0 tests failed

> As far as the other (icount) optimization is concerned, that's on my
> todo list but I'm trying to understand how much of a win it's going to
> be on a densly hard linked file system, and whether the complexity due
> to the handling the potential increased memory usage is worth it. If
> we leave to be something that has to be manually enabled using
> /etc/e2fsck.conf, very few people will use it. If we need to somehow
> figure out how it's safe / necessary to activate the icount
> optimization, especially if there are potentially multiple fsck's
> running in parallel, this starts to get really complicated. So
> perhaps all we can do is leave it as a manually enabled option, but I
> still need to think about that a bit.
My (purely theoretical) assessment on that (attached - proof-of-concept
quality at best) below.

n = the number of files with a link count > 1;
t = the total number of possible inodes;
g = sum of link counts for all inodes with link count >1; and
c = the average link-count for inodes with link count >1 (g/n).

my case n = ~100 million and t = ~2.8 billion. I don't have an estimate
of g for my system, but reckon it can be in the range of 2 billion quite
trivially.

Pass1 assessment: insertions are always onto the end of the list
(inodes traversed sequentially), and cpu and memory wise reduces to O(n)
(memory = 96/8 * n = 12n bytes, 1.2GB in my case). I don't think there
is improvements to be had during pass1.

Pass2, current code: we clone the list of inodes with link-count>1,
avoiding expensive mid-array insertions, this reduces to a really fast
O(n). "increments" are expensive, and each increment results in a
binary search for the correct entry in the array, costing O(log(n)) - 27
comparison ops for my 40TB filesystem. We perform this g number of
times, so overall cpu is O(g.log(n)). memory remains identical to above.

Pass2, full map (possible "optimization"):

We allocate an array of __u16 for t inodes. In my use-case this
requires 2.8bn index size, or 5.6GB RAM. Memory *normally* goes up to O(t).

Only if n > 16.7% of t does memory usage decrease - based on the fact
that my filesystem has average file size of 394KB, and is <1TB remaining
on a 40TB file system at 107m files, n/t in my case = 3.5%, I find this
use-case extremely unlikely. We can thus argue we ALWAYS sacrifice
memory, from O(n) to O(t).

In terms of CPU however the increase operation now becomes O(1), from
O(log(n)). Depending on the value of g this can be a huge gain, and
since the O(1) here is a VERY FAST O(1) I suspect the 27x factor in my
use-case is an underestimate of the actual factor (I suspect even if we
have only 1 entry in the list the fact that we have to go through just
one search iteration is at least 3-5x more expensive in terms of the
constant, thus I *suspect* that a sensible cpu speedup for this check
for comparing link counts in inodes compared to actual directory
structures is probably around 100x during pass2. I could very well be
wrong.

That said, assuming that we will need to go out to disk to retrieve
directory structures we're probably going to be IO bound at this point
anyway, but you're the better person to comment on how disk/cpu bound
pass2 is in general at this stage.

Performing CPU benchmarks that doesn't require disk should be relatively
trivial (iterate through a set of values for n and g and generate random
data - at no point does the cpu utilization depend on the value of t, so
we can measure the effective difference on even super-crazy values of n
and g (constrained by g <= n*Max_Link_Count). If this is something that
would be interesting I'll happily see if I can generate something.

My opinion: if pass2 is currently generally CPU bound for such use-cases
we should look at this further, if disk bound then there is no point.
We can easily calculate the values of n, t and g during pass1 (or even
as setup code for pass2, everything we need is available to icount.c
during construction time of pass2 data structure). Perhaps it makes
sense to switch to full_map implementation heuristically based on those
values, but we'd need to understand the potential impact. We can
trivially fail back to the current implementation if the memory
allocation fails.

Kind Regards,
Jaco


Attachments:
0002-e2fsck-add-optimization-for-heavy-hard-linked-pass2.patch (6.84 kB)

2017-08-23 23:21:40

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: improve performance of region_allocate()

Jaco, here's the version of your patch I plan to use. Please take a
look.

The only real changes are mostly whitespace and formatting.

By the way, it would be nice if you could send me permission to add a

Signed-off-by: Jaco Kroon <[email protected]>

See http://elinux.org/Developer_Certificate_Of_Origin for an
explanation of what this means.

- Ted

commit 320d436c006dc2550ebd01084b6e823f0cea8bc2
Author: Jaco Kroon <[email protected]>
Date: Wed Aug 23 13:54:25 2017 -0400

e2fsck: optimize out the use region_t in scan_extent_node()

Since extents have a guarantee of being monotonically increasing we
merely need to check that block n+1 starts after block n. This is a
simple enough check and we can perform this by calculating the next
expected logical block instead of using the region usage tracking data
abstraction.

Signed-off-by: Theodore Ts'o <[email protected]>

diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
index 8044beed6..bc26beb99 100644
--- a/e2fsck/pass1.c
+++ b/e2fsck/pass1.c
@@ -96,7 +96,7 @@ struct process_block_struct {
struct problem_context *pctx;
ext2fs_block_bitmap fs_meta_blocks;
e2fsck_t ctx;
- region_t region;
+ blk64_t next_lblock;
struct extent_tree_info eti;
};

@@ -2628,9 +2628,18 @@ static void scan_extent_node(e2fsck_t ctx, struct problem_context *pctx,
(1U << (21 - ctx->fs->super->s_log_block_size))))
problem = PR_1_TOOBIG_DIR;

- if (is_leaf && problem == 0 && extent.e_len > 0 &&
- region_allocate(pb->region, extent.e_lblk, extent.e_len))
- problem = PR_1_EXTENT_COLLISION;
+ if (is_leaf && problem == 0 && extent.e_len > 0) {
+#if 0
+ printf("extent_region(ino=%u, expect=%llu, "
+ "lblk=%llu, len=%u)\n",
+ pb->ino, pb->next_lblock,
+ extent.e_lblk, extent.e_len);
+#endif
+ if (extent.e_lblk < pb->next_lblock)
+ problem = PR_1_EXTENT_COLLISION;
+ else if (extent.e_lblk + extent.e_len > pb->next_lblock)
+ pb->next_lblock = extent.e_lblk + extent.e_len;
+ }

/*
* Uninitialized blocks in a directory? Clear the flag and
@@ -2963,13 +2972,7 @@ static void check_blocks_extents(e2fsck_t ctx, struct problem_context *pctx,
memset(pb->eti.ext_info, 0, sizeof(pb->eti.ext_info));
pb->eti.ino = pb->ino;

- pb->region = region_create(0, info.max_lblk);
- if (!pb->region) {
- ext2fs_extent_free(ehandle);
- fix_problem(ctx, PR_1_EXTENT_ALLOC_REGION_ABORT, pctx);
- ctx->flags |= E2F_FLAG_ABORT;
- return;
- }
+ pb->next_lblock = 0;

eof_lblk = ((EXT2_I_SIZE(inode) + fs->blocksize - 1) >>
EXT2_BLOCK_SIZE_BITS(fs->super)) - 1;
@@ -2982,8 +2985,6 @@ static void check_blocks_extents(e2fsck_t ctx, struct problem_context *pctx,
"check_blocks_extents");
pctx->errcode = 0;
}
- region_free(pb->region);
- pb->region = NULL;
ext2fs_extent_free(ehandle);

/* Rebuild unless it's a dir and we're rehashing it */

2017-08-23 23:23:53

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH] e2fsck: add optimization for heavily hard-linked file systems

Jaco,

Here's my revised version of your patch. The main differences are in
how to enable this optimization, and the fact that this optimizatoin
is disabled by default. Users will have to either explicitly request
it by using e2fsck -E inode_count_fullmap or via /etc/e2fsck.conf.

Please take a look and let me know what you think. (And again, if you
could give me permission to add your Signed-off-by, I would appreciate
it.)

Thanks,

- Ted

commit 75941780c27190c6507aa7ca813caa79638926c8
Author: Jaco Kroon <[email protected]>
Date: Wed Aug 23 14:21:43 2017 -0400

e2fsck: add optimization for heavily hard-linked file systems

In the case of file system with large number of hard links, e2fsck can
take a large amount of time in pass 2 due to binary search lookup of
inode numbers. This implements a memory trade-off (storing 2 bytes
in-memory for each inode to store inode counts).

For a 40TB filesystem with 2.8bn inodes this map alone requires 5.7GB
of RAM. For this reason, we don't enable this optimization by
default. It can be enabled using either an extended option to e2fsck
or via a seting in e2fsck.conf.

Even when the fullmap optimization is enabled, we don't use this for
the icount structure in pass 1. This is because the gain CPU gain is
nearly nil for that pass and the sacrificed memory does not justify
the increase in RAM.

(It could be that during pass 1, if more than 17% if possible inodes
has link_count>1 (466m inodes in the 40TB with 2.8bn possible inodes
case) then it becomes more memory efficient to use the full map
implementation in terms of memory. However, this is extremely
unlikely given that most file systems are heavily over-provisioned in
terms of the number of inodes in the system.)

Signed-off-by: Theodore Ts'o <[email protected]>

diff --git a/e2fsck/e2fsck.8.in b/e2fsck/e2fsck.8.in
index 786eb15e7..4c29943d3 100644
--- a/e2fsck/e2fsck.8.in
+++ b/e2fsck/e2fsck.8.in
@@ -228,7 +228,29 @@ exactly the opposite of discard option. This is set as default.
.TP
.BI no_optimize_extents
Do not offer to optimize the extent tree by eliminating unnecessary
-width or depth.
+width or depth. This can also be enabled in the options section of
+.BR /etc/e2fsck.conf .
+.TP
+.BI optimize_extents
+Offer to optimize the extent tree by eliminating unnecessary
+width or depth. This is the default unless otherwise specified in
+.BR /etc/e2fsck.conf .
+.TP
+.BI inode_count_fullmap
+Trade off using memory for speed when checking a file system with a
+large number of hard-linked files. The amount of memory required is
+proportional to the number of inodes in the file system. For large file
+systems, this can be gigabytes of memory. (For example, a 40TB file system
+with 2.8 billion inodes will consume an additional 5.7 GB memory if this
+optimization is enabled.) This optimization can also be enabled in the
+options section of
+.BR /etc/e2fsck.conf .
+.TP
+.BI no_inode_count_fullmap
+Disable the
+.B inode_count_fullmap
+optimization. This is the default unless otherwise specified in
+.BR /etc/e2fsck.conf .
.TP
.BI readahead_kb
Use this many KiB of memory to pre-fetch metadata in the hopes of reducing
diff --git a/e2fsck/e2fsck.conf.5.in b/e2fsck/e2fsck.conf.5.in
index fbde7ef0b..d8205bcf1 100644
--- a/e2fsck/e2fsck.conf.5.in
+++ b/e2fsck/e2fsck.conf.5.in
@@ -157,6 +157,15 @@ the average fill ratio of directories can be maintained at a
higher, more efficient level. This relation defaults to 20
percent.
.TP
+.I inode_count_fullmap
+If this boolean relation is true, trade off using memory for speed when
+checking a file system with a large number of hard-linked files. The
+amount of memory required is proportional to the number of inodes in the
+file system. For large file systems, this can be gigabytes of memory.
+(For example a 40TB file system with 2.8 billion inodes will consume an
+additional 5.7 GB memory if this optimization is enabled.) This setting
+defaults to false.
+.TP
.I log_dir
If the
.I log_filename
@@ -206,8 +215,8 @@ of that type are squelched. This can be useful if the console is slow
end up delaying the boot process for a long time (potentially hours).
.TP
.I no_optimize_extents
-Do not offer to optimize the extent tree by eliminating unnecessary
-width or depth.
+If this boolean relation is true, do not offer to optimize the extent
+tree by reducing the tree's width or depth. This setting defaults to false.
.TP
.I readahead_mem_pct
Use this percentage of memory to try to read in metadata blocks ahead of the
diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h
index 81c09d7eb..12855453d 100644
--- a/e2fsck/e2fsck.h
+++ b/e2fsck/e2fsck.h
@@ -170,6 +170,7 @@ struct resource_track {
#define E2F_OPT_CONVERT_BMAP 0x4000 /* convert blockmap to extent */
#define E2F_OPT_FIXES_ONLY 0x8000 /* skip all optimizations */
#define E2F_OPT_NOOPT_EXTENTS 0x10000 /* don't optimize extents */
+#define E2F_OPT_ICOUNT_FULLMAP 0x20000 /* don't optimize extents */

/*
* E2fsck flags
diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
index bc26beb99..6442c9449 100644
--- a/e2fsck/pass1.c
+++ b/e2fsck/pass1.c
@@ -711,6 +711,7 @@ extern errcode_t e2fsck_setup_icount(e2fsck_t ctx, const char *icount_name,
errcode_t retval;
char *tdb_dir;
int enable;
+ int full_map;

*ret = 0;

@@ -734,6 +735,8 @@ extern errcode_t e2fsck_setup_icount(e2fsck_t ctx, const char *icount_name,
}
e2fsck_set_bitmap_type(ctx->fs, EXT2FS_BMAP64_RBTREE, icount_name,
&save_type);
+ if (ctx->options & E2F_OPT_ICOUNT_FULLMAP)
+ flags |= EXT2_ICOUNT_OPT_FULLMAP;
retval = ext2fs_create_icount2(ctx->fs, flags, 0, hint, ret);
ctx->fs->default_bitmap_type = save_type;
return retval;
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index ff961483b..939862f1a 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -709,9 +709,18 @@ static void parse_extended_opts(e2fsck_t ctx, const char *opts)
} else if (strcmp(token, "nodiscard") == 0) {
ctx->options &= ~E2F_OPT_DISCARD;
continue;
+ } else if (strcmp(token, "optimize_extents") == 0) {
+ ctx->options &= ~E2F_OPT_NOOPT_EXTENTS;
+ continue;
} else if (strcmp(token, "no_optimize_extents") == 0) {
ctx->options |= E2F_OPT_NOOPT_EXTENTS;
continue;
+ } else if (strcmp(token, "inode_count_fullmap") == 0) {
+ ctx->options |= E2F_OPT_ICOUNT_FULLMAP;
+ continue;
+ } else if (strcmp(token, "no_inode_count_fullmap") == 0) {
+ ctx->options &= ~E2F_OPT_ICOUNT_FULLMAP;
+ continue;
} else if (strcmp(token, "log_filename") == 0) {
if (!arg)
extended_usage++;
@@ -733,17 +742,21 @@ static void parse_extended_opts(e2fsck_t ctx, const char *opts)
free(buf);

if (extended_usage) {
- fputs(("\nExtended options are separated by commas, "
+ fputs(_("\nExtended options are separated by commas, "
"and may take an argument which\n"
"is set off by an equals ('=') sign. "
- "Valid extended options are:\n"), stderr);
- fputs(("\tea_ver=<ea_version (1 or 2)>\n"), stderr);
- fputs(("\tfragcheck\n"), stderr);
- fputs(("\tjournal_only\n"), stderr);
- fputs(("\tdiscard\n"), stderr);
- fputs(("\tnodiscard\n"), stderr);
- fputs(("\treadahead_kb=<buffer size>\n"), stderr);
- fputs(("\tbmap2extent\n"), stderr);
+ "Valid extended options are:\n\n"), stderr);
+ fputs(_("\tea_ver=<ea_version (1 or 2)>\n"), stderr);
+ fputs("\tfragcheck\n", stderr);
+ fputs("\tjournal_only\n", stderr);
+ fputs("\tdiscard\n", stderr);
+ fputs("\tnodiscard\n", stderr);
+ fputs("\toptimize_extents\n", stderr);
+ fputs("\tno_optimize_extents\n", stderr);
+ fputs("\tinode_count_fullmap\n", stderr);
+ fputs("\tno_inode_count_fullmap\n", stderr);
+ fputs(_("\treadahead_kb=<buffer size>\n"), stderr);
+ fputs("\tbmap2extent\n", stderr);
fputc('\n', stderr);
exit(1);
}
@@ -1015,6 +1028,11 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
if (c)
ctx->options |= E2F_OPT_NOOPT_EXTENTS;

+ profile_get_boolean(ctx->profile, "options", "inode_count_fullmap",
+ 0, 0, &c);
+ if (c)
+ ctx->options |= E2F_OPT_ICOUNT_FULLMAP;
+
if (ctx->readahead_kb == ~0ULL) {
profile_get_integer(ctx->profile, "options",
"readahead_mem_pct", 0, -1, &c);
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index 8ff49ca66..68d0e2a57 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -532,6 +532,7 @@ typedef struct ext2_struct_inode_scan *ext2_inode_scan;
* ext2_icount_t abstraction
*/
#define EXT2_ICOUNT_OPT_INCREMENT 0x01
+#define EXT2_ICOUNT_OPT_FULLMAP 0x02

typedef struct ext2_icount *ext2_icount_t;

diff --git a/lib/ext2fs/icount.c b/lib/ext2fs/icount.c
index 594b1cc2b..65420e9f3 100644
--- a/lib/ext2fs/icount.c
+++ b/lib/ext2fs/icount.c
@@ -61,6 +61,7 @@ struct ext2_icount {
char *tdb_fn;
TDB_CONTEXT *tdb;
#endif
+ __u16 *fullmap;
};

/*
@@ -93,6 +94,9 @@ void ext2fs_free_icount(ext2_icount_t icount)
}
#endif

+ if (icount->fullmap)
+ ext2fs_free_mem(&icount->fullmap);
+
ext2fs_free_mem(&icount);
}

@@ -108,10 +112,24 @@ static errcode_t alloc_icount(ext2_filsys fs, int flags, ext2_icount_t *ret)
return retval;
memset(icount, 0, sizeof(struct ext2_icount));

+ if ((flags & EXT2_ICOUNT_OPT_FULLMAP) &&
+ (flags & EXT2_ICOUNT_OPT_INCREMENT)) {
+ retval = ext2fs_get_mem((sizeof(__u32) *
+ fs->super->s_inodes_count),
+ &icount->fullmap);
+ /* If we can't allocate, fall back */
+ if (!retval) {
+ memset(icount->fullmap, 0,
+ sizeof(__u32) * fs->super->s_inodes_count);
+ goto successout;
+ }
+ }
+
retval = ext2fs_allocate_inode_bitmap(fs, "icount", &icount->single);
if (retval)
goto errout;

+
if (flags & EXT2_ICOUNT_OPT_INCREMENT) {
retval = ext2fs_allocate_inode_bitmap(fs, "icount_inc",
&icount->multiple);
@@ -120,6 +138,7 @@ static errcode_t alloc_icount(ext2_filsys fs, int flags, ext2_icount_t *ret)
} else
icount->multiple = 0;

+successout:
icount->magic = EXT2_ET_MAGIC_ICOUNT;
icount->num_inodes = fs->super->s_inodes_count;

@@ -256,6 +275,9 @@ errcode_t ext2fs_create_icount2(ext2_filsys fs, int flags, unsigned int size,
if (retval)
return retval;

+ if (icount->fullmap)
+ goto successout;
+
if (size) {
icount->size = size;
} else {
@@ -295,6 +317,7 @@ errcode_t ext2fs_create_icount2(ext2_filsys fs, int flags, unsigned int size,
icount->count = hint->count;
}

+successout:
*ret = icount;
return 0;

@@ -433,6 +456,11 @@ static errcode_t set_inode_count(ext2_icount_t icount, ext2_ino_t ino,
return 0;
}
#endif
+ if (icount->fullmap) {
+ icount->fullmap[ino] = icount_16_xlate(count);
+ return 0;
+ }
+
el = get_icount_el(icount, ino, 1);
if (!el)
return EXT2_ET_NO_MEMORY;
@@ -463,6 +491,11 @@ static errcode_t get_inode_count(ext2_icount_t icount, ext2_ino_t ino,
return 0;
}
#endif
+ if (icount->fullmap) {
+ *count = icount->fullmap[ino];
+ return 0;
+ }
+
el = get_icount_el(icount, ino, 0);
if (!el) {
*count = 0;
@@ -504,14 +537,16 @@ errcode_t ext2fs_icount_fetch(ext2_icount_t icount, ext2_ino_t ino, __u16 *ret)
if (!ino || (ino > icount->num_inodes))
return EXT2_ET_INVALID_ARGUMENT;

- if (ext2fs_test_inode_bitmap2(icount->single, ino)) {
- *ret = 1;
- return 0;
- }
- if (icount->multiple &&
- !ext2fs_test_inode_bitmap2(icount->multiple, ino)) {
- *ret = 0;
- return 0;
+ if (!icount->fullmap) {
+ if (ext2fs_test_inode_bitmap2(icount->single, ino)) {
+ *ret = 1;
+ return 0;
+ }
+ if (icount->multiple &&
+ !ext2fs_test_inode_bitmap2(icount->multiple, ino)) {
+ *ret = 0;
+ return 0;
+ }
}
get_inode_count(icount, ino, &val);
*ret = icount_16_xlate(val);
@@ -528,7 +563,10 @@ errcode_t ext2fs_icount_increment(ext2_icount_t icount, ext2_ino_t ino,
if (!ino || (ino > icount->num_inodes))
return EXT2_ET_INVALID_ARGUMENT;

- if (ext2fs_test_inode_bitmap2(icount->single, ino)) {
+ if (icount->fullmap) {
+ curr_value = icount_16_xlate(icount->fullmap[ino] + 1);
+ icount->fullmap[ino] = curr_value;
+ } else if (ext2fs_test_inode_bitmap2(icount->single, ino)) {
/*
* If the existing count is 1, then we know there is
* no entry in the list.
@@ -585,6 +623,16 @@ errcode_t ext2fs_icount_decrement(ext2_icount_t icount, ext2_ino_t ino,

EXT2_CHECK_MAGIC(icount, EXT2_ET_MAGIC_ICOUNT);

+ if (icount->fullmap) {
+ if (!icount->fullmap[ino])
+ return EXT2_ET_INVALID_ARGUMENT;
+
+ curr_value = --icount->fullmap[ino];
+ if (ret)
+ *ret = icount_16_xlate(curr_value);
+ return 0;
+ }
+
if (ext2fs_test_inode_bitmap2(icount->single, ino)) {
ext2fs_unmark_inode_bitmap2(icount->single, ino);
if (icount->multiple)
@@ -626,6 +674,9 @@ errcode_t ext2fs_icount_store(ext2_icount_t icount, ext2_ino_t ino,

EXT2_CHECK_MAGIC(icount, EXT2_ET_MAGIC_ICOUNT);

+ if (icount->fullmap)
+ return set_inode_count(icount, ino, count);
+
if (count == 1) {
ext2fs_mark_inode_bitmap2(icount->single, ino);
if (icount->multiple)

2017-08-24 08:18:31

by Jaco Kroon

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: improve performance of region_allocate()

Hi Ted,

On 24/08/2017 01:21, Theodore Ts'o wrote:
> Jaco, here's the version of your patch I plan to use. Please take a
> look.
>
> The only real changes are mostly whitespace and formatting.
Variable name change + formatting :).
>
> By the way, it would be nice if you could send me permission to add a
>
> Signed-off-by: Jaco Kroon <[email protected]>
Approved.

Kind Regards,
Jaco

>
> See http://elinux.org/Developer_Certificate_Of_Origin for an
> explanation of what this means.
>
> - Ted
>
> commit 320d436c006dc2550ebd01084b6e823f0cea8bc2
> Author: Jaco Kroon <[email protected]>
> Date: Wed Aug 23 13:54:25 2017 -0400
>
> e2fsck: optimize out the use region_t in scan_extent_node()
>
> Since extents have a guarantee of being monotonically increasing we
> merely need to check that block n+1 starts after block n. This is a
> simple enough check and we can perform this by calculating the next
> expected logical block instead of using the region usage tracking data
> abstraction.
>
> Signed-off-by: Theodore Ts'o <[email protected]>
>
> diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
> index 8044beed6..bc26beb99 100644
> --- a/e2fsck/pass1.c
> +++ b/e2fsck/pass1.c
> @@ -96,7 +96,7 @@ struct process_block_struct {
> struct problem_context *pctx;
> ext2fs_block_bitmap fs_meta_blocks;
> e2fsck_t ctx;
> - region_t region;
> + blk64_t next_lblock;
> struct extent_tree_info eti;
> };
>
> @@ -2628,9 +2628,18 @@ static void scan_extent_node(e2fsck_t ctx, struct problem_context *pctx,
> (1U << (21 - ctx->fs->super->s_log_block_size))))
> problem = PR_1_TOOBIG_DIR;
>
> - if (is_leaf && problem == 0 && extent.e_len > 0 &&
> - region_allocate(pb->region, extent.e_lblk, extent.e_len))
> - problem = PR_1_EXTENT_COLLISION;
> + if (is_leaf && problem == 0 && extent.e_len > 0) {
> +#if 0
> + printf("extent_region(ino=%u, expect=%llu, "
> + "lblk=%llu, len=%u)\n",
> + pb->ino, pb->next_lblock,
> + extent.e_lblk, extent.e_len);
> +#endif
> + if (extent.e_lblk < pb->next_lblock)
> + problem = PR_1_EXTENT_COLLISION;
> + else if (extent.e_lblk + extent.e_len > pb->next_lblock)
> + pb->next_lblock = extent.e_lblk + extent.e_len;
> + }
>
> /*
> * Uninitialized blocks in a directory? Clear the flag and
> @@ -2963,13 +2972,7 @@ static void check_blocks_extents(e2fsck_t ctx, struct problem_context *pctx,
> memset(pb->eti.ext_info, 0, sizeof(pb->eti.ext_info));
> pb->eti.ino = pb->ino;
>
> - pb->region = region_create(0, info.max_lblk);
> - if (!pb->region) {
> - ext2fs_extent_free(ehandle);
> - fix_problem(ctx, PR_1_EXTENT_ALLOC_REGION_ABORT, pctx);
> - ctx->flags |= E2F_FLAG_ABORT;
> - return;
> - }
> + pb->next_lblock = 0;
>
> eof_lblk = ((EXT2_I_SIZE(inode) + fs->blocksize - 1) >>
> EXT2_BLOCK_SIZE_BITS(fs->super)) - 1;
> @@ -2982,8 +2985,6 @@ static void check_blocks_extents(e2fsck_t ctx, struct problem_context *pctx,
> "check_blocks_extents");
> pctx->errcode = 0;
> }
> - region_free(pb->region);
> - pb->region = NULL;
> ext2fs_extent_free(ehandle);
>
> /* Rebuild unless it's a dir and we're rehashing it */
>

2017-08-26 15:33:18

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: add optimization for heavily hard-linked file systems

On Thu, Aug 24, 2017 at 11:08:52AM +0200, Jaco Kroon wrote:
> > +#define E2F_OPT_ICOUNT_FULLMAP 0x20000 /* don't optimize extents */
> description here seems wrong? Perhaps "use fullmap for inode link count" ?

Oops, fixed.

> > --- a/e2fsck/pass1.c
> > +++ b/e2fsck/pass1.c
> > @@ -711,6 +711,7 @@ extern errcode_t e2fsck_setup_icount(e2fsck_t ctx, const char *icount_name,
> > errcode_t retval;
> > char *tdb_dir;
> > int enable;
> > + int full_map;
> > *ret = 0;
> > @@ -734,6 +735,8 @@ extern errcode_t e2fsck_setup_icount(e2fsck_t ctx, const char *icount_name,
> > }
> > e2fsck_set_bitmap_type(ctx->fs, EXT2FS_BMAP64_RBTREE, icount_name,
> > &save_type);
> > + if (ctx->options & E2F_OPT_ICOUNT_FULLMAP)
> > + flags |= EXT2_ICOUNT_OPT_FULLMAP;
> > retval = ext2fs_create_icount2(ctx->fs, flags, 0, hint, ret);
> > ctx->fs->default_bitmap_type = save_type;
> > return retval;
> This isn't used for pass1 (I did originally use full_map either way then
> decided to only use it for pass2) - and we to receive flags coming into this
> function - shouldn't this perhaps rather go into pass2.c (e2fsck_pass2
> function to be more specific)? This really is cosmetic imho.

The e2fsck_setup_icount() function is used for both pass1 and pass2.

> > diff --git a/e2fsck/unix.c b/e2fsck/unix.c
> > index ff961483b..939862f1a 100644
> > --- a/e2fsck/unix.c
> > +++ b/e2fsck/unix.c
> > @@ -709,9 +709,18 @@ static void parse_extended_opts(e2fsck_t ctx, const char *opts)
> > } else if (strcmp(token, "nodiscard") == 0) {
> > ctx->options &= ~E2F_OPT_DISCARD;
> > continue;
> > + } else if (strcmp(token, "optimize_extents") == 0) {
> > + ctx->options &= ~E2F_OPT_NOOPT_EXTENTS;
> > + continue;
> unrelated, but makes sense :).

Yes, this was a cleanup I was doing as I weant.

> > + if (!retval) {
> > + memset(icount->fullmap, 0,
> > + *sizeof(__u32)* * fs->super->s_inodes_count); <-------
> this of course should be sizeof(__u16), or sizeof(*count->fullmap) which
> will track the type pointed to. It might be better practice to store
> "sizeof(*icount->fullmap) * fs->super->s_inodes_count" to a variable and
> just re-use it.

Good point. Fixed.

> > goto errout;
> > +
> Extra line can be removed.
> > if (flags & EXT2_ICOUNT_OPT_INCREMENT) {

Removed.

Thanks for taking a look at the patch!

- Ted

2017-09-11 10:19:26

by Jaco Kroon

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: add optimization for heavily hard-linked file systems

Hi Theodore,

I got severely side-tracked after the previous look at this.

With the mentions below, ACK.

Kind Regards,
Jaco


On 26/08/2017 17:33, Theodore Ts'o wrote:
> On Thu, Aug 24, 2017 at 11:08:52AM +0200, Jaco Kroon wrote:
>>> +#define E2F_OPT_ICOUNT_FULLMAP 0x20000 /* don't optimize extents */
>> description here seems wrong? Perhaps "use fullmap for inode link count" ?
> Oops, fixed.
>
>>> --- a/e2fsck/pass1.c
>>> +++ b/e2fsck/pass1.c
>>> @@ -711,6 +711,7 @@ extern errcode_t e2fsck_setup_icount(e2fsck_t ctx, const char *icount_name,
>>> errcode_t retval;
>>> char *tdb_dir;
>>> int enable;
>>> + int full_map;
>>> *ret = 0;
>>> @@ -734,6 +735,8 @@ extern errcode_t e2fsck_setup_icount(e2fsck_t ctx, const char *icount_name,
>>> }
>>> e2fsck_set_bitmap_type(ctx->fs, EXT2FS_BMAP64_RBTREE, icount_name,
>>> &save_type);
>>> + if (ctx->options & E2F_OPT_ICOUNT_FULLMAP)
>>> + flags |= EXT2_ICOUNT_OPT_FULLMAP;
>>> retval = ext2fs_create_icount2(ctx->fs, flags, 0, hint, ret);
>>> ctx->fs->default_bitmap_type = save_type;
>>> return retval;
>> This isn't used for pass1 (I did originally use full_map either way then
>> decided to only use it for pass2) - and we to receive flags coming into this
>> function - shouldn't this perhaps rather go into pass2.c (e2fsck_pass2
>> function to be more specific)? This really is cosmetic imho.
> The e2fsck_setup_icount() function is used for both pass1 and pass2.
>
>>> diff --git a/e2fsck/unix.c b/e2fsck/unix.c
>>> index ff961483b..939862f1a 100644
>>> --- a/e2fsck/unix.c
>>> +++ b/e2fsck/unix.c
>>> @@ -709,9 +709,18 @@ static void parse_extended_opts(e2fsck_t ctx, const char *opts)
>>> } else if (strcmp(token, "nodiscard") == 0) {
>>> ctx->options &= ~E2F_OPT_DISCARD;
>>> continue;
>>> + } else if (strcmp(token, "optimize_extents") == 0) {
>>> + ctx->options &= ~E2F_OPT_NOOPT_EXTENTS;
>>> + continue;
>> unrelated, but makes sense :).
> Yes, this was a cleanup I was doing as I weant.
>
>>> + if (!retval) {
>>> + memset(icount->fullmap, 0,
>>> + *sizeof(__u32)* * fs->super->s_inodes_count); <-------
>> this of course should be sizeof(__u16), or sizeof(*count->fullmap) which
>> will track the type pointed to. It might be better practice to store
>> "sizeof(*icount->fullmap) * fs->super->s_inodes_count" to a variable and
>> just re-use it.
> Good point. Fixed.
>
>>> goto errout;
>>> +
>> Extra line can be removed.
>>> if (flags & EXT2_ICOUNT_OPT_INCREMENT) {
> Removed.
>
> Thanks for taking a look at the patch!
>
> - Ted