When using perf to profile large binaries, _bfd_dwarf2_find_nearest_line()
becomes a hotspot, as perf wants to get line number information
(for inline-detection purposes) for each and every sample. In Chromium
in particular (the content_shell binary), this entails going through
475k address ranges, which takes a long time when done repeatedly.
Add a radix-256 trie over the address space to quickly map address to
compilation unit spaces; for content_shell, which is 1.6 GB when some
(but not full) debug information turned is on, we go from 6 ms to
0.006 ms (6 µs) for each lookup from address to compilation unit, a 1000x
speedup.
There is a modest RAM increase of 180 MB in this binary (the existing
linked list over ranges uses about 10 MB, and the entire perf job uses
between 2–3 GB for a medium-size profile); for smaller binaries with few
ranges, there should be hardly any extra RAM usage at all.
---
bfd/dwarf2.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 354 insertions(+), 18 deletions(-)
diff --git a/bfd/dwarf2.c b/bfd/dwarf2.c
index 404f35df62b..61ca68d4e97 100644
--- a/bfd/dwarf2.c
+++ b/bfd/dwarf2.c
@@ -82,6 +82,76 @@ struct adjusted_section
bfd_vma adj_vma;
};
+/* A trie to map quickly from address range to compilation unit.
+
+ This is a fairly standard radix-256 trie, used to quickly locate which
+ compilation unit any given address belongs to. Given that each compilation
+ unit may register hundreds of very small and unaligned ranges (which may
+ potentially overlap, due to inlining and other concerns), and a large
+ program may end up containing hundreds of thousands of such ranges, we cannot
+ scan through them linearly without undue slowdown.
+
+ We use a hybrid trie to avoid memory explosion: There are two types of trie
+ nodes, leaves and interior nodes. (Almost all nodes are leaves, so they
+ take up the bulk of the memory usage.) Leaves contain a simple array of
+ ranges (high/low address) and which compilation unit contains those ranges,
+ and when we get to a leaf, we scan through it linearly. Interior nodes
+ contain pointers to 256 other nodes, keyed by the next byte of the address.
+ So for a 64-bit address like 0x1234567abcd, we would start at the root and go
+ down child[0x00]->child[0x00]->child[0x01]->child[0x23]->child[0x45] etc.,
+ until we hit a leaf. (Nodes are, in general, leaves until they exceed the
+ default allocation of 16 elements, at which point they are converted to
+ interior node if possible.) This gives us near-constant lookup times;
+ the only thing that can be costly is if there are lots of overlapping ranges
+ within a single 256-byte segment of the binary, in which case we have to
+ scan through them all to find the best match.
+
+ For a binary with few ranges, we will in practice only have a single leaf node
+ at the root, containing a simple array. Thus, the scheme is efficient for
+ both small and large binaries.
+ */
+
+/* Experiments have shown 16 to be a memory-efficient default leaf size.
+ The only case where a leaf will hold more memory than this, is at the
+ bottomost level (covering 256 bytes in the binary), where we'll expand
+ the leaf to be able to hold more ranges if needed.
+ */
+#define TRIE_LEAF_SIZE 16
+
+/* All trie_node pointers will really be trie_leaf or trie_interior,
+ but they have this common head. */
+struct trie_node
+{
+ /* If zero, we are an interior node.
+ Otherwise, how many ranges we have room for in this leaf. */
+ int num_room_in_leaf;
+};
+
+struct trie_leaf
+{
+ struct trie_node head;
+ int num_stored_in_leaf;
+ struct {
+ const struct comp_unit *unit;
+ bfd_vma low_pc, high_pc;
+ } ranges[TRIE_LEAF_SIZE];
+};
+
+struct trie_interior
+{
+ struct trie_node head;
+ struct trie_node *children[256];
+};
+
+static struct trie_node *alloc_trie_leaf (bfd *abfd)
+{
+ struct trie_leaf *leaf =
+ (struct trie_leaf *) bfd_zalloc (abfd, sizeof (struct trie_leaf));
+ if (leaf != NULL)
+ leaf->head.num_room_in_leaf = TRIE_LEAF_SIZE;
+ return (struct trie_node *) leaf;
+}
+
struct dwarf2_debug_file
{
/* The actual bfd from which debug info was loaded. Might be
@@ -139,6 +209,9 @@ struct dwarf2_debug_file
/* A list of all previously read comp_units. */
struct comp_unit *all_comp_units;
+ /* A list of all previously read comp_units with no ranges (yet). */
+ struct comp_unit *all_comp_units_without_ranges;
+
/* Last comp unit in list above. */
struct comp_unit *last_comp_unit;
@@ -147,6 +220,9 @@ struct dwarf2_debug_file
/* Hash table to map offsets to decoded abbrevs. */
htab_t abbrev_offsets;
+
+ /* Root of a trie to map addresses to compilation units. */
+ struct trie_node *trie_root;
};
struct dwarf2_debug
@@ -220,6 +296,11 @@ struct comp_unit
/* Chain the previously read compilation units. */
struct comp_unit *next_unit;
+ /* Chain the previously read compilation units that have no ranges yet.
+ We scan these separately when we have a trie over the ranges.
+ Unused if arange.high != 0. */
+ struct comp_unit *next_unit_without_ranges;
+
/* Likewise, chain the compilation unit read after this one.
The comp units are stored in reversed reading order. */
struct comp_unit *prev_unit;
@@ -296,6 +377,10 @@ struct comp_unit
/* TRUE if symbols are cached in hash table for faster lookup by name. */
bool cached;
+
+ /* Used when iterating over trie leaves to know which units we have
+ already seen in this iteration. */
+ bool mark;
};
/* This data structure holds the information of an abbrev. */
@@ -1767,9 +1852,183 @@ concat_filename (struct line_info_table *table, unsigned int file)
return strdup (filename);
}
+/* Number of bits in a bfd_vma. */
+#define VMA_BITS (8 * sizeof (bfd_vma))
+
+/* Check whether [low1, high1) can be combined with [low2, high2),
+ i.e., they touch or overlap. */
+static bool ranges_overlap(bfd_vma low1, bfd_vma high1, bfd_vma low2, bfd_vma high2)
+{
+ if (low1 == low2 || high1 == high2)
+ return true;
+
+ /* Sort so that low1 is below low2. */
+ if (low1 > low2)
+ {
+ bfd_vma tmp;
+
+ tmp = low1;
+ low1 = low2;
+ low2 = tmp;
+
+ tmp = high1;
+ high1 = high2;
+ high2 = tmp;
+ }
+
+ /* We touch iff low2 == high1.
+ We overlap iff low2 is within [low1, high1). */
+ return (low2 <= high1);
+}
+
+/* Insert an address range in the trie mapping addresses to compilation units.
+ Will return the new trie node (usually the same as is being sent in, but
+ in case of a leaf-to-interior conversion, or expansion of a leaf, it may be
+ different), or NULL on failure.
+ */
+static struct trie_node *insert_arange_in_trie(bfd *abfd,
+ struct trie_node *trie,
+ bfd_vma trie_pc,
+ int trie_pc_bits,
+ const struct comp_unit *unit,
+ bfd_vma low_pc,
+ bfd_vma high_pc)
+{
+ bfd_vma clamped_low_pc, clamped_high_pc;
+ int ch, from_ch, to_ch;
+ bool is_full_leaf = false;
+
+ /* See if we can extend any of the existing ranges. This merging
+ isn't perfect (if merging opens up the possibility of merging two existing
+ ranges, we won't find them), but it takes the majority of the cases. */
+ if (trie->num_room_in_leaf > 0)
+ {
+ struct trie_leaf *leaf = (struct trie_leaf *) trie;
+ int i;
+
+ for (i = 0; i < leaf->num_stored_in_leaf; ++i)
+ {
+ if (leaf->ranges[i].unit == unit &&
+ ranges_overlap(low_pc, high_pc,
+ leaf->ranges[i].low_pc, leaf->ranges[i].high_pc))
+ {
+ if (low_pc < leaf->ranges[i].low_pc)
+ leaf->ranges[i].low_pc = low_pc;
+ if (high_pc > leaf->ranges[i].high_pc)
+ leaf->ranges[i].high_pc = high_pc;
+ return trie;
+ }
+ }
+
+ is_full_leaf = leaf->num_stored_in_leaf == trie->num_room_in_leaf;
+ }
+
+ /* If we're a leaf with no more room and we're _not_ at the bottom,
+ convert to an interior node. */
+ if (is_full_leaf && trie_pc_bits < (int)VMA_BITS)
+ {
+ const struct trie_leaf *leaf = (struct trie_leaf *) trie;
+ int i;
+
+ trie = (struct trie_node *) bfd_zalloc (abfd, sizeof (struct trie_interior));
+ if (!trie)
+ return NULL;
+ is_full_leaf = false;
+
+ /* TODO: If we wanted to save a little more memory at the cost of
+ complexity, we could have reused the old leaf node as one of the
+ children of the new interior node, instead of throwing it away. */
+ for (i = 0; i < leaf->num_stored_in_leaf; ++i)
+ {
+ if (!insert_arange_in_trie (abfd, trie, trie_pc, trie_pc_bits,
+ leaf->ranges[i].unit, leaf->ranges[i].low_pc,
+ leaf->ranges[i].high_pc))
+ return NULL;
+ }
+ }
+
+ /* If we're a leaf with no more room and we _are_ at the bottom,
+ we have no choice but to just make it larger. */
+ if (is_full_leaf)
+ {
+ const struct trie_leaf *leaf = (struct trie_leaf *) trie;
+ int new_room_in_leaf = trie->num_room_in_leaf * 2;
+ struct trie_leaf *new_leaf;
+
+ new_leaf = (struct trie_leaf *) bfd_zalloc (abfd,
+ sizeof (struct trie_leaf) + (new_room_in_leaf - TRIE_LEAF_SIZE) * sizeof (leaf->ranges[0]));
+ new_leaf->head.num_room_in_leaf = new_room_in_leaf;
+ new_leaf->num_stored_in_leaf = leaf->num_stored_in_leaf;
+
+ memcpy (new_leaf->ranges,
+ leaf->ranges,
+ leaf->num_stored_in_leaf * sizeof (leaf->ranges[0]));
+ trie = (struct trie_node *) new_leaf;
+ is_full_leaf = false;
+
+ /* Now the insert below will go through. */
+ }
+
+ /* If we're a leaf (now with room), we can just insert at the end. */
+ if (trie->num_room_in_leaf > 0)
+ {
+ struct trie_leaf *leaf = (struct trie_leaf *) trie;
+
+ int i = leaf->num_stored_in_leaf++;
+ leaf->ranges[i].unit = unit;
+ leaf->ranges[i].low_pc = low_pc;
+ leaf->ranges[i].high_pc = high_pc;
+ return trie;
+ }
+
+ /* Now we are definitely an interior node, so recurse into all the relevant buckets. */
+
+ /* Clamp the range to the current trie bucket. */
+ clamped_low_pc = low_pc;
+ clamped_high_pc = high_pc;
+ if (trie_pc_bits > 0)
+ {
+ bfd_vma bucket_high_pc = trie_pc + ((bfd_vma)-1 >> trie_pc_bits); /* Inclusive. */
+ if (clamped_low_pc < trie_pc)
+ clamped_low_pc = trie_pc;
+ if (clamped_high_pc > bucket_high_pc)
+ clamped_high_pc = bucket_high_pc;
+ }
+
+ /* Insert the ranges in all buckets that it spans. */
+ from_ch = (clamped_low_pc >> (VMA_BITS - trie_pc_bits - 8)) & 0xff;
+ to_ch = ((clamped_high_pc - 1) >> (VMA_BITS - trie_pc_bits - 8)) & 0xff;
+ for (ch = from_ch; ch <= to_ch; ++ch)
+ {
+ struct trie_interior *interior = (struct trie_interior *) trie;
+ struct trie_node *child = interior->children[ch];
+
+ if (child == NULL)
+ {
+ child = alloc_trie_leaf (abfd);
+ if (!child)
+ return NULL;
+ }
+ child = insert_arange_in_trie (abfd,
+ child,
+ trie_pc + ((bfd_vma)ch << (VMA_BITS - trie_pc_bits - 8)),
+ trie_pc_bits + 8,
+ unit,
+ low_pc,
+ high_pc);
+ if (!child)
+ return NULL;
+
+ interior->children[ch] = child;
+ }
+
+ return trie;
+}
+
+
static bool
arange_add (const struct comp_unit *unit, struct arange *first_arange,
- bfd_vma low_pc, bfd_vma high_pc)
+ struct trie_node **trie_root, bfd_vma low_pc, bfd_vma high_pc)
{
struct arange *arange;
@@ -1777,6 +2036,19 @@ arange_add (const struct comp_unit *unit, struct arange *first_arange,
if (low_pc == high_pc)
return true;
+ if (trie_root != NULL)
+ {
+ *trie_root = insert_arange_in_trie (unit->file->bfd_ptr,
+ *trie_root,
+ 0,
+ 0,
+ unit,
+ low_pc,
+ high_pc);
+ if (*trie_root == NULL)
+ return false;
+ }
+
/* If the first arange is empty, use it. */
if (first_arange->high == 0)
{
@@ -2411,7 +2683,7 @@ decode_line_info (struct comp_unit *unit)
low_pc = address;
if (address > high_pc)
high_pc = address;
- if (!arange_add (unit, &unit->arange, low_pc, high_pc))
+ if (!arange_add (unit, &unit->arange, &unit->file->trie_root, low_pc, high_pc))
goto line_fail;
break;
case DW_LNE_set_address:
@@ -3134,7 +3406,7 @@ find_abstract_instance (struct comp_unit *unit,
static bool
read_ranges (struct comp_unit *unit, struct arange *arange,
- bfd_uint64_t offset)
+ struct trie_node **trie_root, bfd_uint64_t offset)
{
bfd_byte *ranges_ptr;
bfd_byte *ranges_end;
@@ -3169,7 +3441,7 @@ read_ranges (struct comp_unit *unit, struct arange *arange,
base_address = high_pc;
else
{
- if (!arange_add (unit, arange,
+ if (!arange_add (unit, arange, trie_root,
base_address + low_pc, base_address + high_pc))
return false;
}
@@ -3179,7 +3451,7 @@ read_ranges (struct comp_unit *unit, struct arange *arange,
static bool
read_rnglists (struct comp_unit *unit, struct arange *arange,
- bfd_uint64_t offset)
+ struct trie_node **trie_root, bfd_uint64_t offset)
{
bfd_byte *rngs_ptr;
bfd_byte *rngs_end;
@@ -3253,19 +3525,19 @@ read_rnglists (struct comp_unit *unit, struct arange *arange,
return false;
}
- if (!arange_add (unit, arange, low_pc, high_pc))
+ if (!arange_add (unit, arange, trie_root, low_pc, high_pc))
return false;
}
}
static bool
read_rangelist (struct comp_unit *unit, struct arange *arange,
- bfd_uint64_t offset)
+ struct trie_node **trie_root, bfd_uint64_t offset)
{
if (unit->version <= 4)
- return read_ranges (unit, arange, offset);
+ return read_ranges (unit, arange, trie_root, offset);
else
- return read_rnglists (unit, arange, offset);
+ return read_rnglists (unit, arange, trie_root, offset);
}
static struct funcinfo *
@@ -3617,7 +3889,7 @@ scan_unit_for_symbols (struct comp_unit *unit)
case DW_AT_ranges:
if (is_int_form (&attr)
- && !read_rangelist (unit, &func->arange, attr.u.val))
+ && !read_rangelist (unit, &func->arange, &unit->file->trie_root, attr.u.val))
goto fail;
break;
@@ -3733,7 +4005,7 @@ scan_unit_for_symbols (struct comp_unit *unit)
if (func && high_pc != 0)
{
- if (!arange_add (unit, &func->arange, low_pc, high_pc))
+ if (!arange_add (unit, &func->arange, &unit->file->trie_root, low_pc, high_pc))
goto fail;
}
}
@@ -3931,7 +4203,7 @@ parse_comp_unit (struct dwarf2_debug *stash,
case DW_AT_ranges:
if (is_int_form (&attr)
- && !read_rangelist (unit, &unit->arange, attr.u.val))
+ && !read_rangelist (unit, &unit->arange, &unit->file->trie_root, attr.u.val))
return NULL;
break;
@@ -3973,7 +4245,7 @@ parse_comp_unit (struct dwarf2_debug *stash,
high_pc += low_pc;
if (high_pc != 0)
{
- if (!arange_add (unit, &unit->arange, low_pc, high_pc))
+ if (!arange_add (unit, &unit->arange, &unit->file->trie_root, low_pc, high_pc))
return NULL;
}
@@ -4772,6 +5044,14 @@ _bfd_dwarf2_slurp_debug_info (bfd *abfd, bfd *debug_bfd,
if (!stash->alt.abbrev_offsets)
return false;
+ stash->f.trie_root = alloc_trie_leaf (abfd);
+ if (!stash->f.trie_root)
+ return false;
+
+ stash->alt.trie_root = alloc_trie_leaf (abfd);
+ if (!stash->alt.trie_root)
+ return false;
+
*pinfo = stash;
if (debug_bfd == NULL)
@@ -4943,6 +5223,12 @@ stash_comp_unit (struct dwarf2_debug *stash, struct dwarf2_debug_file *file)
each->next_unit = file->all_comp_units;
file->all_comp_units = each;
+ if (each->arange.high == 0)
+ {
+ each->next_unit_without_ranges = file->all_comp_units_without_ranges;
+ file->all_comp_units_without_ranges = each->next_unit_without_ranges;
+ }
+
file->info_ptr += length;
return each;
}
@@ -5185,17 +5471,67 @@ _bfd_dwarf2_find_nearest_line (bfd *abfd,
}
else
{
- for (each = stash->f.all_comp_units; each; each = each->next_unit)
+ struct trie_node *trie = stash->f.trie_root;
+ int bits = VMA_BITS - 8;
+ struct comp_unit **prev_each;
+
+ /* Traverse interior nodes until we get to a leaf. */
+ while (trie && trie->num_room_in_leaf == 0)
{
- found = ((each->arange.high == 0
- || comp_unit_contains_address (each, addr))
- && comp_unit_find_nearest_line (each, addr,
+ int ch = (addr >> bits) & 0xff;
+ trie = ((struct trie_interior *) trie)->children[ch];
+ bits -= 8;
+ }
+
+ if (trie)
+ {
+ const struct trie_leaf *leaf = (struct trie_leaf *) trie;
+ int i;
+
+ for (i = 0; i < leaf->num_stored_in_leaf; ++i)
+ {
+ struct comp_unit *unit = (struct comp_unit *) leaf->ranges[i].unit;
+ unit->mark = false;
+ }
+
+ for (i = 0; i < leaf->num_stored_in_leaf; ++i)
+ {
+ struct comp_unit *unit = (struct comp_unit *) leaf->ranges[i].unit;
+ if (unit->mark ||
+ addr < leaf->ranges[i].low_pc ||
+ addr >= leaf->ranges[i].high_pc)
+ continue;
+ unit->mark = true;
+
+ found = comp_unit_find_nearest_line (unit, addr,
filename_ptr,
&function,
linenumber_ptr,
- discriminator_ptr));
+ discriminator_ptr);
+ if (found)
+ goto done;
+ }
+ }
+
+ /* Also scan through all compilation units without any ranges,
+ taking them out of the list if they have acquired any since last time. */
+ prev_each = &stash->f.all_comp_units_without_ranges;
+ for (each = *prev_each; each; each = each->next_unit_without_ranges)
+ {
+ if (each->arange.high != 0)
+ {
+ *prev_each = each->next_unit_without_ranges;
+ continue;
+ }
+
+ found = comp_unit_find_nearest_line (each, addr,
+ filename_ptr,
+ &function,
+ linenumber_ptr,
+ discriminator_ptr);
if (found)
goto done;
+ prev_each = &each->next_unit_without_ranges;
}
}
--
2.35.1
On Sat, May 18, 2024 at 06:53:11PM +0200, Steinar H. Gunderson wrote:
> When using perf to profile large binaries, _bfd_dwarf2_find_nearest_line()
> becomes a hotspot, as perf wants to get line number information
> (for inline-detection purposes) for each and every sample. In Chromium
> in particular (the content_shell binary), this entails going through
> 475k address ranges, which takes a long time when done repeatedly.
I accidentally sent out an ancient patch anew; sorry, please ignore.
/* Steinar */
On Sat, May 18, 2024 at 9:54 AM Steinar H. Gunderson <[email protected]> wrote:
>
> On Sat, May 18, 2024 at 06:53:11PM +0200, Steinar H. Gunderson wrote:
> > When using perf to profile large binaries, _bfd_dwarf2_find_nearest_line()
> > becomes a hotspot, as perf wants to get line number information
> > (for inline-detection purposes) for each and every sample. In Chromium
> > in particular (the content_shell binary), this entails going through
> > 475k address ranges, which takes a long time when done repeatedly.
>
> I accidentally sent out an ancient patch anew; sorry, please ignore.
Presumably v1 should be reviewed?
Thanks,
Ian
> /* Steinar */
On Sun, May 19, 2024 at 08:50:37AM -0700, Ian Rogers wrote:
>> I accidentally sent out an ancient patch anew; sorry, please ignore.
> Presumably v1 should be reviewed?
v3 was merged into binutils two years ago. :-)
/* Steinar */