From: Frank Rowand <[email protected]>
Create a cache of the nodes that contain a phandle property. Use this
cache to find the node for a given phandle value instead of scanning
the devicetree to find the node. If the phandle value is not found
in the cache, of_find_node_by_phandle() will fall back to the tree
scan algorithm.
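For illustration, a typical caller path that benefits (the property name
here is only an example):

        struct device_node *provider;

        /* of_parse_phandle() resolves the phandle value through
         * of_find_node_by_phandle(); without the cache each such
         * resolution scans the entire devicetree.
         */
        provider = of_parse_phandle(dev->of_node, "clocks", 0);
        of_node_put(provider);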
The cache is initialized in of_core_init().
The cache is freed via a late_initcall_sync() if modules are not
enabled.
Signed-off-by: Frank Rowand <[email protected]>
---
Changes since v1:
- change short description from
of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
- rebase on v4.16-rc1
- reorder new functions in base.c to avoid forward declaration
- add locking around kfree(phandle_cache) for memory ordering
- add explicit check for non-null of phandle_cache in
of_find_node_by_phandle(). There is already a check for !handle,
which prevents accessing a null phandle_cache, but that dependency
is not obvious, so this check makes it more apparent.
- do not free phandle_cache if modules are enabled, so that
cached phandles will be available when modules are loaded
drivers/of/base.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++---
drivers/of/of_private.h | 5 +++
drivers/of/resolver.c | 21 -----------
3 files changed, 94 insertions(+), 25 deletions(-)
diff --git a/drivers/of/base.c b/drivers/of/base.c
index ad28de96e13f..b3597c250837 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -91,10 +91,88 @@ int __weak of_node_to_nid(struct device_node *np)
}
#endif
+static struct device_node **phandle_cache;
+static u32 max_phandle_cache;
+
+phandle live_tree_max_phandle(void)
+{
+ struct device_node *node;
+ phandle max_phandle;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&devtree_lock, flags);
+
+ max_phandle = 0;
+ for_each_of_allnodes(node) {
+ if (node->phandle != OF_PHANDLE_ILLEGAL &&
+ node->phandle > max_phandle)
+ max_phandle = node->phandle;
+ }
+
+ raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+ return max_phandle;
+}
+
+static void of_populate_phandle_cache(void)
+{
+ unsigned long flags;
+ phandle max_phandle;
+ u32 nodes = 0;
+ struct device_node *np;
+
+ if (phandle_cache)
+ return;
+
+ max_phandle = live_tree_max_phandle();
+
+ raw_spin_lock_irqsave(&devtree_lock, flags);
+
+ for_each_of_allnodes(np)
+ nodes++;
+
+ /* sanity cap for malformed tree */
+ if (max_phandle > nodes)
+ max_phandle = nodes;
+
+ phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
+ GFP_ATOMIC);
+
+ for_each_of_allnodes(np)
+ if (np->phandle != OF_PHANDLE_ILLEGAL &&
+ np->phandle <= max_phandle &&
+ np->phandle)
+ phandle_cache[np->phandle] = np;
+
+ max_phandle_cache = max_phandle;
+
+ raw_spin_unlock_irqrestore(&devtree_lock, flags);
+}
+
+#ifndef CONFIG_MODULES
+static int __init of_free_phandle_cache(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&devtree_lock, flags);
+
+ max_phandle_cache = 0;
+ kfree(phandle_cache);
+ phandle_cache = NULL;
+
+ raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+ return 0;
+}
+late_initcall_sync(of_free_phandle_cache);
+#endif
+
void __init of_core_init(void)
{
struct device_node *np;
+ of_populate_phandle_cache();
+
/* Create the kset, and register existing nodes */
mutex_lock(&of_mutex);
of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
@@ -1021,16 +1099,23 @@ int of_modalias_node(struct device_node *node, char *modalias, int len)
*/
struct device_node *of_find_node_by_phandle(phandle handle)
{
- struct device_node *np;
+ struct device_node *np = NULL;
unsigned long flags;
if (!handle)
return NULL;
raw_spin_lock_irqsave(&devtree_lock, flags);
- for_each_of_allnodes(np)
- if (np->phandle == handle)
- break;
+
+ if (phandle_cache && handle <= max_phandle_cache)
+ np = phandle_cache[handle];
+
+ if (!np || np->phandle != handle) {
+ for_each_of_allnodes(np)
+ if (np->phandle == handle)
+ break;
+ }
+
of_node_get(np);
raw_spin_unlock_irqrestore(&devtree_lock, flags);
return np;
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index 0c609e7d0334..18e03c9d4b55 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -131,6 +131,11 @@ extern void __of_update_property_sysfs(struct device_node *np,
extern void __of_sysfs_remove_bin_file(struct device_node *np,
struct property *prop);
+/* illegal phandle value (set when unresolved) */
+#define OF_PHANDLE_ILLEGAL 0xdeadbeef
+
+extern phandle live_tree_max_phandle(void);
+
/* iterators for transactions, used for overlays */
/* forward iterator */
#define for_each_transaction_entry(_oft, _te) \
diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
index 740d19bde601..a7580c24737a 100644
--- a/drivers/of/resolver.c
+++ b/drivers/of/resolver.c
@@ -19,27 +19,6 @@
#include "of_private.h"
-/* illegal phandle value (set when unresolved) */
-#define OF_PHANDLE_ILLEGAL 0xdeadbeef
-
-static phandle live_tree_max_phandle(void)
-{
- struct device_node *node;
- phandle phandle;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&devtree_lock, flags);
- phandle = 0;
- for_each_of_allnodes(node) {
- if (node->phandle != OF_PHANDLE_ILLEGAL &&
- node->phandle > phandle)
- phandle = node->phandle;
- }
- raw_spin_unlock_irqrestore(&devtree_lock, flags);
-
- return phandle;
-}
-
static void adjust_overlay_phandles(struct device_node *overlay,
int phandle_delta)
{
--
Frank Rowand <[email protected]>
Hi Rob,
On 02/11/18 22:27, [email protected] wrote:
> From: Frank Rowand <[email protected]>
>
> Create a cache of the nodes that contain a phandle property. Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node. If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
>
> The cache is initialized in of_core_init().
>
> The cache is freed via a late_initcall_sync() if modules are not
> enabled.
>
> Signed-off-by: Frank Rowand <[email protected]>
> ---
>
> Changes since v1:
> - change short description from
> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
> - rebase on v4.16-rc1
> - reorder new functions in base.c to avoid forward declaration
> - add locking around kfree(phandle_cache) for memory ordering
> - add explicit check for non-null of phandle_cache in
> of_find_node_by_phandle(). There is already a check for !handle,
> which prevents accessing a null phandle_cache, but that dependency
> is not obvious, so this check makes it more apparent.
> - do not free phandle_cache if modules are enabled, so that
> cached phandles will be available when modules are loaded
< snip >
In a previous thread, you said:
> We should be able to do this earlier. We already walk the tree twice
> in unflattening. We can get the max phandle (or number of phandles
> IMO) on the first pass, allocate with the early allocator and then
> populate the cache in the 2nd pass. AIUI, you can alloc with memblock
> and then free with kfree as the memblock allocations get transferred
> to the slab.
And I replied:
Thanks for pointing out that kfree() can be used for memory alloced
with memblock. I'll change to populate the cache earlier.
I started to make this change when I moved forward to v4.16-rc1. There
are two obvious ways to do this.
The first is to create and populate
the phandle cache during the two phases of unflattening. To do this
would require counting the number of nodes to be unflattened and finding
the largest phandle value during phase 1, then allocating the cache before
phase 2, and populating the cache during phase 2. This is not difficult,
but it is intrusive.
The second obvious way is to modify of_populate_phandle_cache() to
use the early allocator when called early in boot, and to call it
immediately after unflattening. If of_populate_phandle_cache() is
called later (e.g. for applying overlays) it would then have to switch
to kzalloc(). The larger overhead of the very few calls of
of_find_node_by_phandle() made before the slightly later phandle cache
population did not seem to justify adding that extra complexity to
of_populate_phandle_cache().
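For reference, the allocation switch in that second approach would look
roughly like this untested sketch (the helper name is hypothetical):

        static void __init of_alloc_phandle_cache_early(void)
        {
                phandle max = live_tree_max_phandle();
                size_t size = (max + 1) * sizeof(*phandle_cache);

                /* early allocator before slab is up, kzalloc() after */
                if (slab_is_available())
                        phandle_cache = kzalloc(size, GFP_ATOMIC);
                else
                        phandle_cache = memblock_virt_alloc(size, 0);
        }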
-Frank
On 02/11/18 22:27, [email protected] wrote:
> From: Frank Rowand <[email protected]>
>
> Create a cache of the nodes that contain a phandle property. Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node. If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
>
> The cache is initialized in of_core_init().
>
> The cache is freed via a late_initcall_sync() if modules are not
> enabled.
>
> Signed-off-by: Frank Rowand <[email protected]>
> ---
>
> Changes since v1:
> - change short description from
> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
> - rebase on v4.16-rc1
> - reorder new functions in base.c to avoid forward declaration
> - add locking around kfree(phandle_cache) for memory ordering
> - add explicit check for non-null of phandle_cache in
> of_find_node_by_phandle(). There is already a check for !handle,
> which prevents accessing a null phandle_cache, but that dependency
> is not obvious, so this check makes it more apparent.
> - do not free phandle_cache if modules are enabled, so that
> cached phandles will be available when modules are loaded
>
> drivers/of/base.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++---
> drivers/of/of_private.h | 5 +++
> drivers/of/resolver.c | 21 -----------
> 3 files changed, 94 insertions(+), 25 deletions(-)
>
< snip >
Hi Chintan,
Just as in the previous version, this part of the patch will not apply
on older kernel versions, and can just be deleted:
> diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
> index 740d19bde601..a7580c24737a 100644
> --- a/drivers/of/resolver.c
> +++ b/drivers/of/resolver.c
> @@ -19,27 +19,6 @@
>
> #include "of_private.h"
>
> -/* illegal phandle value (set when unresolved) */
> -#define OF_PHANDLE_ILLEGAL 0xdeadbeef
> -
> -static phandle live_tree_max_phandle(void)
> -{
> - struct device_node *node;
> - phandle phandle;
> - unsigned long flags;
> -
> - raw_spin_lock_irqsave(&devtree_lock, flags);
> - phandle = 0;
> - for_each_of_allnodes(node) {
> - if (node->phandle != OF_PHANDLE_ILLEGAL &&
> - node->phandle > phandle)
> - phandle = node->phandle;
> - }
> - raw_spin_unlock_irqrestore(&devtree_lock, flags);
> -
> - return phandle;
> -}
> -
> static void adjust_overlay_phandles(struct device_node *overlay,
> int phandle_delta)
> {
>
On 2018-02-12 07:27, [email protected] wrote:
> From: Frank Rowand <[email protected]>
>
> Create a cache of the nodes that contain a phandle property. Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node. If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
>
> The cache is initialized in of_core_init().
>
> The cache is freed via a late_initcall_sync() if modules are not
> enabled.
Maybe a few words about the memory consumption of this solution versus
the other proposed ones. Other nits below.
> +static void of_populate_phandle_cache(void)
> +{
> + unsigned long flags;
> + phandle max_phandle;
> + u32 nodes = 0;
> + struct device_node *np;
> +
> + if (phandle_cache)
> + return;
What's the point of that check? And shouldn't it be done inside the
spinlock if at all?
> + max_phandle = live_tree_max_phandle();
> +
> + raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> + for_each_of_allnodes(np)
> + nodes++;
Why not save a walk over all nodes and a spin_lock/unlock pair by
combining the node count with the max_phandle computation? But you've
just moved the existing live_tree_max_phandle, so probably better as a
followup patch.
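Something like this (untested; the signature change is only illustrative):

        phandle live_tree_max_phandle(u32 *nodes)
        {
                struct device_node *np;
                unsigned long flags;
                phandle max = 0;
                u32 count = 0;

                /* one walk, one lock pair: count nodes and track max */
                raw_spin_lock_irqsave(&devtree_lock, flags);
                for_each_of_allnodes(np) {
                        count++;
                        if (np->phandle != OF_PHANDLE_ILLEGAL &&
                            np->phandle > max)
                                max = np->phandle;
                }
                raw_spin_unlock_irqrestore(&devtree_lock, flags);

                if (nodes)
                        *nodes = count;

                return max;
        }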
> + /* sanity cap for malformed tree */
> + if (max_phandle > nodes)
> + max_phandle = nodes;
> +
> + phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
> + GFP_ATOMIC);
Maybe kcalloc. Sure, you've capped max_phandle so there's no real risk
of overflow.
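I.e., roughly:

        phandle_cache = kcalloc(max_phandle + 1, sizeof(*phandle_cache),
                                GFP_ATOMIC);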
> + for_each_of_allnodes(np)
> + if (np->phandle != OF_PHANDLE_ILLEGAL &&
> + np->phandle <= max_phandle &&
> + np->phandle)
I'd reverse the order of these conditions so that for all the nodes with
no phandle we only do the np->phandle check. Also, extra whitespace
before &&.
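I.e. something like:

        for_each_of_allnodes(np)
                if (np->phandle &&
                    np->phandle != OF_PHANDLE_ILLEGAL &&
                    np->phandle <= max_phandle)
                        phandle_cache[np->phandle] = np;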
> + phandle_cache[np->phandle] = np;
> +
> + max_phandle_cache = max_phandle;
> +
> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
> +}
> +
Rasmus
On 2/12/2018 11:57 AM, [email protected] wrote:
> From: Frank Rowand <[email protected]>
>
> Create a cache of the nodes that contain a phandle property. Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node. If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
>
> The cache is initialized in of_core_init().
>
> The cache is freed via a late_initcall_sync() if modules are not
> enabled.
>
> Signed-off-by: Frank Rowand <[email protected]>
> ---
>
> Changes since v1:
> - change short description from
> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
> - rebase on v4.16-rc1
> - reorder new functions in base.c to avoid forward declaration
> - add locking around kfree(phandle_cache) for memory ordering
> - add explicit check for non-null of phandle_cache in
> of_find_node_by_phandle(). There is already a check for !handle,
> which prevents accessing a null phandle_cache, but that dependency
> is not obvious, so this check makes it more apparent.
> - do not free phandle_cache if modules are enabled, so that
> cached phandles will be available when modules are loaded
Most of the gain from this cache will already be achieved for kernel
init calls. As the discussion started with boot_time_gain vs
mem_overhead, we seem to be losing here, as the mem_overhead becomes
permanent in exchange for small savings. We can live without this, IMO.
>
> drivers/of/base.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++---
> drivers/of/of_private.h | 5 +++
> drivers/of/resolver.c | 21 -----------
> 3 files changed, 94 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index ad28de96e13f..b3597c250837 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -91,10 +91,88 @@ int __weak of_node_to_nid(struct device_node *np)
> }
> #endif
>
> +static struct device_node **phandle_cache;
> +static u32 max_phandle_cache;
> +
> +phandle live_tree_max_phandle(void)
> +{
> + struct device_node *node;
> + phandle max_phandle;
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> + max_phandle = 0;
> + for_each_of_allnodes(node) {
> + if (node->phandle != OF_PHANDLE_ILLEGAL &&
> + node->phandle > max_phandle)
> + max_phandle = node->phandle;
> + }
This closing curly bracket needs proper indentation.
> +
> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
> +
> + return max_phandle;
> +}
> +
> +static void of_populate_phandle_cache(void)
> +{
> + unsigned long flags;
> + phandle max_phandle;
> + u32 nodes = 0;
> + struct device_node *np;
> +
> + if (phandle_cache)
> + return;
> +
> + max_phandle = live_tree_max_phandle();
> +
> + raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> + for_each_of_allnodes(np)
This is a 1ms call.
> + nodes++;
We can calculate the total node count in live_tree_max_phandle() to save
that 1ms.
> +
> + /* sanity cap for malformed tree */
> + if (max_phandle > nodes)
> + max_phandle = nodes;
Or rather, we can move this entire thing into live_tree_max_phandle().
> +
> + phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
> + GFP_ATOMIC);
> +
> + for_each_of_allnodes(np)
> + if (np->phandle != OF_PHANDLE_ILLEGAL &&
> + np->phandle <= max_phandle &&
> + np->phandle)
> + phandle_cache[np->phandle] = np;
> +
> + max_phandle_cache = max_phandle;
> +
> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
> +}
> +
> +#ifndef CONFIG_MODULES
> +static int __init of_free_phandle_cache(void)
> +{
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> + max_phandle_cache = 0;
> + kfree(phandle_cache);
> + phandle_cache = NULL;
> +
> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
> +
> + return 0;
> +}
> +late_initcall_sync(of_free_phandle_cache);
> +#endif
> +
> void __init of_core_init(void)
> {
> struct device_node *np;
>
> + of_populate_phandle_cache();
> +
> /* Create the kset, and register existing nodes */
> mutex_lock(&of_mutex);
> of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
> @@ -1021,16 +1099,23 @@ int of_modalias_node(struct device_node *node, char *modalias, int len)
> */
> struct device_node *of_find_node_by_phandle(phandle handle)
> {
> - struct device_node *np;
> + struct device_node *np = NULL;
> unsigned long flags;
>
> if (!handle)
> return NULL;
>
> raw_spin_lock_irqsave(&devtree_lock, flags);
> - for_each_of_allnodes(np)
> - if (np->phandle == handle)
> - break;
> +
> + if (phandle_cache && handle <= max_phandle_cache)
> + np = phandle_cache[handle];
> +
> + if (!np || np->phandle != handle) {
> + for_each_of_allnodes(np)
> + if (np->phandle == handle)
> + break;
> + }
> +
> of_node_get(np);
> raw_spin_unlock_irqrestore(&devtree_lock, flags);
> return np;
> diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
> index 0c609e7d0334..18e03c9d4b55 100644
> --- a/drivers/of/of_private.h
> +++ b/drivers/of/of_private.h
> @@ -131,6 +131,11 @@ extern void __of_update_property_sysfs(struct device_node *np,
> extern void __of_sysfs_remove_bin_file(struct device_node *np,
> struct property *prop);
>
> +/* illegal phandle value (set when unresolved) */
> +#define OF_PHANDLE_ILLEGAL 0xdeadbeef
> +
> +extern phandle live_tree_max_phandle(void);
Why do we need this to be a public API? It was not previously, and there
is no good use-case for it now.
> +
> /* iterators for transactions, used for overlays */
> /* forward iterator */
> #define for_each_transaction_entry(_oft, _te) \
> diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
> index 740d19bde601..a7580c24737a 100644
> --- a/drivers/of/resolver.c
> +++ b/drivers/of/resolver.c
> @@ -19,27 +19,6 @@
>
> #include "of_private.h"
>
> -/* illegal phandle value (set when unresolved) */
> -#define OF_PHANDLE_ILLEGAL 0xdeadbeef
> -
> -static phandle live_tree_max_phandle(void)
> -{
> - struct device_node *node;
> - phandle phandle;
> - unsigned long flags;
> -
> - raw_spin_lock_irqsave(&devtree_lock, flags);
> - phandle = 0;
> - for_each_of_allnodes(node) {
> - if (node->phandle != OF_PHANDLE_ILLEGAL &&
> - node->phandle > phandle)
> - phandle = node->phandle;
> - }
> - raw_spin_unlock_irqrestore(&devtree_lock, flags);
> -
> - return phandle;
> -}
> -
> static void adjust_overlay_phandles(struct device_node *overlay,
> int phandle_delta)
> {
>
Chintan
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc. is a member of the Code Aurora Forum, a Linux Foundation
Collaborative Project
On 02/12/18 00:58, Rasmus Villemoes wrote:
> On 2018-02-12 07:27, [email protected] wrote:
>> From: Frank Rowand <[email protected]>
>>
>> Create a cache of the nodes that contain a phandle property. Use this
>> cache to find the node for a given phandle value instead of scanning
>> the devicetree to find the node. If the phandle value is not found
>> in the cache, of_find_node_by_phandle() will fall back to the tree
>> scan algorithm.
>>
>> The cache is initialized in of_core_init().
>>
>> The cache is freed via a late_initcall_sync() if modules are not
>> enabled.
>
> Maybe a few words about the memory consumption of this solution versus
> the other proposed ones.
The patch comment is about this patch, not the other proposals.
Please do not take that as a snippy response. There were several
emails in the previous thread that discussed memory. In that
thread I responded as to how I would address the concerns. If
anyone wants to raise concerns about memory usage as a result of
this version of the patch, they should do so in this current thread.
> Other nits below.
>
>> +static void of_populate_phandle_cache(void)
>> +{
>> + unsigned long flags;
>> + phandle max_phandle;
>> + u32 nodes = 0;
>> + struct device_node *np;
>> +
>> + if (phandle_cache)
>> + return;
>
> What's the point of that check?
A sanity check to make sure we do not leak a previously allocated
cache. I'll change it to free the cache if it already exists.
There is only one instance of of_populate_phandle_cache() being called,
so this is a theoretical issue. I intend to add another caller
in the devicetree overlay code in the future, but do not want
to do that now, to avoid a conflict with the overlay patch
series that has been in parallel development, and for which
I submitted v2 shortly after this set of patches.
> And shouldn't it be done inside the
> spinlock if at all?
Not an issue yet, but I'll keep my eye on the possibility of races
when I add a call to of_populate_phandle_cache() from the overlay code.
>> + max_phandle = live_tree_max_phandle();
>> +
>> + raw_spin_lock_irqsave(&devtree_lock, flags);
>> +
>> + for_each_of_allnodes(np)
>> + nodes++;
>
> Why not save a walk over all nodes and a spin_lock/unlock pair by
> combining the node count with the max_phandle computation? But you've
> just moved the existing live_tree_max_phandle, so probably better as a
> followup patch.
I'll consider adding the node counting into live_tree_max_phandle() later.
The other user of live_tree_max_phandle() is being modified in my overlay
patch series (see mention above). I don't want to create a conflict between
the two series.
>> + /* sanity cap for malformed tree */
>> + if (max_phandle > nodes)
>> + max_phandle = nodes;
>> +
>> + phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
>> + GFP_ATOMIC);
>
> Maybe kcalloc. Sure, you've capped max_phandle so there's no real risk
> of overflow.
OK, will do.
>> + for_each_of_allnodes(np)
>> + if (np->phandle != OF_PHANDLE_ILLEGAL &&
>> + np->phandle <= max_phandle &&
>> + np->phandle)
>
> I'd reverse the order of these conditions so that for all the nodes with
> no phandle we only do the np->phandle check. Also, extra whitespace
> before &&.
Will do.
>> + phandle_cache[np->phandle] = np;
>> +
>> + max_phandle_cache = max_phandle;
>> +
>> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
>> +}
>> +
>
> Rasmus
>
On 02/12/18 02:51, Chintan Pandya wrote:
>
>
> On 2/12/2018 11:57 AM, [email protected] wrote:
>> From: Frank Rowand <[email protected]>
>>
>> Create a cache of the nodes that contain a phandle property. Use this
>> cache to find the node for a given phandle value instead of scanning
>> the devicetree to find the node. If the phandle value is not found
>> in the cache, of_find_node_by_phandle() will fall back to the tree
>> scan algorithm.
>>
>> The cache is initialized in of_core_init().
>>
>> The cache is freed via a late_initcall_sync() if modules are not
>> enabled.
>>
>> Signed-off-by: Frank Rowand <[email protected]>
>> ---
>>
>> Changes since v1:
>> - change short description from
>> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
>> - rebase on v4.16-rc1
>> - reorder new functions in base.c to avoid forward declaration
>> - add locking around kfree(phandle_cache) for memory ordering
>> - add explicit check for non-null of phandle_cache in
>> of_find_node_by_phandle(). There is already a check for !handle,
>> which prevents accessing a null phandle_cache, but that dependency
>> is not obvious, so this check makes it more apparent.
>> - do not free phandle_cache if modules are enabled, so that
>> cached phandles will be available when modules are loaded
>
> Most of the gain from this cache will already be achieved for kernel
> init calls. As the discussion started with boot_time_gain vs
> mem_overhead, we seem to be losing here, as the mem_overhead becomes
> permanent in exchange for small savings. We can live without this, IMO.
Please line wrap your emails so I don't have to.
I was addressing the other maintainer's concern. This was an attempt
to free the memory if modules are not a factor, and otherwise to provide
the reduced overhead for drivers that are loaded as modules.
I won't bike shed over this.
The memory cost is quite small relative to the boot time gain.
For systems with little boot time speedup, the memory used is especially
small. For systems with larger boot time speedup, the memory used is still
not large, though it is "forever" if modules are configured on. If modules
are configured on, then the system designer has already decided that they
are willing to use more memory to gain the module functionality.
If modules are not configured on, then the cache is freed as late as
possible using the init functions. There are some calls to
of_find_node_by_phandle() after this point (on the system that I tested
on), but those are in asynchronous threads that are not preventing
the boot from completing (and thus not an issue for those who are
tuning for minimal boot time).
So the question for when modules are configured on comes down to: which
is more important, the reduced phandle lookup overhead provided by the
cache or the additional memory used by the cache? My preference is to
just free the memory instead of providing the phandle access overhead
reduction for a theoretical case of lots of modular drivers (though the
patch implements the opposite). If a system with lots of modular drivers
demonstrates a phandle lookup overhead issue then the implementation can
be revisited.
Whichever way Rob wants to answer this question is fine with me; I
consider it a theoretical non-issue at the moment.
>>
>> drivers/of/base.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++---
>> drivers/of/of_private.h | 5 +++
>> drivers/of/resolver.c | 21 -----------
>> 3 files changed, 94 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/of/base.c b/drivers/of/base.c
>> index ad28de96e13f..b3597c250837 100644
>> --- a/drivers/of/base.c
>> +++ b/drivers/of/base.c
>> @@ -91,10 +91,88 @@ int __weak of_node_to_nid(struct device_node *np)
>> }
>> #endif
>> +static struct device_node **phandle_cache;
>> +static u32 max_phandle_cache;
>> +
>> +phandle live_tree_max_phandle(void)
>> +{
>> + struct device_node *node;
>> + phandle max_phandle;
>> + unsigned long flags;
>> +
>> + raw_spin_lock_irqsave(&devtree_lock, flags);
>> +
>> + max_phandle = 0;
>> + for_each_of_allnodes(node) {
>> + if (node->phandle != OF_PHANDLE_ILLEGAL &&
>> + node->phandle > max_phandle)
>> + max_phandle = node->phandle;
>> + }
>
> This closing curly bracket needs proper indentation.
Thanks, will fix.
>> +
>> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
>> +
>> + return max_phandle;
>> +}
>> +
>> +static void of_populate_phandle_cache(void)
>> +{
>> + unsigned long flags;
>> + phandle max_phandle;
>> + u32 nodes = 0;
>> + struct device_node *np;
>> +
>> + if (phandle_cache)
>> + return;
>> +
>> + max_phandle = live_tree_max_phandle();
>> +
>> + raw_spin_lock_irqsave(&devtree_lock, flags);
>> +
>> + for_each_of_allnodes(np)
>
> This is a 1ms call.
>
>> + nodes++;
>
> We can calculate the total node count in live_tree_max_phandle() to save
> that 1ms.
>
>> +
>> + /* sanity cap for malformed tree */
>> + if (max_phandle > nodes)
>> + max_phandle = nodes;
>
> Or rather, we can move this entire thing into live_tree_max_phandle().
See my answer to Rasmus' review comments for why I'll consider this
in the future, but not for this patch series.
>> +
>> + phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
>> + GFP_ATOMIC);
>> +
>> + for_each_of_allnodes(np)
>> + if (np->phandle != OF_PHANDLE_ILLEGAL &&
>> + np->phandle <= max_phandle &&
>> + np->phandle)
>> + phandle_cache[np->phandle] = np;
>> +
>> + max_phandle_cache = max_phandle;
>> +
>> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
>> +}
>> +
>> +#ifndef CONFIG_MODULES
>> +static int __init of_free_phandle_cache(void)
>> +{
>> + unsigned long flags;
>> +
>> + raw_spin_lock_irqsave(&devtree_lock, flags);
>> +
>> + max_phandle_cache = 0;
>> + kfree(phandle_cache);
>> + phandle_cache = NULL;
>> +
>> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
>> +
>> + return 0;
>> +}
>> +late_initcall_sync(of_free_phandle_cache);
>> +#endif
>> +
>> void __init of_core_init(void)
>> {
>> struct device_node *np;
>> + of_populate_phandle_cache();
>> +
>> /* Create the kset, and register existing nodes */
>> mutex_lock(&of_mutex);
>> of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
>> @@ -1021,16 +1099,23 @@ int of_modalias_node(struct device_node *node, char *modalias, int len)
>> */
>> struct device_node *of_find_node_by_phandle(phandle handle)
>> {
>> - struct device_node *np;
>> + struct device_node *np = NULL;
>> unsigned long flags;
>> if (!handle)
>> return NULL;
>> raw_spin_lock_irqsave(&devtree_lock, flags);
>> - for_each_of_allnodes(np)
>> - if (np->phandle == handle)
>> - break;
>> +
>> + if (phandle_cache && handle <= max_phandle_cache)
>> + np = phandle_cache[handle];
>> +
>> + if (!np || np->phandle != handle) {
>> + for_each_of_allnodes(np)
>> + if (np->phandle == handle)
>> + break;
>> + }
>> +
>> of_node_get(np);
>> raw_spin_unlock_irqrestore(&devtree_lock, flags);
>> return np;
>> diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
>> index 0c609e7d0334..18e03c9d4b55 100644
>> --- a/drivers/of/of_private.h
>> +++ b/drivers/of/of_private.h
>> @@ -131,6 +131,11 @@ extern void __of_update_property_sysfs(struct device_node *np,
>> extern void __of_sysfs_remove_bin_file(struct device_node *np,
>> struct property *prop);
>> +/* illegal phandle value (set when unresolved) */
>> +#define OF_PHANDLE_ILLEGAL 0xdeadbeef
>> +
>> +extern phandle live_tree_max_phandle(void);
>
> Why do we need this to be a public API? It was not previously, and
> there is no good use-case for it now.
It is not public. It is private ("of_private.h") to the devicetree
code. It is used by the overlay code (look at top of tree, not
an old kernel).
>> +
>> /* iterators for transactions, used for overlays */
>> /* forward iterator */
>> #define for_each_transaction_entry(_oft, _te) \
>> diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
>> index 740d19bde601..a7580c24737a 100644
>> --- a/drivers/of/resolver.c
>> +++ b/drivers/of/resolver.c
>> @@ -19,27 +19,6 @@
>> #include "of_private.h"
>> -/* illegal phandle value (set when unresolved) */
>> -#define OF_PHANDLE_ILLEGAL 0xdeadbeef
>> -
>> -static phandle live_tree_max_phandle(void)
>> -{
>> - struct device_node *node;
>> - phandle phandle;
>> - unsigned long flags;
>> -
>> - raw_spin_lock_irqsave(&devtree_lock, flags);
>> - phandle = 0;
>> - for_each_of_allnodes(node) {
>> - if (node->phandle != OF_PHANDLE_ILLEGAL &&
>> - node->phandle > phandle)
>> - phandle = node->phandle;
>> - }
>> - raw_spin_unlock_irqrestore(&devtree_lock, flags);
>> -
>> - return phandle;
>> -}
>> -
>> static void adjust_overlay_phandles(struct device_node *overlay,
>> int phandle_delta)
>> {
>>
>
> Chintan
On Mon, Feb 12, 2018 at 12:27 AM, <[email protected]> wrote:
> From: Frank Rowand <[email protected]>
>
> Create a cache of the nodes that contain a phandle property. Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node. If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
Please add some data here as to what speedup we see.
> The cache is initialized in of_core_init().
>
> The cache is freed via a late_initcall_sync() if modules are not
> enabled.
>
> Signed-off-by: Frank Rowand <[email protected]>
> ---
>
> Changes since v1:
> - change short description from
> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
> - rebase on v4.16-rc1
> - reorder new functions in base.c to avoid forward declaration
> - add locking around kfree(phandle_cache) for memory ordering
> - add explicit check for non-null of phandle_cache in
> of_find_node_by_phandle(). There is already a check for !handle,
> which prevents accessing a null phandle_cache, but that dependency
> is not obvious, so this check makes it more apparent.
> - do not free phandle_cache if modules are enabled, so that
> cached phandles will be available when modules are loaded
>
> drivers/of/base.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++---
> drivers/of/of_private.h | 5 +++
> drivers/of/resolver.c | 21 -----------
> 3 files changed, 94 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index ad28de96e13f..b3597c250837 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -91,10 +91,88 @@ int __weak of_node_to_nid(struct device_node *np)
> }
> #endif
>
> +static struct device_node **phandle_cache;
> +static u32 max_phandle_cache;
> +
> +phandle live_tree_max_phandle(void)
> +{
> + struct device_node *node;
> + phandle max_phandle;
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> + max_phandle = 0;
> + for_each_of_allnodes(node) {
> + if (node->phandle != OF_PHANDLE_ILLEGAL &&
> + node->phandle > max_phandle)
> + max_phandle = node->phandle;
> + }
> +
> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
> +
> + return max_phandle;
> +}
> +
> +static void of_populate_phandle_cache(void)
> +{
> + unsigned long flags;
> + phandle max_phandle;
> + u32 nodes = 0;
> + struct device_node *np;
> +
> + if (phandle_cache)
> + return;
> +
> + max_phandle = live_tree_max_phandle();
> +
> + raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> + for_each_of_allnodes(np)
> + nodes++;
> +
> + /* sanity cap for malformed tree */
Or any tree that doesn't match your assumptions.
> + if (max_phandle > nodes)
> + max_phandle = nodes;
I fail to see how setting max_phandle to the number of nodes helps you
here. Either just bail out or mask the phandle values so this works
with any contiguous range.
Please capture somewhere what the assumptions are for the cache to
work (i.e. phandles were allocated contiguously from 0).
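For illustration, an untested sketch of the masking approach (names
illustrative):

        /* cache sized to a power of 2, indexed by (handle & mask) */
        static u32 phandle_cache_mask;

        np = phandle_cache[handle & phandle_cache_mask];
        if (!np || np->phandle != handle) {
                /* fall back to the full tree scan */
        }

Any phandle value then maps into the cache, and the np->phandle
comparison rejects collisions.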
Rob
Hi Rob,
On 02/11/18 22:56, Frank Rowand wrote:
> Hi Rob,
>
> On 02/11/18 22:27, [email protected] wrote:
>> From: Frank Rowand <[email protected]>
>>
>> Create a cache of the nodes that contain a phandle property. Use this
>> cache to find the node for a given phandle value instead of scanning
>> the devicetree to find the node. If the phandle value is not found
>> in the cache, of_find_node_by_phandle() will fall back to the tree
>> scan algorithm.
>>
>> The cache is initialized in of_core_init().
>>
>> The cache is freed via a late_initcall_sync() if modules are not
>> enabled.
>>
>> Signed-off-by: Frank Rowand <[email protected]>
>> ---
>>
>> Changes since v1:
>> - change short description from
>> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
>> - rebase on v4.16-rc1
>> - reorder new functions in base.c to avoid forward declaration
>> - add locking around kfree(phandle_cache) for memory ordering
>> - add explicit check for non-null of phandle_cache in
>> of_find_node_by_phandle(). There is already a check for !handle,
>> which prevents accessing a null phandle_cache, but that dependency
>> is not obvious, so this check makes it more apparent.
>> - do not free phandle_cache if modules are enabled, so that
>> cached phandles will be available when modules are loaded
>
> < snip >
>
>
>
> In a previous thread, you said:
>
>> We should be able to do this earlier. We already walk the tree twice
>> in unflattening. We can get the max phandle (or number of phandles
>> IMO) on the first pass, allocate with the early allocator and then
>> populate the cache in the 2nd pass. AIUI, you can alloc with memblock
>> and then free with kfree as the memblock allocations get transferred
>> to the slab.
>
> And I replied:
>
> Thanks for pointing out that kfree() can be used for memory alloced
> with memblock. I'll change to populate the cache earlier.
>
>
> I started to make this change when I moved forward to v4.16-rc1. There
> are two obvious ways to do this.
< snip >
And I did not finish the explanation, sorry. I meant to finish with saying
that, given the drawbacks, I ended up not making the change for v2.
While modifying the patch to respond to the v2 comments, I decided to go
ahead and try using memblock to alloc memory earlier. The result I got
was that when I tried to kfree() the memory at late_initcall_sync I got
a panic. The alloc function I used is memblock_virt_alloc(). You mention
"slab" specifically. Maybe the problem is that my system is using "slub"
instead. Dunno...
-Frank
On 02/13/18 06:49, Rob Herring wrote:
> On Mon, Feb 12, 2018 at 12:27 AM, <[email protected]> wrote:
>> From: Frank Rowand <[email protected]>
>>
>> Create a cache of the nodes that contain a phandle property. Use this
>> cache to find the node for a given phandle value instead of scanning
>> the devicetree to find the node. If the phandle value is not found
>> in the cache, of_find_node_by_phandle() will fall back to the tree
>> scan algorithm.
>
> Please add some data here as to what speedup we see.
Will do.
>> The cache is initialized in of_core_init().
>>
>> The cache is freed via a late_initcall_sync() if modules are not
>> enabled.
>>
>> Signed-off-by: Frank Rowand <[email protected]>
>> ---
>>
>> Changes since v1:
>> - change short description from
>> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
>> - rebase on v4.16-rc1
>> - reorder new functions in base.c to avoid forward declaration
>> - add locking around kfree(phandle_cache) for memory ordering
>> - add explicit check for non-null of phandle_cache in
>> of_find_node_by_phandle(). There is already a check for !handle,
>> which prevents accessing a null phandle_cache, but that dependency
>> is not obvious, so this check makes it more apparent.
>> - do not free phandle_cache if modules are enabled, so that
>> cached phandles will be available when modules are loaded
>>
>> drivers/of/base.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++---
>> drivers/of/of_private.h | 5 +++
>> drivers/of/resolver.c | 21 -----------
>> 3 files changed, 94 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/of/base.c b/drivers/of/base.c
>> index ad28de96e13f..b3597c250837 100644
>> --- a/drivers/of/base.c
>> +++ b/drivers/of/base.c
>> @@ -91,10 +91,88 @@ int __weak of_node_to_nid(struct device_node *np)
>> }
>> #endif
>>
>> +static struct device_node **phandle_cache;
>> +static u32 max_phandle_cache;
>> +
>> +phandle live_tree_max_phandle(void)
>> +{
>> + struct device_node *node;
>> + phandle max_phandle;
>> + unsigned long flags;
>> +
>> + raw_spin_lock_irqsave(&devtree_lock, flags);
>> +
>> + max_phandle = 0;
>> + for_each_of_allnodes(node) {
>> + if (node->phandle != OF_PHANDLE_ILLEGAL &&
>> + node->phandle > max_phandle)
>> + max_phandle = node->phandle;
>> + }
>> +
>> + raw_spin_unlock_irqrestore(&devtree_lock, flags);
>> +
>> + return max_phandle;
>> +}
>> +
>> +static void of_populate_phandle_cache(void)
>> +{
>> + unsigned long flags;
>> + phandle max_phandle;
>> + u32 nodes = 0;
>> + struct device_node *np;
>> +
>> + if (phandle_cache)
>> + return;
>> +
>> + max_phandle = live_tree_max_phandle();
>> +
>> + raw_spin_lock_irqsave(&devtree_lock, flags);
>> +
>> + for_each_of_allnodes(np)
>> + nodes++;
>> +
>> + /* sanity cap for malformed tree */
>
> Or any tree that doesn't match your assumptions.
Will change to say so.
>> + if (max_phandle > nodes)
>> + max_phandle = nodes;
>
> I fail to see how setting max_phandle to the number of nodes helps you
> here.
>
> Either just bail out or mask the phandle values so this works
> with any contiguous range.
I'll mask.
> Please capture somewhere what the assumptions are for the cache to
> work (i.e. phandles were allocated contiguously from 0).
Will do.
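Something along these lines (wording illustrative):

        /*
         * The phandle cache assumes that phandle values in the live
         * tree were allocated contiguously, starting from a value
         * near zero (as dtc allocates them by default).  Lookups of
         * values outside the cached range fall back to a full tree
         * scan in of_find_node_by_phandle().
         */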
>
> Rob
>
On Tue, Feb 13, 2018 at 7:00 PM, Frank Rowand <[email protected]> wrote:
> Hi Rob,
>
> On 02/11/18 22:56, Frank Rowand wrote:
>> Hi Rob,
>>
>> On 02/11/18 22:27, [email protected] wrote:
>>> From: Frank Rowand <[email protected]>
>>>
>>> Create a cache of the nodes that contain a phandle property. Use this
>>> cache to find the node for a given phandle value instead of scanning
>>> the devicetree to find the node. If the phandle value is not found
>>> in the cache, of_find_node_by_phandle() will fall back to the tree
>>> scan algorithm.
>>>
>>> The cache is initialized in of_core_init().
>>>
>>> The cache is freed via a late_initcall_sync() if modules are not
>>> enabled.
>>>
>>> Signed-off-by: Frank Rowand <[email protected]>
>>> ---
>>>
>>> Changes since v1:
>>> - change short description from
>>> of: cache phandle nodes to reduce cost of of_find_node_by_phandle()
>>> - rebase on v4.16-rc1
>>> - reorder new functions in base.c to avoid forward declaration
>>> - add locking around kfree(phandle_cache) for memory ordering
>>> - add explicit check for non-null of phandle_cache in
>>> of_find_node_by_phandle(). There is already a check for !handle,
>>> which prevents accessing a null phandle_cache, but that dependency
>>> is not obvious, so this check makes it more apparent.
>>> - do not free phandle_cache if modules are enabled, so that
>>> cached phandles will be available when modules are loaded
>>
>> < snip >
>>
>>
>>
>> In a previous thread, you said:
>>
>>> We should be able to do this earlier. We already walk the tree twice
>>> in unflattening. We can get the max phandle (or number of phandles
>>> IMO) on the first pass, allocate with the early allocator and then
>>> populate the cache in the 2nd pass. AIUI, you can alloc with memblock
>>> and then free with kfree as the memblock allocations get transferred
>>> to the slab.
>>
>> And I replied:
>>
>> Thanks for pointing out that kfree() can be used for memory alloced
>> with memblock. I'll change to populate the cache earlier.
>>
>>
>> I started to make this change when I moved forward to v4.16-rc1. There
>> are two obvious ways to do this.
>
> < snip >
>
> And I did not finish the explanation, sorry. I meant to finish with saying
> that, given the drawbacks, I ended up not making the change for v2.
>
> While modifying the patch to respond to the v2 comments, I decided to go
> ahead and try using memblock to alloc memory earlier. The result I got
> was that when I tried to kfree() the memory at late_initcall_sync I got
> a panic. The alloc function I used is memblock_virt_alloc(). You mention
> "slab" specifically. Maybe the problem is that my system is using "slub"
> instead. Dunno...
Maybe memblock_free() still works? Or there's something else that
needs to be done to transfer them. In any case, I guess doing it later
is fine.
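(Untested, and assumes the allocation size is still known at that point:

        memblock_free(__pa(phandle_cache), cache_size);
)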
And by slab, I mean the allocator, not the implementation (which can
be slab, slub, or slob).
Rob