Presently page_is_ram() relies on walk_system_ram_range(), which walks the
kernel iomem resource hierarchy with a dummy callback __is_ram(). Before
calling find_next_iomem_res(), walk_system_ram_range() does some book-keeping
which can be avoided for the page_is_ram() use-case.
Hence this patch proposes to update page_is_ram() to directly call
find_next_iomem_res() with minimal book-keeping needed.
To avoid allocating a 'struct resource', the patch also updates
find_next_iomem_res() to not return -EINVAL in case 'res == NULL'. Instead,
the out parameter 'struct resource *res' is only populated when it's not NULL.
Signed-off-by: Vaibhav Jain <[email protected]>
---
kernel/resource.c | 19 ++++++++-----------
1 file changed, 8 insertions(+), 11 deletions(-)
diff --git a/kernel/resource.c b/kernel/resource.c
index 34eaee179689..ecf6b9a50adc 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -311,7 +311,7 @@ EXPORT_SYMBOL(release_resource);
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
- * -ENODEV. Returns -EINVAL for invalid parameters.
+ * -ENODEV.
*
* @start: start address of the resource searched for
* @end: end address of same resource
@@ -328,9 +328,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
{
struct resource *p;
- if (!res)
- return -EINVAL;
-
if (start >= end)
return -EINVAL;
@@ -356,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
break;
}
- if (p) {
+ if (p && res) {
/* copy data */
*res = (struct resource) {
.start = max(start, p->start),
@@ -474,18 +471,18 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
return ret;
}
-static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
-{
- return 1;
-}
-
/*
* This generic page_is_ram() returns true if specified address is
* registered as System RAM in iomem_resource list.
*/
int __weak page_is_ram(unsigned long pfn)
{
- return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
+ const resource_size_t pfn_res = PFN_PHYS(pfn);
+
+ return find_next_iomem_res(pfn_res,
+ pfn_res + 1,
+ IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+ IORES_DESC_NONE, NULL) == 0;
}
EXPORT_SYMBOL_GPL(page_is_ram);
--
2.35.3
On 01.06.22 18:32, Vaibhav Jain wrote:
> Presently page_is_ram() relies on walk_system_ram_range() that performs a walk
> on kernel iomem resources hierarchy with a dummy callback __is_ram(). Before
> calling find_next_iomem_res(), walk_system_ram_range() does some book-keeping
> which can be avoided for page_is_ram() use-case.
>
> Hence this patch proposes to update page_is_ram() to directly call
> find_next_iomem_res() with minimal book-keeping needed.
I consider the code harder to get compared to just reusing the
more-generic and expressive walk_system_ram_range().
It somehow feels like we're duplicating the code here just to optimize
out a handful of instructions.
If it doesn't make the code easier to read (at least for me), why do we
care?
--
Thanks,
David / dhildenb
Hi David,
Thanks for looking into this patch,
David Hildenbrand <[email protected]> writes:
> On 01.06.22 18:32, Vaibhav Jain wrote:
>> Presently page_is_ram() relies on walk_system_ram_range() that performs a walk
>> on kernel iomem resources hierarchy with a dummy callback __is_ram(). Before
>> calling find_next_iomem_res(), walk_system_ram_range() does some book-keeping
>> which can be avoided for page_is_ram() use-case.
>>
>> Hence this patch proposes to update page_is_ram() to directly call
>> find_next_iomem_res() with minimal book-keeping needed.
>
> I consider the code harder to get compared to just reusing the
> more-generic and expressive walk_system_ram_range()
>
> It somehow feels like we're duplicating the code here just to optimize
> out a handful of instructions.
The only reason for the existence of the dummy callback __is_ram() is for
page_is_ram() to be able to use walk_system_ram_range(). For the
page_is_ram() use-case, what walk_system_ram_range() essentially does is
to iterate over find_next_iomem_res() and call __is_ram(), which is not
really needed by page_is_ram().
The improvement to the gcc (v12.1.1) generated code (x86_64) for
page_is_ram is quite evident.
With the patch:
0x0000000000000920 <+0>: call 0x925 <page_is_ram+5>
0x0000000000000925 <+5>: shl $0xc,%rdi
0x0000000000000929 <+9>: xor %r8d,%r8d
0x000000000000092c <+12>: xor %ecx,%ecx
0x000000000000092e <+14>: mov $0x81000200,%edx
0x0000000000000933 <+19>: lea 0x1(%rdi),%rsi
0x0000000000000937 <+23>: call 0x7e0 <find_next_iomem_res>
0x000000000000093c <+28>: test %eax,%eax
0x000000000000093e <+30>: sete %al
0x0000000000000941 <+33>: movzbl %al,%eax
0x0000000000000944 <+36>: ret
0x0000000000000945 <+37>: int3
Without the patch:
0x0000000000001000 <+0>: call 0x1005 <page_is_ram+5>
0x0000000000001005 <+5>: shl $0xc,%rdi
0x0000000000001009 <+9>: lea 0xfff(%rdi),%rsi
0x0000000000001010 <+16>: cmp %rsi,%rdi
0x0000000000001013 <+19>: jae 0x1064 <page_is_ram+100>
0x0000000000001015 <+21>: sub $0x40,%rsp
0x0000000000001019 <+25>: xor %ecx,%ecx
0x000000000000101b <+27>: mov $0x81000200,%edx
0x0000000000001020 <+32>: mov %rsp,%r8
0x0000000000001023 <+35>: call 0x7e0 <find_next_iomem_res>
0x0000000000001028 <+40>: test %eax,%eax
0x000000000000102a <+42>: jne 0x105a <page_is_ram+90>
0x000000000000102c <+44>: mov (%rsp),%rax
0x0000000000001030 <+48>: mov $0x1,%ecx
0x0000000000001035 <+53>: lea 0xfff(%rax),%rdx
0x000000000000103c <+60>: mov 0x8(%rsp),%rax
0x0000000000001041 <+65>: shr $0xc,%rdx
0x0000000000001045 <+69>: add $0x1,%rax
0x0000000000001049 <+73>: shr $0xc,%rax
0x000000000000104d <+77>: cmp %rax,%rdx
0x0000000000001050 <+80>: jae 0x105a <page_is_ram+90>
0x0000000000001052 <+82>: mov %ecx,%eax
0x0000000000001054 <+84>: add $0x40,%rsp
0x0000000000001058 <+88>: ret
0x0000000000001059 <+89>: int3
0x000000000000105a <+90>: xor %ecx,%ecx
0x000000000000105c <+92>: add $0x40,%rsp
0x0000000000001060 <+96>: mov %ecx,%eax
0x0000000000001062 <+98>: ret
0x0000000000001063 <+99>: int3
0x0000000000001064 <+100>: xor %eax,%eax
0x0000000000001066 <+102>: ret
0x0000000000001067 <+103>: int3
Looking at the disassembly above, gcc has inlined both walk_system_ram_range()
and __is_ram() in page_is_ram(). This ends up in page_is_ram() calling
find_next_iomem_res() directly anyway, with a bunch of book-keeping
afterwards which can be avoided.
>
> If it doesn't make the code easier to read (at least for me), why do we
> care?
IMHO, calling find_next_iomem_res() from page_is_ram() instead of
calling walk_system_ram_range() makes it easy to trace the path of
page_is_ram(). Also, the dummy callback makes the code flow seem strange
initially.
--
Cheers
~ Vaibhav
On 01.06.22 18:32, Vaibhav Jain wrote:
> Presently page_is_ram() relies on walk_system_ram_range() that performs a walk
> on kernel iomem resources hierarchy with a dummy callback __is_ram(). Before
> calling find_next_iomem_res(), walk_system_ram_range() does some book-keeping
> which can be avoided for page_is_ram() use-case.
>
> Hence this patch proposes to update page_is_ram() to directly call
> find_next_iomem_res() with minimal book-keeping needed.
>
> To avoid allocating a 'struct resource' the patch also updates
> find_next_iomem_res() to not return -EINVAL in case 'res == NULL'. Instead
> out 'struct resource *res' is only populated when its not NULL.
>
> Signed-off-by: Vaibhav Jain <[email protected]>
> ---
> kernel/resource.c | 19 ++++++++-----------
> 1 file changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 34eaee179689..ecf6b9a50adc 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -311,7 +311,7 @@ EXPORT_SYMBOL(release_resource);
> *
> * If a resource is found, returns 0 and @*res is overwritten with the part
> * of the resource that's within [@start..@end]; if none is found, returns
> - * -ENODEV. Returns -EINVAL for invalid parameters.
> + * -ENODEV.
> *
There is still another -EINVAL in that function ...
> * @start: start address of the resource searched for
> * @end: end address of same resource
> @@ -328,9 +328,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
> {
> struct resource *p;
>
> - if (!res)
> - return -EINVAL;
> -
> if (start >= end)
> return -EINVAL;
As all callers guarantee that, we might just remove it.
>
> @@ -356,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
> break;
> }
>
> - if (p) {
> + if (p && res) {
> /* copy data */
> *res = (struct resource) {
> .start = max(start, p->start),
> @@ -474,18 +471,18 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
> return ret;
> }
>
> -static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
> -{
> - return 1;
> -}
> -
> /*
> * This generic page_is_ram() returns true if specified address is
> * registered as System RAM in iomem_resource list.
> */
> int __weak page_is_ram(unsigned long pfn)
> {
> - return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
> + const resource_size_t pfn_res = PFN_PHYS(pfn);
> +
> + return find_next_iomem_res(pfn_res,
> + pfn_res + 1,
> + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> + IORES_DESC_NONE, NULL) == 0;
> }
> EXPORT_SYMBOL_GPL(page_is_ram);
>
What about
a) A cleanup patch upfront that removes both -EINVAL cases from
find_next_iomem_res() followed by
b) The actual change to page_is_ram()
?
--
Thanks,
David / dhildenb
[sorry for the late reply]
>
> The improvement to the gcc (v12.1.1) generated code (x86_64) for
> page_is_ram is quite evident.
>
> With the patch:
> 0x0000000000000920 <+0>: call 0x925 <page_is_ram+5>
> 0x0000000000000925 <+5>: shl $0xc,%rdi
> 0x0000000000000929 <+9>: xor %r8d,%r8d
> 0x000000000000092c <+12>: xor %ecx,%ecx
> 0x000000000000092e <+14>: mov $0x81000200,%edx
> 0x0000000000000933 <+19>: lea 0x1(%rdi),%rsi
> 0x0000000000000937 <+23>: call 0x7e0 <find_next_iomem_res>
> 0x000000000000093c <+28>: test %eax,%eax
> 0x000000000000093e <+30>: sete %al
> 0x0000000000000941 <+33>: movzbl %al,%eax
> 0x0000000000000944 <+36>: ret
> 0x0000000000000945 <+37>: int3
>
> Without the patch:
> 0x0000000000001000 <+0>: call 0x1005 <page_is_ram+5>
> 0x0000000000001005 <+5>: shl $0xc,%rdi
> 0x0000000000001009 <+9>: lea 0xfff(%rdi),%rsi
> 0x0000000000001010 <+16>: cmp %rsi,%rdi
> 0x0000000000001013 <+19>: jae 0x1064 <page_is_ram+100>
> 0x0000000000001015 <+21>: sub $0x40,%rsp
> 0x0000000000001019 <+25>: xor %ecx,%ecx
> 0x000000000000101b <+27>: mov $0x81000200,%edx
> 0x0000000000001020 <+32>: mov %rsp,%r8
> 0x0000000000001023 <+35>: call 0x7e0 <find_next_iomem_res>
> 0x0000000000001028 <+40>: test %eax,%eax
> 0x000000000000102a <+42>: jne 0x105a <page_is_ram+90>
> 0x000000000000102c <+44>: mov (%rsp),%rax
> 0x0000000000001030 <+48>: mov $0x1,%ecx
> 0x0000000000001035 <+53>: lea 0xfff(%rax),%rdx
> 0x000000000000103c <+60>: mov 0x8(%rsp),%rax
> 0x0000000000001041 <+65>: shr $0xc,%rdx
> 0x0000000000001045 <+69>: add $0x1,%rax
> 0x0000000000001049 <+73>: shr $0xc,%rax
> 0x000000000000104d <+77>: cmp %rax,%rdx
> 0x0000000000001050 <+80>: jae 0x105a <page_is_ram+90>
> 0x0000000000001052 <+82>: mov %ecx,%eax
> 0x0000000000001054 <+84>: add $0x40,%rsp
> 0x0000000000001058 <+88>: ret
> 0x0000000000001059 <+89>: int3
> 0x000000000000105a <+90>: xor %ecx,%ecx
> 0x000000000000105c <+92>: add $0x40,%rsp
> 0x0000000000001060 <+96>: mov %ecx,%eax
> 0x0000000000001062 <+98>: ret
> 0x0000000000001063 <+99>: int3
> 0x0000000000001064 <+100>: xor %eax,%eax
> 0x0000000000001066 <+102>: ret
> 0x0000000000001067 <+103>: int3
>
> Looking at the disassembly above, gcc has inlined both walk_system_ram_range()
> and __is_ram() in page_is_ram(). This ends up in page_is_ram() calling
> find_next_iomem_res() directly anyways with bunch of book-keeping
> afterwards which can be avoided.
We usually don't care about such micro-optimizations unless you can
showcase actual performance numbers. Otherwise we'd have constant,
unnecessary code-churn all over the place.
Most probably, all that list walking dominates the runtime either way.
Feel free to prove me wrong ;)
> >>
>> If it doesn't make the code easier to read (at least for me), why do we
>> care?
> IMHO, calling find_next_iomem_res() from page_is_ram() instead of
> calling walk_system_ram_range() makes it easy to trace the path of
> page_is_ram(). Also the dummy callback makes the code flow seems strange
> initially.
>
I'm not convinced, but I don't care enough to object. I'll add more
review feedback to the patch.
--
Thanks,
David / dhildenb
David Hildenbrand wrote:
> On 01.06.22 18:32, Vaibhav Jain wrote:
> > Presently page_is_ram() relies on walk_system_ram_range() that performs a walk
> > on kernel iomem resources hierarchy with a dummy callback __is_ram(). Before
> > calling find_next_iomem_res(), walk_system_ram_range() does some book-keeping
> > which can be avoided for page_is_ram() use-case.
> >
> > Hence this patch proposes to update page_is_ram() to directly call
> > find_next_iomem_res() with minimal book-keeping needed.
> >
> > To avoid allocating a 'struct resource' the patch also updates
> > find_next_iomem_res() to not return -EINVAL in case 'res == NULL'. Instead
> > out 'struct resource *res' is only populated when its not NULL.
> >
> > Signed-off-by: Vaibhav Jain <[email protected]>
> > ---
> > kernel/resource.c | 19 ++++++++-----------
> > 1 file changed, 8 insertions(+), 11 deletions(-)
> >
> > diff --git a/kernel/resource.c b/kernel/resource.c
> > index 34eaee179689..ecf6b9a50adc 100644
> > --- a/kernel/resource.c
> > +++ b/kernel/resource.c
> > @@ -311,7 +311,7 @@ EXPORT_SYMBOL(release_resource);
> > *
> > * If a resource is found, returns 0 and @*res is overwritten with the part
> > * of the resource that's within [@start..@end]; if none is found, returns
> > - * -ENODEV. Returns -EINVAL for invalid parameters.
> > + * -ENODEV.
> > *
>
> There is still another -EINVAL in that function ...
>
> > * @start: start address of the resource searched for
> > * @end: end address of same resource
> > @@ -328,9 +328,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
> > {
> > struct resource *p;
> >
> > - if (!res)
> > - return -EINVAL;
> > -
> > if (start >= end)
> > return -EINVAL;
>
> As all callers guarantee that, we might just remove it.
>
> >
> > @@ -356,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
> > break;
> > }
> >
> > - if (p) {
> > + if (p && res) {
> > /* copy data */
> > *res = (struct resource) {
> > .start = max(start, p->start),
> > @@ -474,18 +471,18 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
> > return ret;
> > }
> >
> > -static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
> > -{
> > - return 1;
> > -}
> > -
> > /*
> > * This generic page_is_ram() returns true if specified address is
> > * registered as System RAM in iomem_resource list.
> > */
> > int __weak page_is_ram(unsigned long pfn)
> > {
> > - return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
> > + const resource_size_t pfn_res = PFN_PHYS(pfn);
> > +
> > + return find_next_iomem_res(pfn_res,
> > + pfn_res + 1,
> > + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > + IORES_DESC_NONE, NULL) == 0;
I tend to agree with David that this change makes page_is_ram()
harder to read. I think the problem is that the "next" nature of
find_next_iomem_res() is meant to be handled by the caller. So it really
should be called find_iomem_res().
> > }
> > EXPORT_SYMBOL_GPL(page_is_ram);
> >
>
> What about
>
> a) A cleanup patch upfront that removes both -EINVAL cases from
> find_next_iomem_res() followed by
...a patch to rename find_next_iomem_res()
>
> b) The actual change to page_is_ram()
>
> ?