This patch improves the performance of event alternative lookup by
replacing the previous linear search with a more efficient binary
search. This change reduces the time complexity for the search process
from O(n) to O(log(n)). A pre-sorted table of event values and their
corresponding indices has been introduced to expedite the search
process.
Signed-off-by: Kuan-Wei Chiu <[email protected]>
---
arch/powerpc/perf/power6-pmu.c | 43 ++++++++++++++++++++--------------
1 file changed, 26 insertions(+), 17 deletions(-)
diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
index 5729b6e059de..b6030ea130eb 100644
--- a/arch/powerpc/perf/power6-pmu.c
+++ b/arch/powerpc/perf/power6-pmu.c
@@ -335,25 +335,34 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
{ 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
};
-/*
- * This could be made more efficient with a binary search on
- * a presorted list, if necessary
- */
static int find_alternatives_list(u64 event)
{
- int i, j;
- unsigned int alt;
-
- for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
- if (event < event_alternatives[i][0])
- return -1;
- for (j = 0; j < MAX_ALT; ++j) {
- alt = event_alternatives[i][j];
- if (!alt || event < alt)
- break;
- if (event == alt)
- return i;
- }
+ const unsigned int presort_event_table[] = {
+ 0x0130e8, 0x080080, 0x080088, 0x10000a, 0x10000b, 0x10000d, 0x10000e,
+ 0x100010, 0x10001a, 0x100026, 0x100054, 0x100056, 0x1000f0, 0x1000f8,
+ 0x1000fc, 0x200008, 0x20000e, 0x200010, 0x200012, 0x200054, 0x2000f0,
+ 0x2000f2, 0x2000f4, 0x2000f5, 0x2000f6, 0x2000f8, 0x2000fc, 0x2000fe,
+ 0x2d0030, 0x30000a, 0x30000c, 0x300010, 0x300012, 0x30001a, 0x300056,
+ 0x3000f0, 0x3000f2, 0x3000f6, 0x3000f8, 0x3000fc, 0x3000fe, 0x400006,
+ 0x400007, 0x40000a, 0x40000e, 0x400010, 0x400018, 0x400056, 0x4000f0,
+ 0x4000f8, 0x600005};
+ const unsigned int event_index_table[] = {
+ 0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 14,
+ 7, 15, 2, 9, 16, 3, 4, 0, 17, 10, 18, 19, 20, 1, 17, 15, 19,
+ 18, 2, 16, 21, 8, 0, 22, 13, 14, 11, 21, 5, 20, 22, 1, 6, 3};
+ int lo = 0;
+ int hi = ARRAY_SIZE(presort_event_table) - 1;
+
+ while (lo <= hi) {
+ int mid = lo + (hi - lo) / 2;
+ unsigned int alt = presort_event_table[mid];
+
+ if (alt < event)
+ lo = mid + 1;
+ else if (alt > event)
+ hi = mid - 1;
+ else
+ return event_index_table[mid];
}
return -1;
}
--
2.25.1
Kuan-Wei Chiu <[email protected]> writes:
> This patch improves the performance of event alternative lookup by
> replacing the previous linear search with a more efficient binary
> search. This change reduces the time complexity for the search process
> from O(n) to O(log(n)). A pre-sorted table of event values and their
> corresponding indices has been introduced to expedite the search
> process.
Thanks for the patch.
How did you test this? I assume you don't have a Power6 machine lying
around? :)
cheers
> diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
> index 5729b6e059de..b6030ea130eb 100644
> --- a/arch/powerpc/perf/power6-pmu.c
> +++ b/arch/powerpc/perf/power6-pmu.c
> @@ -335,25 +335,34 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
> { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
> };
>
> -/*
> - * This could be made more efficient with a binary search on
> - * a presorted list, if necessary
> - */
> static int find_alternatives_list(u64 event)
> {
> - int i, j;
> - unsigned int alt;
> -
> - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
> - if (event < event_alternatives[i][0])
> - return -1;
> - for (j = 0; j < MAX_ALT; ++j) {
> - alt = event_alternatives[i][j];
> - if (!alt || event < alt)
> - break;
> - if (event == alt)
> - return i;
> - }
> + const unsigned int presort_event_table[] = {
> + 0x0130e8, 0x080080, 0x080088, 0x10000a, 0x10000b, 0x10000d, 0x10000e,
> + 0x100010, 0x10001a, 0x100026, 0x100054, 0x100056, 0x1000f0, 0x1000f8,
> + 0x1000fc, 0x200008, 0x20000e, 0x200010, 0x200012, 0x200054, 0x2000f0,
> + 0x2000f2, 0x2000f4, 0x2000f5, 0x2000f6, 0x2000f8, 0x2000fc, 0x2000fe,
> + 0x2d0030, 0x30000a, 0x30000c, 0x300010, 0x300012, 0x30001a, 0x300056,
> + 0x3000f0, 0x3000f2, 0x3000f6, 0x3000f8, 0x3000fc, 0x3000fe, 0x400006,
> + 0x400007, 0x40000a, 0x40000e, 0x400010, 0x400018, 0x400056, 0x4000f0,
> + 0x4000f8, 0x600005};
> + const unsigned int event_index_table[] = {
> + 0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 14,
> + 7, 15, 2, 9, 16, 3, 4, 0, 17, 10, 18, 19, 20, 1, 17, 15, 19,
> + 18, 2, 16, 21, 8, 0, 22, 13, 14, 11, 21, 5, 20, 22, 1, 6, 3};
> + int lo = 0;
> + int hi = ARRAY_SIZE(presort_event_table) - 1;
> +
> + while (lo <= hi) {
> + int mid = lo + (hi - lo) / 2;
> + unsigned int alt = presort_event_table[mid];
> +
> + if (alt < event)
> + lo = mid + 1;
> + else if (alt > event)
> + hi = mid - 1;
> + else
> + return event_index_table[mid];
> }
> return -1;
> }
> --
> 2.25.1
On Thu, Oct 19, 2023 at 12:41:45PM +1100, Michael Ellerman wrote:
> Kuan-Wei Chiu <[email protected]> writes:
> > This patch improves the performance of event alternative lookup by
> > replacing the previous linear search with a more efficient binary
> > search. This change reduces the time complexity for the search process
> > from O(n) to O(log(n)). A pre-sorted table of event values and their
> > corresponding indices has been introduced to expedite the search
> > process.
>
> Thanks for the patch.
>
> How did you test this? I assume you don't have a Power6 machine lying
> around? :)
>
> cheers
>
I indeed do not have a Power6 machine for testing. Therefore, I designed
a simple unit test [1] to verify the functionality of the patch. In this
test, I ran a loop from 0 to UINT_MAX, using these values as inputs to
compare the return values of the original function with the new function
I implemented, which utilizes binary search. If you have any suggestions
for a more suitable testing method, please let me know. I would greatly
appreciate your feedback.
Thanks,
Kuan-Wei Chiu
[1]:
/* return 0 on success and return non-zero on failure */
int test()
{
u64 event = 0;
for (u64 event = 0; event <= UINT_MAX; event++) {
/* result of the current function in the linux kernel */
int result_old = find_alternatives_list(event);
/* result of the new function using binary search */
int result_new = find_alternatives_list_new(event);
if (result_old != result_new)
return 1;
}
return 0;
}
> > diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
> > index 5729b6e059de..b6030ea130eb 100644
> > --- a/arch/powerpc/perf/power6-pmu.c
> > +++ b/arch/powerpc/perf/power6-pmu.c
> > @@ -335,25 +335,34 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
> > { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
> > };
> >
> > -/*
> > - * This could be made more efficient with a binary search on
> > - * a presorted list, if necessary
> > - */
> > static int find_alternatives_list(u64 event)
> > {
> > - int i, j;
> > - unsigned int alt;
> > -
> > - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
> > - if (event < event_alternatives[i][0])
> > - return -1;
> > - for (j = 0; j < MAX_ALT; ++j) {
> > - alt = event_alternatives[i][j];
> > - if (!alt || event < alt)
> > - break;
> > - if (event == alt)
> > - return i;
> > - }
> > + const unsigned int presort_event_table[] = {
> > + 0x0130e8, 0x080080, 0x080088, 0x10000a, 0x10000b, 0x10000d, 0x10000e,
> > + 0x100010, 0x10001a, 0x100026, 0x100054, 0x100056, 0x1000f0, 0x1000f8,
> > + 0x1000fc, 0x200008, 0x20000e, 0x200010, 0x200012, 0x200054, 0x2000f0,
> > + 0x2000f2, 0x2000f4, 0x2000f5, 0x2000f6, 0x2000f8, 0x2000fc, 0x2000fe,
> > + 0x2d0030, 0x30000a, 0x30000c, 0x300010, 0x300012, 0x30001a, 0x300056,
> > + 0x3000f0, 0x3000f2, 0x3000f6, 0x3000f8, 0x3000fc, 0x3000fe, 0x400006,
> > + 0x400007, 0x40000a, 0x40000e, 0x400010, 0x400018, 0x400056, 0x4000f0,
> > + 0x4000f8, 0x600005};
> > + const unsigned int event_index_table[] = {
> > + 0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 14,
> > + 7, 15, 2, 9, 16, 3, 4, 0, 17, 10, 18, 19, 20, 1, 17, 15, 19,
> > + 18, 2, 16, 21, 8, 0, 22, 13, 14, 11, 21, 5, 20, 22, 1, 6, 3};
> > + int lo = 0;
> > + int hi = ARRAY_SIZE(presort_event_table) - 1;
> > +
> > + while (lo <= hi) {
> > + int mid = lo + (hi - lo) / 2;
> > + unsigned int alt = presort_event_table[mid];
> > +
> > + if (alt < event)
> > + lo = mid + 1;
> > + else if (alt > event)
> > + hi = mid - 1;
> > + else
> > + return event_index_table[mid];
> > }
> > return -1;
> > }
> > --
> > 2.25.1
Kuan-Wei Chiu <[email protected]> writes:
> On Thu, Oct 19, 2023 at 12:41:45PM +1100, Michael Ellerman wrote:
>> Kuan-Wei Chiu <[email protected]> writes:
>> > This patch improves the performance of event alternative lookup by
>> > replacing the previous linear search with a more efficient binary
>> > search. This change reduces the time complexity for the search process
>> > from O(n) to O(log(n)). A pre-sorted table of event values and their
>> > corresponding indices has been introduced to expedite the search
>> > process.
>>
>> Thanks for the patch.
>>
>> How did you test this? I assume you don't have a Power6 machine lying
>> around? :)
>
> I indeed do not have a Power6 machine for testing. Therefore, I designed
> a simple unit test [1] to verify the functionality of the patch. In this
> test, I ran a loop from 0 to UINT_MAX, using these values as inputs to
> compare the return values of the original function with the new function
> I implemented, which utilizes binary search. If you have any suggestions
> for a more suitable testing method, please let me know. I would greatly
> appreciate your feedback.
That's an excellent technique :)
The other option would be to test it using qemu.
You can't actually emulate a Power6 with qemu, but you can emulate a
Power7 which is similar. The code in power7-pmu.c is similar enough that
you could copy this code into there and test it that way.
But I don't expect you to do all that. I have an actual Power6 I can
give it a quick test on, and your unit test should have found any bugs
anyway.
For future reference you can add extra detail about testing and so on
below the "---" line in your patch, ie. below the diffstat but above the
diff. Content in there will not appear in the final commit, so it's good
for information you want to tell the maintainer, but is a bit verbose to
be in the permanant change log - like for example how you tested
something.
My only other comment would be to change the name of
"presort_event_table" to "presorted_event_table" - but I can do that
when applying.
I'll pick this up for v6.7.
cheers
> [1]:
> /* return 0 on success and return non-zero on failure */
> int test()
> {
> u64 event = 0;
> for (u64 event = 0; event <= UINT_MAX; event++) {
> /* result of the current function in the linux kernel */
> int result_old = find_alternatives_list(event);
> /* result of the new function using binary search */
> int result_new = find_alternatives_list_new(event);
>
> if (result_old != result_new)
> return 1;
> }
> return 0;
> }
>
>
>> > diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
>> > index 5729b6e059de..b6030ea130eb 100644
>> > --- a/arch/powerpc/perf/power6-pmu.c
>> > +++ b/arch/powerpc/perf/power6-pmu.c
>> > @@ -335,25 +335,34 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
>> > { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
>> > };
>> >
>> > -/*
>> > - * This could be made more efficient with a binary search on
>> > - * a presorted list, if necessary
>> > - */
>> > static int find_alternatives_list(u64 event)
>> > {
>> > - int i, j;
>> > - unsigned int alt;
>> > -
>> > - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
>> > - if (event < event_alternatives[i][0])
>> > - return -1;
>> > - for (j = 0; j < MAX_ALT; ++j) {
>> > - alt = event_alternatives[i][j];
>> > - if (!alt || event < alt)
>> > - break;
>> > - if (event == alt)
>> > - return i;
>> > - }
>> > + const unsigned int presort_event_table[] = {
>> > + 0x0130e8, 0x080080, 0x080088, 0x10000a, 0x10000b, 0x10000d, 0x10000e,
>> > + 0x100010, 0x10001a, 0x100026, 0x100054, 0x100056, 0x1000f0, 0x1000f8,
>> > + 0x1000fc, 0x200008, 0x20000e, 0x200010, 0x200012, 0x200054, 0x2000f0,
>> > + 0x2000f2, 0x2000f4, 0x2000f5, 0x2000f6, 0x2000f8, 0x2000fc, 0x2000fe,
>> > + 0x2d0030, 0x30000a, 0x30000c, 0x300010, 0x300012, 0x30001a, 0x300056,
>> > + 0x3000f0, 0x3000f2, 0x3000f6, 0x3000f8, 0x3000fc, 0x3000fe, 0x400006,
>> > + 0x400007, 0x40000a, 0x40000e, 0x400010, 0x400018, 0x400056, 0x4000f0,
>> > + 0x4000f8, 0x600005};
>> > + const unsigned int event_index_table[] = {
>> > + 0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 14,
>> > + 7, 15, 2, 9, 16, 3, 4, 0, 17, 10, 18, 19, 20, 1, 17, 15, 19,
>> > + 18, 2, 16, 21, 8, 0, 22, 13, 14, 11, 21, 5, 20, 22, 1, 6, 3};
>> > + int lo = 0;
>> > + int hi = ARRAY_SIZE(presort_event_table) - 1;
>> > +
>> > + while (lo <= hi) {
>> > + int mid = lo + (hi - lo) / 2;
>> > + unsigned int alt = presort_event_table[mid];
>> > +
>> > + if (alt < event)
>> > + lo = mid + 1;
>> > + else if (alt > event)
>> > + hi = mid - 1;
>> > + else
>> > + return event_index_table[mid];
>> > }
>> > return -1;
>> > }
>> > --
>> > 2.25.1
On Sat, 14 Oct 2023 01:57:14 +0800, Kuan-Wei Chiu wrote:
> This patch improves the performance of event alternative lookup by
> replacing the previous linear search with a more efficient binary
> search. This change reduces the time complexity for the search process
> from O(n) to O(log(n)). A pre-sorted table of event values and their
> corresponding indices has been introduced to expedite the search
> process.
>
> [...]
Applied to powerpc/next.
[1/1] powerpc/perf: Optimize find_alternatives_list() using binary search
https://git.kernel.org/powerpc/c/e08c43e6c3eb5d805b61d981f1e8286ee0dc6d1a
cheers