In case the call side is not providing a swap function, we either use
a 32 bit or a generic swap function. When swapping around pointers on
64 bit architectures falling back to use the generic swap function
seems like an unnecessary waste.
There at least 9 users ('sort' is of difficult to grep for) of sort()
and all of them use the sort function without a customized swap
function. Furthermore, they are all using pointers to swap around:
arch/x86/kernel/e820.c:sanitize_e820_map()
arch/x86/mm/extable.c:sort_extable()
drivers/acpi/fan.c:acpi_fan_get_fps()
fs/btrfs/super.c:btrfs_descending_sort_devices()
fs/xfs/libxfs/xfs_dir2_block.c:xfs_dir2_sf_to_block()
kernel/range.c:clean_sort_range()
mm/memcontrol.c:__mem_cgroup_usage_register_event()
sound/pci/hda/hda_auto_parser.c:snd_hda_parse_pin_defcfg()
sound/pci/hda/hda_auto_parser.c:sort_pins_by_sequence()
Obviously, we could improve the swap for other sizes as well
but this is overkill at this point.
A simple test shows sorting a 400 element array (try to stay in one
page) with either with u32_swap() or u64_swap() show that the theory
actually works. This test was done on a x86_64 (Intel Xeon E5-4610)
machine.
- swap_32:
NumSamples = 100; Min = 48.00; Max = 49.00
Mean = 48.320000; Variance = 0.217600; SD = 0.466476; Median 48.000000
each * represents a count of 1
48.0000 - 48.1000 [ 68]: ********************************************************************
48.1000 - 48.2000 [ 0]:
48.2000 - 48.3000 [ 0]:
48.3000 - 48.4000 [ 0]:
48.4000 - 48.5000 [ 0]:
48.5000 - 48.6000 [ 0]:
48.6000 - 48.7000 [ 0]:
48.7000 - 48.8000 [ 0]:
48.8000 - 48.9000 [ 0]:
48.9000 - 49.0000 [ 32]: ********************************
- swap_64:
NumSamples = 100; Min = 44.00; Max = 63.00
Mean = 48.250000; Variance = 18.687500; SD = 4.322904; Median 47.000000
each * represents a count of 1
44.0000 - 45.9000 [ 15]: ***************
45.9000 - 47.8000 [ 37]: *************************************
47.8000 - 49.7000 [ 39]: ***************************************
49.7000 - 51.6000 [ 0]:
51.6000 - 53.5000 [ 0]:
53.5000 - 55.4000 [ 0]:
55.4000 - 57.3000 [ 0]:
57.3000 - 59.2000 [ 1]: *
59.2000 - 61.1000 [ 3]: ***
61.1000 - 63.0000 [ 5]: *****
- swap_72:
NumSamples = 100; Min = 53.00; Max = 71.00
Mean = 55.070000; Variance = 21.565100; SD = 4.643824; Median 53.000000
each * represents a count of 1
53.0000 - 54.8000 [ 73]: *************************************************************************
54.8000 - 56.6000 [ 9]: *********
56.6000 - 58.4000 [ 9]: *********
58.4000 - 60.2000 [ 0]:
60.2000 - 62.0000 [ 0]:
62.0000 - 63.8000 [ 0]:
63.8000 - 65.6000 [ 0]:
65.6000 - 67.4000 [ 1]: *
67.4000 - 69.2000 [ 4]: ****
69.2000 - 71.0000 [ 4]: ****
- test program:
static int cmp_32(const void *a, const void *b)
{
u32 l = *(u32 *)a;
u32 r = *(u32 *)b;
if (l < r)
return -1;
if (l > r)
return 1;
return 0;
}
static int cmp_64(const void *a, const void *b)
{
u64 l = *(u64 *)a;
u64 r = *(u64 *)b;
if (l < r)
return -1;
if (l > r)
return 1;
return 0;
}
static int cmp_72(const void *a, const void *b)
{
u32 l = get_unaligned((u32 *) a);
u32 r = get_unaligned((u32 *) b);
if (l < r)
return -1;
if (l > r)
return -1;
return 0;
}
static void init_array32(void *array)
{
u32 *a = array;
int i;
a[0] = 3821;
for (i = 1; i < ARRAY_ELEMENTS; i++)
a[i] = next_pseudo_random32(a[i-1]);
}
static void init_array64(void *array)
{
u64 *a = array;
int i;
a[0] = 3821;
for (i = 1; i < ARRAY_ELEMENTS; i++)
a[i] = next_pseudo_random32(a[i-1]);
}
static void init_array72(void *array)
{
char *p;
u32 v;
int i;
v = 3821;
for (i = 0; i < ARRAY_ELEMENTS; i++) {
p = (char *)array + (i * 9);
put_unaligned(v, (u32*) p);
v = next_pseudo_random32(v);
}
}
static void sort_test(void (*init)(void *array),
int (*cmp) (const void *, const void *),
void *array, size_t size)
{
ktime_t start, stop;
int i;
for (i = 0; i < 10000; i++) {
init(array);
local_irq_disable();
start = ktime_get();
sort(array, ARRAY_ELEMENTS, size, cmp, NULL);
stop = ktime_get();
local_irq_enable();
if (i > 10000 - 101)
pr_info("%lld\n", ktime_to_us(ktime_sub(stop, start)));
}
}
static void *create_array(size_t size)
{
void *array;
array = kmalloc(ARRAY_ELEMENTS * size, GFP_KERNEL);
if (!array)
return NULL;
return array;
}
static int perform_test(size_t size)
{
void *array;
array = create_array(size);
if (!array)
return -ENOMEM;
pr_info("test element size %d bytes\n", (int)size);
switch (size) {
case 4:
sort_test(init_array32, cmp_32, array, size);
break;
case 8:
sort_test(init_array64, cmp_64, array, size);
break;
case 9:
sort_test(init_array72, cmp_72, array, size);
break;
}
kfree(array);
return 0;
}
static int __init sort_tests_init(void)
{
int err;
err = perform_test(sizeof(u32));
if (err)
return err;
err = perform_test(sizeof(u64));
if (err)
return err;
err = perform_test(sizeof(u64)+1);
if (err)
return err;
return 0;
}
static void __exit sort_tests_exit(void)
{
}
module_init(sort_tests_init);
module_exit(sort_tests_exit);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Daniel Wagner");
MODULE_DESCRIPTION("sort perfomance tests");
Signed-off-by: Daniel Wagner <[email protected]>
Cc: Rasmus Villemoes <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
---
Changes since v0:
- Test for alignment of the array.
- Fixed compare function in test program according
Rasmus' feedback
- Added an unaligned test cases (swap_72)
lib/sort.c | 35 +++++++++++++++++++++++++++++++++--
1 file changed, 33 insertions(+), 2 deletions(-)
diff --git a/lib/sort.c b/lib/sort.c
index 43c9fe7..9c6b229 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -15,6 +15,13 @@ static void u32_swap(void *a, void *b, int size)
*(u32 *)b = t;
}
+static void u64_swap(void *a, void *b, int size)
+{
+ u64 t = *(u64 *)a;
+ *(u64 *)a = *(u64 *)b;
+ *(u64 *)b = t;
+}
+
static void generic_swap(void *a, void *b, int size)
{
char t;
@@ -50,8 +57,32 @@ void sort(void *base, size_t num, size_t size,
/* pre-scale counters for performance */
int i = (num/2 - 1) * size, n = num * size, c, r;
- if (!swap_func)
- swap_func = (size == 4 ? u32_swap : generic_swap);
+ if (!swap_func) {
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+ switch (size) {
+ case 4:
+ swap_func = u32_swap;
+ break;
+ case 8:
+ swap_func = u64_swap;
+ break;
+ }
+#else
+ switch (size) {
+ case 4:
+ if (((unsigned long)base & 3) == 0)
+ swap_func = u32_swap;
+ break;
+ case 8:
+ if (((unsigned long)base & 7) == 0)
+ swap_func = u64_swap;
+ break;
+ }
+#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
+
+ if (!swap_func)
+ swap_func = generic_swap;
+ }
/* heapify */
for ( ; i >= 0; i -= size) {
--
2.1.0
On 05/13/2015 01:00 PM, Daniel Wagner wrote:
> static int cmp_72(const void *a, const void *b)
> {
> u32 l = get_unaligned((u32 *) a);
> u32 r = get_unaligned((u32 *) b);
>
> if (l < r)
> return -1;
> if (l > r)
> return -1;
> return 0;
> }
Argh, obviously wrong. Should I send a fix? The numbers do not change.
On Wed, 13 May 2015 13:28:33 +0200 Daniel Wagner <[email protected]> wrote:
> On 05/13/2015 01:00 PM, Daniel Wagner wrote:
> > static int cmp_72(const void *a, const void *b)
> > {
> > u32 l = get_unaligned((u32 *) a);
> > u32 r = get_unaligned((u32 *) b);
> >
> > if (l < r)
> > return -1;
> > if (l > r)
> > return -1;
> > return 0;
> > }
>
> Argh, obviously wrong. Should I send a fix? The numbers do not change.
I fixed it up.
On Wed, May 13 2015, Daniel Wagner <[email protected]> wrote:
>
> - if (!swap_func)
> - swap_func = (size == 4 ? u32_swap : generic_swap);
> + if (!swap_func) {
> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> + switch (size) {
> + case 4:
> + swap_func = u32_swap;
> + break;
> + case 8:
> + swap_func = u64_swap;
> + break;
> + }
> +#else
> + switch (size) {
> + case 4:
> + if (((unsigned long)base & 3) == 0)
> + swap_func = u32_swap;
> + break;
> + case 8:
> + if (((unsigned long)base & 7) == 0)
> + swap_func = u64_swap;
> + break;
> + }
> +#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
> +
> + if (!swap_func)
> + swap_func = generic_swap;
> + }
I was more thinking of something like
static int alignment_ok(const void *base, int align)
{
return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
((unsigned long)base & (align - 1)) == 0;
}
...
if (!swap_func) {
if (size == 4 && alignment_ok(base, 4))
swap_func = u32_swap;
else if (size == 8 && alignment_ok(base, 8))
swap_func = u64_swap;
else
swap_func = generic_swap;
}
It seems to generate the same code (I usually worry about how gcc messes
up switches), so this is just a readability thing.
Rasmus