Currently we only record cached_hole_size, which is used when both
criteria are met: 'free_vmap_cache == NULL' and 'size <
cached_hole_size'. However, under the above scenario, the search will
start from the rb_root and then find the node which is just in front
of the cached hole.
free_vmap_cache miss:
vmap_area_root
/ \
_next U
/ (T1)
cached_hole_node
/
... (T2)
/
first
vmap_area_list->first->......->cached_hole_node->cached_hole_node.list.next
|-------(T3)-------| | <<< cached_hole_size >>> |
vmap_area_list->......->cached_hole_node->cached_hole_node.list.next
| <<< cached_hole_size >>> |
The time cost to search the node now is T = T1 + T2 + T3.
This commit adds a cached_hole_node to record the node just in front of
the cached hole, which helps to avoid walking the rb tree and
the list, making T = 0.
Signed-off-by: Zhaoyang Huang <[email protected]>
---
mm/vmalloc.c | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8698c1c..4e76e7f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -336,6 +336,7 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
/* The vmap cache globals are protected by vmap_area_lock */
static struct rb_node *free_vmap_cache;
+static struct vmap_area *cached_hole_node;
static unsigned long cached_hole_size;
static unsigned long cached_vstart;
static unsigned long cached_align;
@@ -444,6 +445,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
size < cached_hole_size ||
vstart < cached_vstart ||
align < cached_align) {
+ /*if we have a cached node, just use it*/
+ if ((size < cached_hole_size) && cached_hole_node != NULL) {
+ addr = ALIGN(cached_hole_node->va_end, align);
+ cached_hole_node = NULL;
+ goto found;
+ }
nocache:
cached_hole_size = 0;
free_vmap_cache = NULL;
@@ -487,8 +494,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
/* from the starting point, walk areas until a suitable hole is found */
while (addr + size > first->va_start && addr + size <= vend) {
- if (addr + cached_hole_size < first->va_start)
+ if (addr + cached_hole_size < first->va_start) {
cached_hole_size = first->va_start - addr;
+ /*record the node corresponding to the hole*/
+ cached_hole_node = (first->list.prev ==
+ &vmap_area_list) ?
+ NULL : list_prev_entry(first, list);
+ }
addr = ALIGN(first->va_end, align);
if (addr + size < addr)
goto overflow;
@@ -571,10 +583,17 @@ static void __free_vmap_area(struct vmap_area *va)
}
}
}
+ if (va == cached_hole_node) {
+ /*cached node is freed, the hole get bigger*/
+ if (cached_hole_node->list.prev != &vmap_area_list)
+ cached_hole_node = list_prev_entry(cached_hole_node,
+ list);
+ else
+ cached_hole_node = NULL;
+ }
rb_erase(&va->rb_node, &vmap_area_root);
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
-
/*
* Track the highest possible candidate for pcpu area
* allocation. Areas outside of vmalloc area can be returned
--
1.9.1
On Fri, Jul 21, 2017 at 06:01:41PM +0800, Zhaoyang Huang wrote:
> we just record the cached_hole_size now, which will be used when
> the criteria meet both of 'free_vmap_cache == NULL' and 'size <
> cached_hole_size'. However, under above scenario, the search will
> start from the rb_root and then find the node which just in front
> of the cached hole.
>
> free_vmap_cache miss:
> vmap_area_root
> / \
> _next U
> / (T1)
> cached_hole_node
> /
> ... (T2)
> /
> first
>
> vmap_area_list->first->......->cached_hole_node->cached_hole_node.list.next
> |-------(T3)-------| | <<< cached_hole_size >>> |
>
> vmap_area_list->......->cached_hole_node->cached_hole_node.list.next
> | <<< cached_hole_size >>> |
>
> The time cost to search the node now is T = T1 + T2 + T3.
> The commit add a cached_hole_node here to record the one just in front of
> the cached_hole_size, which can help to avoid walking the rb tree and
> the list and make the T = 0;
Yes, but does this matter in practice? Are there any workloads where
this makes a difference? If so, how much?
On Fri 21-07-17 04:39:48, Matthew Wilcox wrote:
> On Fri, Jul 21, 2017 at 06:01:41PM +0800, Zhaoyang Huang wrote:
> > we just record the cached_hole_size now, which will be used when
> > the criteria meet both of 'free_vmap_cache == NULL' and 'size <
> > cached_hole_size'. However, under above scenario, the search will
> > start from the rb_root and then find the node which just in front
> > of the cached hole.
> >
> > free_vmap_cache miss:
> > vmap_area_root
> > / \
> > _next U
> > / (T1)
> > cached_hole_node
> > /
> > ... (T2)
> > /
> > first
> >
> > vmap_area_list->first->......->cached_hole_node->cached_hole_node.list.next
> > |-------(T3)-------| | <<< cached_hole_size >>> |
> >
> > vmap_area_list->......->cached_hole_node->cached_hole_node.list.next
> > | <<< cached_hole_size >>> |
> >
> > The time cost to search the node now is T = T1 + T2 + T3.
> > The commit add a cached_hole_node here to record the one just in front of
> > the cached_hole_size, which can help to avoid walking the rb tree and
> > the list and make the T = 0;
>
> Yes, but does this matter in practice? Are there any workloads where
> this makes a difference? If so, how much?
I have already asked this and didn't get any response. There were other
versions of a similar patch without a good clarification...
Zhaoyang Huang, please try to formulate the problem you are fixing and
why. While it is clear that you add _an_ optimization it is not really
clear why we need it and whether it might adversely affect existing
workloads. I would rather not touch this code unless there is a strong
justification for it.
--
Michal Hocko
SUSE Labs
FYI, we noticed the following commit:
commit: 3fef5b604acd6e2346d932088b526891fbf93ee9 ("mm/vmalloc: add a node corresponding to cached_hole_size")
url: https://github.com/0day-ci/linux/commits/Zhaoyang-Huang/mm-vmalloc-add-a-node-corresponding-to-cached_hole_size/20170724-021305
in testcase: boot
on test machine: qemu-system-x86_64 -enable-kvm -smp 2 -m 512M
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
+----------------------------------------------------+-----------+------------+
| | v4.13-rc1 | 3fef5b604a |
+----------------------------------------------------+-----------+------------+
| boot_successes | 686 | 0 |
| boot_failures | 2369 | 32 |
| BUG:kernel_in_stage | 10 | |
| WARNING:possible_recursive_locking_detected | 2282 | |
| BUG:kernel_hang_in_test_stage | 294 | |
| INFO:creating/lkp/benchmarks/ltp/output_directory | 344 | |
| INFO:creating/lkp/benchmarks/ltp/results_directory | 344 | |
| INFO:ltp-pan_reported_some_tests_FAIL | 218 | |
| INFO:ltp-pan_reported_all_tests_PASS | 124 | |
| invoked_oom-killer:gfp_mask=0x | 4 | |
| Mem-Info | 4 | |
| BUG:unable_to_handle_kernel | 3 | |
| Oops:#[##] | 3 | |
| Kernel_panic-not_syncing:Fatal_exception | 5 | 32 |
| general_protection_fault:#[##] | 2 | |
| BUG:kernel_hang_in_boot_stage | 11 | |
| BUG:kernel_reboot-without-warning_in_test_stage | 1 | |
| kernel_BUG_at_mm/vmalloc.c | 0 | 32 |
| invalid_opcode:#[##] | 0 | 32 |
+----------------------------------------------------+-----------+------------+
[ 3.052952] kernel BUG at mm/vmalloc.c:526!
[ 3.052956] invalid opcode: 0000 [#1] SMP
[ 3.052957] Modules linked in:
[ 3.052961] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc1-00001-g3fef5b6 #5
[ 3.052963] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014
[ 3.052965] task: ffff8c443bcb8040 task.stack: ffffb9f3c0194000
[ 3.052969] RIP: 0010:alloc_vmap_area+0x2b8/0x36f
[ 3.052971] RSP: 0000:ffffb9f3c0197be8 EFLAGS: 00010206
[ 3.052974] RAX: ffffb9f3c01a9000 RBX: 0000000000002000 RCX: 0000000000000d81
[ 3.052975] RDX: 0000000000000003 RSI: 0000000000000003 RDI: ffffffff980c52a0
[ 3.052977] RBP: ffffb9f3c0197c40 R08: 0000000173ae419e R09: ffffffff97a436f8
[ 3.052979] R10: ffffb9f3c0197b50 R11: 0000000000000002 R12: 0000000000000fff
[ 3.052980] R13: ffff8c443b99dc00 R14: ffffffffc02db000 R15: 0000000000000000
[ 3.052983] FS: 0000000000000000(0000) GS:ffff8c443ca00000(0000) knlGS:0000000000000000
[ 3.052984] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3.052986] CR2: 0000000000000000 CR3: 000000001c02c000 CR4: 00000000000006f0
[ 3.052990] Call Trace:
[ 3.052999] __get_vm_area_node+0xee/0x158
[ 3.053011] ? ftrace_stub+0x5/0x5
[ 3.053014] ? ftrace_call+0x34/0x34
[ 3.053017] __vmalloc_node_range+0x67/0x226
[ 3.053021] ? arch_ftrace_update_trampoline+0xad/0x21c
[ 3.053023] ? __mutex_unlock_slowpath+0x3a/0x201
[ 3.053027] ? arch_ftrace_update_trampoline+0xad/0x21c
[ 3.053030] ? ftrace_stub+0x5/0x5
[ 3.053032] ? ftrace_call+0x34/0x34
[ 3.053035] module_alloc+0xa7/0xb7
[ 3.053039] ? arch_ftrace_update_trampoline+0xad/0x21c
[ 3.053043] arch_ftrace_update_trampoline+0xad/0x21c
[ 3.053045] ? ftrace_regs_caller+0x65/0x65
[ 3.053053] ftrace_startup+0xf4/0x19e
[ 3.053057] register_ftrace_function+0x26/0x3c
[ 3.053060] arm_kprobe+0x5d/0xc5
[ 3.053063] register_kprobe+0x3fa/0x49e
[ 3.053069] init_test_probes+0x51/0x462
[ 3.053071] ? init_test_probes+0x51/0x462
[ 3.053074] ? register_module_notifier+0x18/0x1a
[ 3.053078] init_kprobes+0x1f7/0x204
[ 3.053081] ? debugfs_kprobe_init+0xaa/0xaa
[ 3.053084] ? set_debug_rodata+0x17/0x17
[ 3.053087] do_one_initcall+0x90/0x138
[ 3.053094] ? set_debug_rodata+0x17/0x17
[ 3.053096] kernel_init_freeable+0x1c6/0x24e
[ 3.053099] ? rest_init+0xd8/0xd8
[ 3.053103] kernel_init+0xe/0xfa
[ 3.053106] ret_from_fork+0x2a/0x40
[ 3.053114] Code: 67 eb ff ff 49 8d 45 18 48 c7 c7 a0 52 0c 98 48 89 05 ab e5 6a 04 e8 e2 07 de 01 49 8b 45 00 49 85 c4 74 02 0f 0b 49 39 c6 76 02 <0f> 0b 48 8b 5d b8 49 3b 5d 08 4c 89 e8 0f 83 95 00 00 00 0f 0b
[ 3.053163] RIP: alloc_vmap_area+0x2b8/0x36f RSP: ffffb9f3c0197be8
[ 3.053170] ---[ end trace fdb091fa3551ba62 ]---
To reproduce:
git clone https://github.com/01org/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> job-script # job-script is attached in this email
Thanks,
Xiaolong