This series is a variant of Linus's jiffies based caching approach from the
"get_vmalloc_info() and /proc/meminfo insanely expensive"
thread on lkml.
The idea is to track modifications to the vmalloc list by wrapping the
lock/unlock primitives, and to put a flag next to the spinlock. If the
spinlock is taken then it's cheap to modify this flag, and if it has
not been taken (the cached case) it is, in essence, a read-mostly
variable for every CPU.
It seems to work for me, but it's only very (very!) lightly tested.
Would something like this be acceptable (and is it correct)?
Thanks,
Ingo
Ingo Molnar (3):
mm/vmalloc: Abstract out vmap_area_lock lock/unlock operations
mm/vmalloc: Track vmalloc info changes
mm/vmalloc: Cache the vmalloc memory info
mm/vmalloc.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
1 file changed, 57 insertions(+), 25 deletions(-)
--
2.1.4
We want to add some extra cache invalidation logic to vmalloc()
area list unlocks - for that first abstract away the vmap_area_lock
operations into vmap_lock()/vmap_unlock().
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
mm/vmalloc.c | 55 +++++++++++++++++++++++++++++++++----------------------
1 file changed, 33 insertions(+), 22 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2faaa2976447..605138083880 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -277,6 +277,17 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
+
+static inline void vmap_lock(void)
+{
+ spin_lock(&vmap_area_lock);
+}
+
+static inline void vmap_unlock(void)
+{
+ spin_unlock(&vmap_area_lock);
+}
+
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
@@ -373,7 +384,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
retry:
- spin_lock(&vmap_area_lock);
+ vmap_lock();
/*
* Invalidate cache if we have more permissive parameters.
* cached_hole_size notes the largest hole noticed _below_
@@ -452,7 +463,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->flags = 0;
__insert_vmap_area(va);
free_vmap_cache = &va->rb_node;
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
BUG_ON(va->va_start & (align-1));
BUG_ON(va->va_start < vstart);
@@ -461,7 +472,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return va;
overflow:
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
@@ -514,9 +525,9 @@ static void __free_vmap_area(struct vmap_area *va)
*/
static void free_vmap_area(struct vmap_area *va)
{
- spin_lock(&vmap_area_lock);
+ vmap_lock();
__free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
}
/*
@@ -642,10 +653,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
flush_tlb_kernel_range(*start, *end);
if (nr) {
- spin_lock(&vmap_area_lock);
+ vmap_lock();
list_for_each_entry_safe(va, n_va, &valist, purge_list)
__free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
}
spin_unlock(&purge_lock);
}
@@ -707,9 +718,9 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
{
struct vmap_area *va;
- spin_lock(&vmap_area_lock);
+ vmap_lock();
va = __find_vmap_area(addr);
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
return va;
}
@@ -1304,14 +1315,14 @@ EXPORT_SYMBOL_GPL(map_vm_area);
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
- spin_lock(&vmap_area_lock);
+ vmap_lock();
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
va->flags |= VM_VM_AREA;
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
}
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -1433,10 +1444,10 @@ struct vm_struct *remove_vm_area(const void *addr)
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
- spin_lock(&vmap_area_lock);
+ vmap_lock();
va->vm = NULL;
va->flags &= ~VM_VM_AREA;
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
vmap_debug_free_range(va->va_start, va->va_end);
kasan_free_shadow(vm);
@@ -2008,7 +2019,7 @@ long vread(char *buf, char *addr, unsigned long count)
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- spin_lock(&vmap_area_lock);
+ vmap_lock();
list_for_each_entry(va, &vmap_area_list, list) {
if (!count)
break;
@@ -2040,7 +2051,7 @@ long vread(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
if (buf == buf_start)
return 0;
@@ -2090,7 +2101,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
buflen = count;
- spin_lock(&vmap_area_lock);
+ vmap_lock();
list_for_each_entry(va, &vmap_area_list, list) {
if (!count)
break;
@@ -2121,7 +2132,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
if (!copied)
return 0;
return buflen;
@@ -2435,7 +2446,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
goto err_free;
}
retry:
- spin_lock(&vmap_area_lock);
+ vmap_lock();
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
@@ -2457,7 +2468,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
* comparing.
*/
if (base + last_end < vmalloc_start + last_end) {
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
if (!purged) {
purge_vmap_area_lazy();
purged = true;
@@ -2512,7 +2523,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
vmap_area_pcpu_hole = base + offsets[last_area];
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
/* insert all vm's */
for (area = 0; area < nr_vms; area++)
@@ -2557,7 +2568,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
loff_t n = *pos;
struct vmap_area *va;
- spin_lock(&vmap_area_lock);
+ vmap_lock();
va = list_entry((&vmap_area_list)->next, typeof(*va), list);
while (n > 0 && &va->list != &vmap_area_list) {
n--;
@@ -2585,7 +2596,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
static void s_stop(struct seq_file *m, void *p)
__releases(&vmap_area_lock)
{
- spin_unlock(&vmap_area_lock);
+ vmap_unlock();
}
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
--
2.1.4
Add a 'vmap_info_changed' flag to track changes to vmalloc()
statistics.
For simplicity this flag is set every time we unlock the
vmap_area_lock.
This flag is not yet used.
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
mm/vmalloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 605138083880..d21febaa557a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -276,7 +276,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
-static DEFINE_SPINLOCK(vmap_area_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vmap_area_lock);
+
+static int vmap_info_changed;
static inline void vmap_lock(void)
{
@@ -285,6 +287,7 @@ static inline void vmap_lock(void)
static inline void vmap_unlock(void)
{
+ vmap_info_changed = 1;
spin_unlock(&vmap_area_lock);
}
--
2.1.4
Linus reported that glibc (rather stupidly) reads /proc/meminfo
for every sysinfo() call, which causes the Git build to use
a surprising amount of CPU time, mostly due to the overhead
of get_vmalloc_info() - which walks a long list to do its
statistics.
Modify Linus's jiffies based patch to use the newly introduced
vmap_info_changed flag instead: when we cache the vmalloc-info,
we clear the flag. If the flag gets re-set then we'll calculate
the information again.
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
mm/vmalloc.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d21febaa557a..ef48e557df5a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2702,7 +2702,7 @@ static int __init proc_vmalloc_init(void)
}
module_init(proc_vmalloc_init);
-void get_vmalloc_info(struct vmalloc_info *vmi)
+static void calc_vmalloc_info(struct vmalloc_info *vmi)
{
struct vmap_area *va;
unsigned long free_area_size;
@@ -2749,5 +2749,23 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
out:
rcu_read_unlock();
}
-#endif
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ static struct vmalloc_info cached_info;
+
+ if (!vmap_info_changed) {
+ *vmi = cached_info;
+ return;
+ }
+
+ WRITE_ONCE(vmap_info_changed, 0);
+ barrier();
+
+ calc_vmalloc_info(vmi);
+
+ barrier();
+ cached_info = *vmi;
+}
+
+#endif
--
2.1.4
On Sat, Aug 22, 2015 at 3:44 AM, Ingo Molnar <[email protected]> wrote:
>
> Would something like this be acceptable (and is it correct)?
I don't think any of this can be called "correct", in that the
unlocked accesses to the cached state are clearly racy, but I think
it's very much "acceptable".
Linus
Linus wrote:
> I don't think any of this can be called "correct", in that the
> unlocked accesses to the cached state are clearly racy, but I think
> it's very much "acceptable".
I'd think you could easily fix that with a seqlock-like system.
What makes it so simple is that you can always fall back to
calc_vmalloc_info if there's any problem, rather than looping or blocking.
The basic idea is that you have a seqlock counter, but if either of
the two lsbits are set, the cached information is stale.
Basically, you need a seqlock and a spinlock. The seqlock does
most of the work, and the spinlock ensures that there's only one
updater of the cache.
vmap_unlock() does set_bit(0, &seq->sequence). This marks the information
as stale.
get_vmalloc_info reads the seqlock. There are two cases:
- If the two lsbits are 10, the cached information is valid.
Copy it out, re-check the seqlock, and loop if the sequence
number changes.
- In any other case, the cached information is
not valid.
- Try to obtain the spinlock. Do not block if it's unavailable.
- If the lock is acquired:
- Set the sequence to (sequence | 3) + 1 (we're the only writer)
- This bumps the sequence number and leaves the lsbits at 00 (invalid)
- Memory barrier TBD. Do the RCU ops in calc_vmalloc_info do it for us?
- Call calc_vmalloc_info
- If we obtained the spinlock earlier:
- Copy our vmi to cached_info
- smp_wmb()
- set_bit(1, &seq->sequence). This marks the information as valid,
as long as bit 0 is still clear.
- Release the spinlock.
Basically, bit 0 says "vmalloc info has changed", and bit 1 says
"vmalloc cache has been updated". This clears bit 0 before
starting the update so that an update during calc_vmalloc_info
will force a new update.
So the four cases are basically:
00 - calc_vmalloc_info() in progress
01 - vmap_unlock() during calc_vmalloc_info()
10 - cached_info is valid
11 - vmap_unlock has invalidated cached_info, awaiting refresh
Logically, the sequence number should be initialized to ...01,
but the code above handles 00 okay.
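In rough (and completely untested) code, with made-up names for the new
variables, the whole thing would look something like this:

static unsigned long vmap_info_seq = 1;		/* starts out stale (...01) */
static DEFINE_SPINLOCK(vmap_info_update_lock);	/* serializes cache updates */
static struct vmalloc_info vmap_info_cache;

static inline void vmap_unlock(void)
{
	set_bit(0, &vmap_info_seq);		/* mark the cached info stale */
	spin_unlock(&vmap_area_lock);
}

void get_vmalloc_info(struct vmalloc_info *vmi)
{
	unsigned long seq, old;
	bool locked;

	seq = READ_ONCE(vmap_info_seq);

	while ((seq & 3) == 2) {		/* lsbits 10: cache is valid */
		smp_rmb();
		*vmi = vmap_info_cache;
		smp_rmb();
		old = seq;
		seq = READ_ONCE(vmap_info_seq);
		if (seq == old)			/* nothing changed: done */
			return;
	}

	/* The cache is stale: recompute, and refresh the cache if we can. */
	locked = spin_trylock(&vmap_info_update_lock);
	if (locked) {
		/* We're the only writer: bump the sequence, lsbits go to 00 */
		WRITE_ONCE(vmap_info_seq, (seq | 3) + 1);
		/* (memory barrier TBD, as noted above) */
	}

	calc_vmalloc_info(vmi);

	if (locked) {
		vmap_info_cache = *vmi;
		smp_wmb();
		/* Mark valid (10) - unless bit 0 got set again meanwhile */
		set_bit(1, &vmap_info_seq);
		spin_unlock(&vmap_info_update_lock);
	}
}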
* George Spelvin <[email protected]> wrote:
> Linus wrote:
> > I don't think any of this can be called "correct", in that the
> > unlocked accesses to the cached state are clearly racy, but I think
> > it's very much "acceptable".
>
> I'd think you could easily fix that with a seqlock-like system.
>
> What makes it so simple is that you can always fall back to
> calc_vmalloc_info if there's any problem, rather than looping or blocking.
>
> The basic idea is that you have a seqlock counter, but if either of
> the two lsbits are set, the cached information is stale.
>
> Basically, you need a seqlock and a spinlock. The seqlock does
> most of the work, and the spinlock ensures that there's only one
> updater of the cache.
>
> vmap_unlock() does set_bit(0, &seq->sequence). This marks the information
> as stale.
>
> get_vmalloc_info reads the seqlock. There are two cases:
> - If the two lsbits are 10, the cached information is valid.
> Copy it out, re-check the seqlock, and loop if the sequence
> number changes.
> - In any other case, the cached information is
> not valid.
> - Try to obtain the spinlock. Do not block if it's unavailable.
> - If the lock is acquired:
> - Set the sequence to (sequence | 3) + 1 (we're the only writer)
> - This bumps the sequence number and leaves the lsbits at 00 (invalid)
> - Memory barrier TBD. Do the RCU ops in calc_vmalloc_info do it for us?
> - Call calc_vmalloc_info
> - If we obtained the spinlock earlier:
> - Copy our vmi to cached_info
> - smp_wmb()
> - set_bit(1, &seq->sequence). This marks the information as valid,
> as long as bit 0 is still clear.
> - Release the spinlock.
>
> Basically, bit 0 says "vmalloc info has changed", and bit 1 says
> "vmalloc cache has been updated". This clears bit 0 before
> starting the update so that an update during calc_vmalloc_info
> will force a new update.
>
> So the four cases are basically:
> 00 - calc_vmalloc_info() in progress
> 01 - vmap_unlock() during calc_vmalloc_info()
> 10 - cached_info is valid
> 11 - vmap_unlock has invalidated cached_info, awaiting refresh
>
> Logically, the sequence number should be initialized to ...01,
> but the code above handles 00 okay.
I think this is too complex.
How about something simple like the patch below (on top of the third patch)?
It makes the vmalloc info transactional - /proc/meminfo will always print a
consistent set of numbers. (Not that we really care about races there, but it
looks really simple to solve so why not.)
( I also moved the function-static cache next to the flag and seqlock - this
should further compress the cache footprint. )
Have I missed anything? Very lightly tested: booted in a VM.
Thanks,
Ingo
=========================>
mm/vmalloc.c | 23 ++++++++++++++++++-----
1 file changed, 18 insertions(+), 5 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ef48e557df5a..66726f41e726 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -278,7 +278,15 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vmap_area_lock);
+/*
+ * Seqlock and flag for the vmalloc info cache printed in /proc/meminfo.
+ *
+ * The assumption of the optimization is that it's read frequently, but
+ * modified infrequently.
+ */
+static DEFINE_SEQLOCK(vmap_info_lock);
static int vmap_info_changed;
+static struct vmalloc_info vmap_info_cache;
static inline void vmap_lock(void)
{
@@ -2752,10 +2760,14 @@ static void calc_vmalloc_info(struct vmalloc_info *vmi)
void get_vmalloc_info(struct vmalloc_info *vmi)
{
- static struct vmalloc_info cached_info;
+ if (!READ_ONCE(vmap_info_changed)) {
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&vmap_info_lock);
+ *vmi = vmap_info_cache;
+ } while (read_seqretry(&vmap_info_lock, seq));
- if (!vmap_info_changed) {
- *vmi = cached_info;
return;
}
@@ -2764,8 +2776,9 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
calc_vmalloc_info(vmi);
- barrier();
- cached_info = *vmi;
+ write_seqlock(&vmap_info_lock);
+ vmap_info_cache = *vmi;
+ write_sequnlock(&vmap_info_lock);
}
#endif
Ingo Molnar <[email protected]> wrote:
> I think this is too complex.
>
> How about something simple like the patch below (on top of the third patch)?
> It makes the vmalloc info transactional - /proc/meminfo will always print a
> consistent set of numbers. (Not that we really care about races there, but it
> looks really simple to solve so why not.)
Looks like a huge simplification!
It needs a comment about the approximate nature of the locking and
the obvious race conditions:
1) The first caller to get_vmalloc_info() clears vmap_info_changed
before updating vmap_info_cache, so a second caller is likely to
get stale data for the duration of a calc_vmalloc_info call.
2) Although unlikely, it's possible for two threads to race calling
calc_vmalloc_info, and the one that computes fresher data updates
the cache first, so the later write leaves stale data.
Other issues:
3) Me, I'd make vmap_info_changed a bool, for documentation more than
any space saving.
4) I wish there were a trylock version of write_seqlock, so we could
avoid blocking entirely. (You *could* hand-roll it, but that eats
into the simplicity.)
* George Spelvin <[email protected]> wrote:
> Ingo Molnar <[email protected]> wrote:
> > I think this is too complex.
> >
> > How about something simple like the patch below (on top of the third patch)?
>
> > It makes the vmalloc info transactional - /proc/meminfo will always print a
> > consistent set of numbers. (Not that we really care about races there, but it
> > looks really simple to solve so why not.)
>
> Looks like a huge simplification!
>
> It needs a comment about the approximate nature of the locking and
> the obvious race conditions:
> 1) The first caller to get_vmalloc_info() clears vmap_info_changed
> before updating vmap_info_cache, so a second caller is likely to
> get stale data for the duration of a calc_vmalloc_info call.
> 2) Although unlikely, it's possible for two threads to race calling
> calc_vmalloc_info, and the one that computes fresher data updates
> the cache first, so the later write leaves stale data.
>
> Other issues:
> 3) Me, I'd make vmap_info_changed a bool, for documentation more than
> any space saving.
> 4) I wish there were a trylock version of write_seqlock, so we could
> avoid blocking entirely. (You *could* hand-roll it, but that eats
> into the simplicity.)
Ok, fair enough - so how about the attached approach instead, which uses a 64-bit
generation counter to track changes to the vmalloc state.
This is still very simple, but should not suffer from stale data being returned
indefinitely in /proc/meminfo. We might race - that was true before as well,
due to the lock-less RCU list walk - but we'll always return a correct and
consistent version of the information.
Lightly tested. This is a replacement patch to make it easier to read via email.
I also made sure there's no extra overhead in the !CONFIG_PROC_FS case.
Note that there's an even simpler variant possible I think: we could use just the
two generation counters and barriers to remove the seqlock.
Thanks,
Ingo
==============================>
>From f9fd770e75e2edb4143f32ced0b53d7a77969c94 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <[email protected]>
Date: Sat, 22 Aug 2015 12:28:01 +0200
Subject: [PATCH] mm/vmalloc: Cache the vmalloc memory info
Linus reported that glibc (rather stupidly) reads /proc/meminfo
for every sysinfo() call, which causes the Git build to use
a surprising amount of CPU time, mostly due to the overhead
of get_vmalloc_info() - which walks a long list to do its
statistics.
Modify Linus's jiffies based patch to use generation counters
to cache the vmalloc info: vmap_unlock() increases the generation
counter, and get_vmalloc_info() reads it and compares it
against a cached generation counter.
Also use a seqlock to make sure we always print a consistent
set of vmalloc statistics.
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
mm/vmalloc.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 56 insertions(+), 3 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 605138083880..d72b23436906 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -276,7 +276,21 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
-static DEFINE_SPINLOCK(vmap_area_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vmap_area_lock);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * A seqlock and two generation counters for a simple cache of the
+ * vmalloc allocation statistics info printed in /proc/meminfo.
+ *
+ * ( The assumption of the optimization is that it's read frequently, but
+ * modified infrequently. )
+ */
+static DEFINE_SEQLOCK(vmap_info_lock);
+static u64 vmap_info_gen;
+static u64 vmap_info_cache_gen;
+static struct vmalloc_info vmap_info_cache;
+#endif
static inline void vmap_lock(void)
{
@@ -285,6 +299,9 @@ static inline void vmap_lock(void)
static inline void vmap_unlock(void)
{
+#ifdef CONFIG_PROC_FS
+ WRITE_ONCE(vmap_info_gen, vmap_info_gen+1);
+#endif
spin_unlock(&vmap_area_lock);
}
@@ -2699,7 +2716,7 @@ static int __init proc_vmalloc_init(void)
}
module_init(proc_vmalloc_init);
-void get_vmalloc_info(struct vmalloc_info *vmi)
+static void calc_vmalloc_info(struct vmalloc_info *vmi)
{
struct vmap_area *va;
unsigned long free_area_size;
@@ -2746,5 +2763,41 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
out:
rcu_read_unlock();
}
-#endif
+/*
+ * Return a consistent snapshot of the current vmalloc allocation
+ * statistics, for /proc/meminfo:
+ */
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ u64 gen = READ_ONCE(vmap_info_gen);
+
+ /*
+ * If the generation counter of the cache matches that of
+ * the vmalloc generation counter then return the cache:
+ */
+ if (READ_ONCE(vmap_info_cache_gen) == gen) {
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&vmap_info_lock);
+ *vmi = vmap_info_cache;
+ } while (read_seqretry(&vmap_info_lock, seq));
+
+ return;
+ }
+
+ calc_vmalloc_info(vmi);
+
+ /*
+ * If we are racing with a new vmalloc() then we might write
+ * the old generation counter here - and the next call to
+ * get_vmalloc_info() will fix things up:
+ */
+ write_seqlock(&vmap_info_lock);
+ vmap_info_cache = *vmi;
+ WRITE_ONCE(vmap_info_cache_gen, gen);
+ write_sequnlock(&vmap_info_lock);
+}
+
+#endif /* CONFIG_PROC_FS */
On Sun, Aug 23 2015, Ingo Molnar <[email protected]> wrote:
> Ok, fair enough - so how about the attached approach instead, which
> uses a 64-bit generation counter to track changes to the vmalloc
> state.
How does this invalidation approach compare to the jiffies approach? In
other words, how often does the vmalloc info actually change (or rather,
in this approximation, how often is vmap_area_lock taken)? In
particular, does it also solve the problem with git's test suite and
similar situations with lots of short-lived processes?
> ==============================>
> From f9fd770e75e2edb4143f32ced0b53d7a77969c94 Mon Sep 17 00:00:00 2001
> From: Ingo Molnar <[email protected]>
> Date: Sat, 22 Aug 2015 12:28:01 +0200
> Subject: [PATCH] mm/vmalloc: Cache the vmalloc memory info
>
> Linus reported that glibc (rather stupidly) reads /proc/meminfo
> for every sysinfo() call,
Not quite: It is done by the two get_{av,}phys_pages functions;
and get_phys_pages is called (once per process) by glibc's
qsort implementation. In fact, sysinfo() is (at least part of) the cure,
not the disease. Whether qsort should care about the total amount of
memory is another discussion.
<http://thread.gmane.org/gmane.comp.lib.glibc.alpha/54342/focus=54558>
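For illustration only - this is not glibc's actual code - the sysinfo()
route needs no /proc parsing at all; something like:

#include <sys/sysinfo.h>
#include <unistd.h>

/* Hypothetical stand-in for a /proc/meminfo-parsing get_phys_pages(): */
static long phys_pages_via_sysinfo(void)
{
	struct sysinfo info;
	unsigned long long bytes;

	if (sysinfo(&info) != 0)
		return -1;

	/* totalram is expressed in units of mem_unit bytes */
	bytes = (unsigned long long)info.totalram * info.mem_unit;

	return (long)(bytes / sysconf(_SC_PAGESIZE));
}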
Rasmus
I was curious why these fields were ever added to /proc/meminfo, and dug
up this:
commit d262ee3ee6ba4f5f6125571d93d9d63191d2ef76
Author: Andrew Morton <[email protected]>
Date: Sat Apr 12 12:59:04 2003 -0700
[PATCH] vmalloc stats in /proc/meminfo
From: Matt Porter <[email protected]>
There was a thread a while back on lkml where Dave Hansen proposed this
simple vmalloc usage reporting patch. The thread pretty much died out as
most people seemed focused on what VM loading type bugs it could solve. I
had posted that this type of information was really valuable in debugging
embedded Linux board ports. A common example is where people do arch
specific setup that limits there vmalloc space and then they find modules
won't load. ;) Having the Vmalloc* info readily available is real useful in
helping folks to fix their kernel ports.
That thread is at <http://thread.gmane.org/gmane.linux.kernel/53360>.
[Maybe one could just remove the fields and see if anybody actually
notices/cares any longer. Or, if they are only used by kernel
developers, put them in their own file.]
Rasmus
First, an actual, albeit minor, bug: initializing both vmap_info_gen
and vmap_info_cache_gen to 0 marks the cache as valid, which it's not.
vmap_info_gen should be initialized to 1 to force an initial
cache update.
Second, I don't see why you need a 64-bit counter. Seqlocks consider
32 bits (31 bits, actually, the lsbit means "update in progress") quite
a strong enough guarantee.
Third, it seems as though vmap_info_cache_gen is basically a duplicate
of vmap_info_lock.sequence. It should be possible to make one variable
serve both purposes.
You just need a kludge to handle the case of multiple vmap_info updates
between cache updates.
There are two simple ones:
1) Avoid bumping vmap_info_gen unnecessarily. In vmap_unlock(), do
vmap_info_gen = (vmap_info_lock.sequence | 1) + 1;
2) - Make vmap_info_gen a seqcount_t
- In vmap_unlock(), do write_seqcount_barrier(&vmap_info_gen)
- In get_vmalloc_info, inside the seqlock critical section, do
vmap_info_lock.seqcount.sequence = vmap_info_gen.sequence - 1;
(Using the vmap_info_gen.sequence read while validating the
cache in the first place.)
I should try to write an actual patch illustrating this.
* Rasmus Villemoes <[email protected]> wrote:
> On Sun, Aug 23 2015, Ingo Molnar <[email protected]> wrote:
>
> > Ok, fair enough - so how about the attached approach instead, which
> > uses a 64-bit generation counter to track changes to the vmalloc
> > state.
>
> How does this invalidation approach compare to the jiffies approach? In
> other words, how often does the vmalloc info actually change (or rather,
> in this approximation, how often is vmap_area_lock taken)? In
> particular, does it also solve the problem with git's test suite and
> similar situations with lots of short-lived processes?
The two approaches are pretty similar, and in a typical distro with a typical
workload vmalloc() is mostly a boot time affair.
But vmalloc() can be used more often in certain corner cases - neither of the
patches makes that in any way slower, just the optimization won't trigger that
often.
Since vmalloc() use is suboptimal for several reasons (it does not use large pages
for kernel space allocations, etc.), this is all pretty OK IMHO.
> > ==============================>
> > From f9fd770e75e2edb4143f32ced0b53d7a77969c94 Mon Sep 17 00:00:00 2001
> > From: Ingo Molnar <[email protected]>
> > Date: Sat, 22 Aug 2015 12:28:01 +0200
> > Subject: [PATCH] mm/vmalloc: Cache the vmalloc memory info
> >
> > Linus reported that glibc (rather stupidly) reads /proc/meminfo
> > for every sysinfo() call,
>
> Not quite: It is done by the two get_{av,}phys_pages functions;
> and get_phys_pages is called (once per process) by glibc's
> qsort implementation. In fact, sysinfo() is (at least part of) the cure,
> not the disease. Whether qsort should care about the total amount of
> memory is another discussion.
>
> <http://thread.gmane.org/gmane.comp.lib.glibc.alpha/54342/focus=54558>
Thanks, is the fixed up changelog below better?
Ingo
===============>
mm/vmalloc: Cache the vmalloc memory info
Linus reported that for scripting-intense workloads such as the
Git build, glibc's qsort will read /proc/meminfo for every process
created (by way of get_phys_pages()), which causes the Git build
to generate a surprising amount of kernel overhead.
A fair chunk of the overhead is due to get_vmalloc_info() - which
walks a potentially long list to do its statistics.
Modify Linus's jiffies based patch to use generation counters
to cache the vmalloc info: vmap_unlock() increases the generation
counter, and get_vmalloc_info() reads it and compares it
against a cached generation counter.
Also use a seqlock to make sure we always print a consistent
set of vmalloc statistics.
Reported-by: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
* Rasmus Villemoes <[email protected]> wrote:
> I was curious why these fields were ever added to /proc/meminfo, and dug
> up this:
>
> commit d262ee3ee6ba4f5f6125571d93d9d63191d2ef76
> Author: Andrew Morton <[email protected]>
> Date: Sat Apr 12 12:59:04 2003 -0700
>
> [PATCH] vmalloc stats in /proc/meminfo
>
> From: Matt Porter <[email protected]>
>
> There was a thread a while back on lkml where Dave Hansen proposed this
> simple vmalloc usage reporting patch. The thread pretty much died out as
> most people seemed focused on what VM loading type bugs it could solve. I
> had posted that this type of information was really valuable in debugging
> embedded Linux board ports. A common example is where people do arch
> specific setup that limits their vmalloc space and then they find modules
> won't load. ;) Having the Vmalloc* info readily available is real useful in
> helping folks to fix their kernel ports.
>
> That thread is at <http://thread.gmane.org/gmane.linux.kernel/53360>.
>
> [Maybe one could just remove the fields and see if anybody actually
> notices/cares any longer. Or, if they are only used by kernel
> developers, put them in their own file.]
So instead of removing the fields (which I'm quite sure is an ABI breaker as it
could break less robust /proc/meminfo parsers and scripts), we could just report
'0' all the time - and have the real info somewhere else?
Thanks,
Ingo
* George Spelvin <[email protected]> wrote:
> First, an actual, albeit minor, bug: initializing both vmap_info_gen
> and vmap_info_cache_gen to 0 marks the cache as valid, which it's not.
Ha! :-) Fixed.
> vmap_info_gen should be initialized to 1 to force an initial
> cache update.
Yeah.
> Second, I don't see why you need a 64-bit counter. Seqlocks consider
> 32 bits (31 bits, actually, the lsbit means "update in progress") quite
> a strong enough guarantee.
Just out of general paranoia - but you are right, and this would lower the
overhead on 32-bit SMP platforms a bit, plus it avoids 64-bit word tearing
artifacts on 32-bit platforms as well.
I modified it to u32.
> Third, it seems as though vmap_info_cache_gen is basically a duplicate
> of vmap_info_lock.sequence. It should be possible to make one variable
> serve both purposes.
Correct, I alluded to that in my description:
> > Note that there's an even simpler variant possible I think: we could use just
> > the two generation counters and barriers to remove the seqlock.
> You just need a kludge to handle the case of multiple vmap_info updates
> between cache updates.
>
> There are two simple ones:
>
> 1) Avoid bumping vmap_info_gen unnecessarily. In vmap_unlock(), do
> vmap_info_gen = (vmap_info_lock.sequence | 1) + 1;
> 2) - Make vmap_info_gen a seqcount_t
> - In vmap_unlock(), do write_seqcount_barrier(&vmap_info_gen)
> - In get_vmalloc_info, inside the seqlock critical section, do
> vmap_info_lock.seqcount.sequence = vmap_info_gen.sequence - 1;
> (Using the vmap_info_gen.sequence read while validating the
> cache in the first place.)
>
> I should try to write an actual patch illustrating this.
So I think something like the patch below is even simpler than trying to kludge
generation counter semantics into seqcounts.
I used two generation counters and a spinlock. The fast path is completely
lockless and lightweight on modern SMP platforms (where smp_rmb() is a no-op
or very cheap).
There's not even a seqlock retry loop; instead, an invalid cache causes us to fall
back to the old behavior - and the freshest result is guaranteed to end up in the
cache.
The linecount got a bit larger: but half of it is comments.
Note that the generation counters are signed integers so that this comparison can
be done:
+ if (gen-vmap_info_cache_gen > 0) {
Thanks,
Ingo
======================>
>From 1a4c168a22cc302282cbd1bf503ecfc4dc52b74f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <[email protected]>
Date: Sat, 22 Aug 2015 12:28:01 +0200
Subject: [PATCH] mm/vmalloc: Cache the vmalloc memory info
Linus reported that for scripting-intense workloads such as the
Git build, glibc's qsort will read /proc/meminfo for every process
created (by way of get_phys_pages()), which causes the Git build
to generate a surprising amount of kernel overhead.
A fair chunk of the overhead is due to get_vmalloc_info() - which
walks a potentially long list to do its statistics.
Modify Linus's jiffies based patch to use generation counters
to cache the vmalloc info: vmap_unlock() increases the generation
counter, and get_vmalloc_info() reads it and compares it
against a cached generation counter.
Also use a spinlock to make sure we always print a consistent
set of vmalloc statistics.
Reported-by: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
mm/vmalloc.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 80 insertions(+), 3 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 605138083880..23df06ebb48a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -276,7 +276,21 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
-static DEFINE_SPINLOCK(vmap_area_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vmap_area_lock);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * A spinlock and two generation counters for a simple cache of the
+ * vmalloc allocation statistics info printed in /proc/meminfo.
+ *
+ * ( The assumption of the optimization is that it's read frequently, but
+ * modified infrequently. )
+ */
+static DEFINE_SPINLOCK(vmap_info_lock);
+static int vmap_info_gen = 1;
+static int vmap_info_cache_gen;
+static struct vmalloc_info vmap_info_cache;
+#endif
static inline void vmap_lock(void)
{
@@ -285,6 +299,9 @@ static inline void vmap_lock(void)
static inline void vmap_unlock(void)
{
+#ifdef CONFIG_PROC_FS
+ WRITE_ONCE(vmap_info_gen, vmap_info_gen+1);
+#endif
spin_unlock(&vmap_area_lock);
}
@@ -2699,7 +2716,7 @@ static int __init proc_vmalloc_init(void)
}
module_init(proc_vmalloc_init);
-void get_vmalloc_info(struct vmalloc_info *vmi)
+static void calc_vmalloc_info(struct vmalloc_info *vmi)
{
struct vmap_area *va;
unsigned long free_area_size;
@@ -2746,5 +2763,65 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
out:
rcu_read_unlock();
}
-#endif
+/*
+ * Return a consistent snapshot of the current vmalloc allocation
+ * statistics, for /proc/meminfo:
+ */
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ int gen = READ_ONCE(vmap_info_gen);
+
+ /*
+ * If the generation counter of the cache matches that of
+ * the vmalloc generation counter then return the cache:
+ */
+ if (READ_ONCE(vmap_info_cache_gen) == gen) {
+ int gen_after;
+
+ /*
+ * The two read barriers make sure that we read
+ * 'gen', 'vmap_info_cache' and 'gen_after' in
+ * precisely that order:
+ */
+ smp_rmb();
+ *vmi = vmap_info_cache;
+
+ smp_rmb();
+ gen_after = READ_ONCE(vmap_info_gen);
+
+ /* The cache is still valid: */
+ if (gen == gen_after)
+ return;
+
+ /* Ok, the cache got invalidated just now, regenerate it */
+ gen = gen_after;
+ }
+
+ /* Make sure 'gen' is read before the vmalloc info */
+ smp_rmb();
+
+ calc_vmalloc_info(vmi);
+
+ /*
+ * All updates to vmap_info_cache_gen go through this spinlock,
+ * so when the cache got invalidated, we'll only mark it valid
+ * again if we first fully write the new vmap_info_cache.
+ *
+ * This ensures that partial results won't be used.
+ */
+ spin_lock(&vmap_info_lock);
+ if (gen-vmap_info_cache_gen > 0) {
+ vmap_info_cache = *vmi;
+ /*
+ * Make sure the new cached data is visible before
+ * the generation counter update:
+ */
+ smp_wmb();
+
+ WRITE_ONCE(vmap_info_cache_gen, gen);
+ }
+ spin_unlock(&vmap_info_lock);
+}
+
+#endif /* CONFIG_PROC_FS */
* Ingo Molnar <[email protected]> wrote:
> +/*
> + * Return a consistent snapshot of the current vmalloc allocation
> + * statistics, for /proc/meminfo:
> + */
> +void get_vmalloc_info(struct vmalloc_info *vmi)
> +{
> + int gen = READ_ONCE(vmap_info_gen);
> +
> + /*
> + * If the generation counter of the cache matches that of
> + * the vmalloc generation counter then return the cache:
> + */
> + if (READ_ONCE(vmap_info_cache_gen) == gen) {
> + int gen_after;
> +
> + /*
> + * The two read barriers make sure that we read
> + * 'gen', 'vmap_info_cache' and 'gen_after' in
> + * precisely that order:
> + */
> + smp_rmb();
> + *vmi = vmap_info_cache;
> +
> + smp_rmb();
> + gen_after = READ_ONCE(vmap_info_gen);
> +
> + /* The cache is still valid: */
> + if (gen == gen_after)
> + return;
> +
> + /* Ok, the cache got invalidated just now, regenerate it */
> + gen = gen_after;
> + }
One more detail: I just realized that with the read barriers, the READ_ONCE()
accesses are not needed anymore - the barriers and the control dependencies are
enough.
This will further simplify the code.
Thanks,
Ingo
* Ingo Molnar <[email protected]> wrote:
> One more detail: I just realized that with the read barriers, the READ_ONCE()
> accesses are not needed anymore - the barriers and the control dependencies are
> enough.
>
> This will further simplify the code.
I.e. something like the updated patch below. (We still need the WRITE_ONCE() for
vmap_info_gen update.)
Thanks,
Ingo
========================>
>From 46a0507e0a395a7bc2fe4b46a4766e7457ac0140 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <[email protected]>
Date: Sat, 22 Aug 2015 12:28:01 +0200
Subject: [PATCH] mm/vmalloc: Cache the vmalloc memory info
Linus reported that for scripting-intense workloads such as the
Git build, glibc's qsort will read /proc/meminfo for every process
created (by way of get_phys_pages()), which causes the Git build
to generate a surprising amount of kernel overhead.
A fair chunk of the overhead is due to get_vmalloc_info() - which
walks a potentially long list to do its statistics.
Modify Linus's jiffies based patch to use generation counters
to cache the vmalloc info: vmap_unlock() increases the generation
counter, and get_vmalloc_info() reads it and compares it
against a cached generation counter.
Also use a spinlock to make sure we always print a consistent
set of vmalloc statistics.
Reported-by: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
mm/vmalloc.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 79 insertions(+), 3 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 605138083880..2f8d9660e007 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -276,7 +276,21 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
-static DEFINE_SPINLOCK(vmap_area_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vmap_area_lock);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * A spinlock and two generation counters for a simple cache of the
+ * vmalloc allocation statistics info printed in /proc/meminfo.
+ *
+ * ( The assumption of the optimization is that it's read frequently, but
+ * modified infrequently. )
+ */
+static DEFINE_SPINLOCK(vmap_info_lock);
+static int vmap_info_gen = 1;
+static int vmap_info_cache_gen;
+static struct vmalloc_info vmap_info_cache;
+#endif
static inline void vmap_lock(void)
{
@@ -285,6 +299,9 @@ static inline void vmap_lock(void)
static inline void vmap_unlock(void)
{
+#ifdef CONFIG_PROC_FS
+ WRITE_ONCE(vmap_info_gen, vmap_info_gen+1);
+#endif
spin_unlock(&vmap_area_lock);
}
@@ -2699,7 +2716,7 @@ static int __init proc_vmalloc_init(void)
}
module_init(proc_vmalloc_init);
-void get_vmalloc_info(struct vmalloc_info *vmi)
+static void calc_vmalloc_info(struct vmalloc_info *vmi)
{
struct vmap_area *va;
unsigned long free_area_size;
@@ -2746,5 +2763,64 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
out:
rcu_read_unlock();
}
-#endif
+/*
+ * Return a consistent snapshot of the current vmalloc allocation
+ * statistics, for /proc/meminfo:
+ */
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ int gen = vmap_info_gen;
+
+ /*
+ * If the generation counter of the cache matches that of
+ * the vmalloc generation counter then return the cache:
+ */
+ if (vmap_info_cache_gen == gen) {
+ int gen_after;
+
+ /*
+ * The two read barriers make sure that we read
+ * 'gen', 'vmap_info_cache' and 'gen_after' in
+ * precisely that order:
+ */
+ smp_rmb();
+ *vmi = vmap_info_cache;
+
+ smp_rmb();
+ gen_after = vmap_info_gen;
+
+ /* The cache is still valid: */
+ if (gen == gen_after)
+ return;
+
+ /* Ok, the cache got invalidated just now, regenerate it */
+ gen = gen_after;
+ }
+
+ /* Make sure 'gen' is read before the vmalloc info */
+ smp_rmb();
+
+ calc_vmalloc_info(vmi);
+
+ /*
+ * All updates to vmap_info_cache_gen go through this spinlock,
+ * so when the cache got invalidated, we'll only mark it valid
+ * again if we first fully write the new vmap_info_cache.
+ *
+ * This ensures that partial results won't be used.
+ */
+ spin_lock(&vmap_info_lock);
+ if (gen-vmap_info_cache_gen > 0) {
+ vmap_info_cache = *vmi;
+ /*
+ * Make sure the new cached data is visible before
+ * the generation counter update:
+ */
+ smp_wmb();
+ vmap_info_cache_gen = gen;
+ }
+ spin_unlock(&vmap_info_lock);
+}
+
+#endif /* CONFIG_PROC_FS */
On Mon, Aug 24 2015, Ingo Molnar <[email protected]> wrote:
>
> Thanks, is the fixed up changelog below better?
>
Yes, though Linus specifically referred to "make test" (but I guess one
could/should consider that part of the build process).
Rasmus
> mm/vmalloc: Cache the vmalloc memory info
>
> Linus reported that for scripting-intense workloads such as the
> Git build, glibc's qsort will read /proc/meminfo for every process
> created (by way of get_phys_pages()), which causes the Git build
> to generate a surprising amount of kernel overhead.
(I hope I'm not annoying you by bikeshedding this too much, although I
think this is improving.)
You've sort of re-invented spinlocks, but after thinking a bit,
it all works.
Rather than using a single word, which is incremented to an odd number
at the start of an update and an even number at the end, there are
two. An update is in progress when they're unequal.
vmap_info_gen is incremented early, when the cache needs updating, and
read late (after the cache is copied).
vmap_info_cache_gen is incremented after the cache is updated, and read
early (before the cache is copied).
This is logically equivalent to my complicated scheme with atomic updates
to various bits in a single generation word, but greatly simplified by
having two separate words. In particular, there's no longer a need to
distinguish "vmap has updated list" from "calc_vmalloc_info in progress".
I particularly like the "gen - vmap_info_cache_gen > 0" test.
You *must* test for inequality to prevent tearing of a valid cache
(...grr...English heteronyms...), and given that, might as well
require it be fresher.
Anyway, suggested changes for v6 (sigh...):
First: you do a second read of vmap_info_gen to optimize out the copy
of vmalloc_info if it's easily seen as pointless, but given how small
vmalloc_info is (two words!), i'd be inclined to omit that optimization.
Copy always, *then* see if it's worth keeping. Smaller code, faster
fast path, and is barely noticeable on the slow path.
Second, and this is up to you, I'd be inclined to go fully non-blocking and
only spin_trylock(). If that fails, just skip the cache update.
Third, ANSI C rules allow a compiler to assume that signed integer
overflow does not occur. That means that gcc is allowed to optimize
"if (x - y > 0)" to "if (x > y)".
Given that gcc has annoyed us by using this optimization in other
contexts, It might be safer to make them unsigned (which is required to
wrap properly) and cast to integer after subtraction.
Basically, the following (untested, but pretty damn simple):
+/*
+ * Return a consistent snapshot of the current vmalloc allocation
+ * statistics, for /proc/meminfo:
+ */
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ unsigned gen, cache_gen = READ_ONCE(vmap_info_cache_gen);
+
+ /*
+ * The two read barriers make sure that we read
+ * 'cache_gen', 'vmap_info_cache' and 'gen' in
+ * precisely that order:
+ */
+ smp_rmb();
+ *vmi = vmap_info_cache;
+
+ smp_rmb();
+ gen = READ_ONCE(vmap_info_gen);
+
+ /*
+ * If the generation counter of the cache matches that of
+ * the vmalloc generation counter then return the cache:
+ */
+ if (gen == cache_gen)
+ return;
+
+ /* Make sure 'gen' is read before the vmalloc info */
+ smp_rmb();
+ calc_vmalloc_info(vmi);
+
+ /*
+ * All updates to vmap_info_cache_gen go through this spinlock,
+ * so when the cache got invalidated, we'll only mark it valid
+ * again if we first fully write the new vmap_info_cache.
+ *
+ * This ensures that partial results won't be used.
+ */
+ if (spin_trylock(&vmap_info_lock)) {
+ if ((int)(gen - vmap_info_cache_gen) > 0) {
+ vmap_info_cache = *vmi;
+ /*
+ * Make sure the new cached data is visible before
+ * the generation counter update:
+ */
+ smp_wmb();
+ WRITE_ONCE(vmap_info_cache_gen, gen);
+ }
+ spin_unlock(&vmap_info_lock);
+ }
+}
+
+#endif /* CONFIG_PROC_FS */
The only remaining *very small* nit is that this function is a mix of
"return early" and "wrap it in an if()" style. If you want to make that
"if (!spin_trylock(...)) return;", I leave that you your esthetic judgement.
>>>>> "Ingo" == Ingo Molnar <[email protected]> writes:
Ingo> * George Spelvin <[email protected]> wrote:
>> First, an actual, albeit minor, bug: initializing both vmap_info_gen
>> and vmap_info_cache_gen to 0 marks the cache as valid, which it's not.
Ingo> Ha! :-) Fixed.
>> vmap_info_gen should be initialized to 1 to force an initial
>> cache update.
Blech, it should be initialized with a proper #define
VMAP_CACHE_NEEDS_UPDATE 1, instead of more magic numbers.
Ingo> + */
Ingo> +static DEFINE_SPINLOCK(vmap_info_lock);
Ingo> +static int vmap_info_gen = 1;
static int vmap_info_gen = VMAP_CACHE_NEEDS_UPDATE;
Ingo> +static int vmap_info_cache_gen;
Ingo> +static struct vmalloc_info vmap_info_cache;
Ingo> +#endif
This will help keep bugs like this out in the future... I hope!
John Stoffel <[email protected]> wrote:
>> vmap_info_gen should be initialized to 1 to force an initial
>> cache update.
> Blech, it should be initialized with a proper #define
> VMAP_CACHE_NEEDS_UPDATE 1, instead of more magic numbers.
Er... this is a joke, right?
First, this number is used exactly once, and it's not part of a collection
of similar numbers. And the definition would be adjacent to the use.
We have easier ways of accomplishing that, called "comments".
Second, your proposed name is misleading. "needs update" is defined
as vmap_info_gen != vmap_info_cache_gen. There is no particular value
of either that has this meaning.
For example, initializing vmap_info_cache_gen to -1 would do just as well.
(I actually considered that before deciding that +1 was "simpler" than -1.)
For some versions of the code, an *arbitrary* difference is okay.
You could set one to 0xDEADBEEF and the other to 0xFEEDFACE.
For other versions, the magnitude matters, but not *too* much.
Initializing it to 42 would be perfectly correct, but waste time doing
42 cache updates before settling down.
Singling out the value 1 as VMAP_CACHE_NEEDS_UPDATE is actively misleading.
> This will help keep bugs like this out in the future... I hope!
And this is the punchline, right?
The problem was not realizing that non-default initialization was required;
what we *call* the non-default value is irrelevant.
I doubt it would ever have been a real (i.e. noticeable) bug, actually;
the first bit of vmap activity in very early boot would have invalidated
the cache.
(John, my apologies if I went over the top and am contributing to LKML's
reputation for flaming. I *did* actually laugh, and *do* think it's a
dumb idea, but my annoyance is really directed at unpleasant memories of
mindless application of coding style guidelines. In this case, I suspect
you just posted before reading carefully enough to see the subtle logic.)
George> John Stoffel <[email protected]> wrote:
>>> vmap_info_gen should be initialized to 1 to force an initial
>>> cache update.
>> Blech, it should be initialized with a proper #define
>> VMAP_CACHE_NEEDS_UPDATE 1, instead of more magic numbers.
George> Er... this is a joke, right?
Not really. The comment made before was that by setting this variable
to zero, it wasn't properly initialized. Which implies that either
the API is wrong... or we should be documenting it better. I just
went in the direction of the #define instead of a comment.
George> First, this number is used exactly once, and it's not part of
George> a collection of similar numbers. And the definition would be
George> adjacent to the use.
George> We have easier ways of accomplishing that, called "comments".
Sure, that would be the better solution in this case.
George> Second, your proposed name is misleading. "needs update" is defined
George> as vmap_info_gen != vmap_info_cache_gen. There is no particular value
George> of either that has this meaning.
George> For example, initializing vmap_info_cache_gen to -1 would do just as well.
George> (I actually considered that before deciding that +1 was "simpler" than -1.)
See, I just threw out a dumb suggestion without reading the patch
properly. My fault.
George> (John, my apologies if I went over the top and am contributing to LKML's
George> reputation for flaming. I *did* actually laugh, and *do* think it's a
George> dumb idea, but my annoyance is really directed at unpleasant memories of
George> mindless application of coding style guidelines. In this case, I suspect
George> you just posted before reading carefully enough to see the subtle logic.)
Nope, I'm in the wrong here. And your comment here is wonderful, I
really do appreciate how you handled my ham fisted attempt to
contribute. But I've got thick skin and I'll keep trying in my free
time to comment on patches when I can.
John
* George Spelvin <[email protected]> wrote:
> (I hope I'm not annoying you by bikeshedding this too much, although I
> think this is improving.)
[ I don't mind, although I wish other, more critical parts of the kernel got this
much attention as well ;-) ]
> Anyway, suggested changes for v6 (sigh...):
>
> First: you do a second read of vmap_info_gen to optimize out the copy
> of vmalloc_info if it's easily seen as pointless, but given how small
> vmalloc_info is (two words!), i'd be inclined to omit that optimization.
>
> Copy always, *then* see if it's worth keeping. Smaller code, faster
> fast path, and is barely noticeable on the slow path.
Ok, done.
> Second, and this is up to you, I'd be inclined to go fully non-blocking and
> only spin_trylock(). If that fails, just skip the cache update.
So I'm not sure about this one: we have no guarantee of the order every updater
reaches the spinlock, and we want the 'freshest' updater to do the update. The
trylock might cause us to drop the 'freshest' update erroneously - so this change
would introduce a 'stale data' bug I think.
> Third, ANSI C rules allow a compiler to assume that signed integer
> overflow does not occur. That means that gcc is allowed to optimize
> "if (x - y > 0)" to "if (x > y)".
That's annoying ...
> Given that gcc has annoyed us by using this optimization in other
> contexts, It might be safer to make them unsigned (which is required to
> wrap properly) and cast to integer after subtraction.
Ok, done.
> Basically, the following (untested, but pretty damn simple):
I've attached v6 which applies your first and last suggestion, but not the trylock
one.
I also removed _ONCE() accesses from the places that didn't need them.
I added your Reviewed-by optimistically, saving a v7 submission hopefully ;-)
Lightly tested.
Thanks,
Ingo
==============================>
>From 8364822f9cff9da9f9858f0ca1f1ddc5bd3ad3a1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <[email protected]>
Date: Sat, 22 Aug 2015 12:28:01 +0200
Subject: [PATCH] mm/vmalloc: Cache the vmalloc memory info
Linus reported that for certain workloads such as 'make test' in the
Git build, glibc's qsort will read /proc/meminfo for every process
created (by way of get_phys_pages()), which causes the Git build
to generate a surprising amount of kernel overhead.
A fair chunk of the overhead is due to get_vmalloc_info() - which
walks a potentially long list to do its statistics.
Modify Linus's jiffies based patch to use generation counters
to cache the vmalloc info: vmap_unlock() increases the generation
counter, and get_vmalloc_info() reads it and compares it
against a cached generation counter.
Also use a spinlock to make sure we always print a consistent
set of vmalloc statistics, FWIW.
Reported-by: Linus Torvalds <[email protected]>
Reviewed-by: George Spelvin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
mm/vmalloc.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 74 insertions(+), 3 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 605138083880..a0a4274a7be9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -276,7 +276,21 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
-static DEFINE_SPINLOCK(vmap_area_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vmap_area_lock);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * A spinlock and two generation counters for a simple cache of the
+ * vmalloc allocation statistics info printed in /proc/meminfo.
+ *
+ * ( The assumption of the optimization is that it's read frequently, but
+ * modified infrequently. )
+ */
+static DEFINE_SPINLOCK(vmap_info_lock);
+static unsigned int vmap_info_gen = 1;
+static unsigned int vmap_info_cache_gen;
+static struct vmalloc_info vmap_info_cache;
+#endif
static inline void vmap_lock(void)
{
@@ -285,6 +299,9 @@ static inline void vmap_lock(void)
static inline void vmap_unlock(void)
{
+#ifdef CONFIG_PROC_FS
+ WRITE_ONCE(vmap_info_gen, vmap_info_gen+1);
+#endif
spin_unlock(&vmap_area_lock);
}
@@ -2699,7 +2716,7 @@ static int __init proc_vmalloc_init(void)
}
module_init(proc_vmalloc_init);
-void get_vmalloc_info(struct vmalloc_info *vmi)
+static void calc_vmalloc_info(struct vmalloc_info *vmi)
{
struct vmap_area *va;
unsigned long free_area_size;
@@ -2746,5 +2763,59 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
out:
rcu_read_unlock();
}
-#endif
+/*
+ * Return a consistent snapshot of the current vmalloc allocation
+ * statistics, for /proc/meminfo:
+ */
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ unsigned int cache_gen, gen;
+
+ /*
+ * The common case is that the cache is valid, so first
+ * read it, then check its validity.
+ *
+ * The two read barriers make sure that we read
+ * 'cache_gen', 'vmap_info_cache' and 'gen' in
+ * precisely that order:
+ */
+ cache_gen = vmap_info_cache_gen;
+ smp_rmb();
+ *vmi = vmap_info_cache;
+ smp_rmb();
+ gen = vmap_info_gen;
+
+ /*
+ * If the generation counter of the cache matches that of
+ * the vmalloc generation counter then return the cache:
+ */
+ if (cache_gen == gen)
+ return;
+
+ /* Make sure 'gen' is read before the vmalloc info: */
+ smp_rmb();
+ calc_vmalloc_info(vmi);
+
+ /*
+ * All updates to vmap_info_cache_gen go through this spinlock,
+ * so when the cache got invalidated, we'll only mark it valid
+ * again if we first fully write the new vmap_info_cache.
+ *
+ * This ensures that partial results won't be used and that the
+ * vmalloc info belonging to the freshest update is used:
+ */
+ spin_lock(&vmap_info_lock);
+ if ((int)(gen-vmap_info_cache_gen) > 0) {
+ vmap_info_cache = *vmi;
+ /*
+ * Make sure the new cached data is visible before
+ * the generation counter update:
+ */
+ smp_wmb();
+ vmap_info_cache_gen = gen;
+ }
+ spin_unlock(&vmap_info_lock);
+}
+
+#endif /* CONFIG_PROC_FS */
>> Second, and this is up to you, I'd be inclined to go fully non-blocking
>> and only spin_trylock(). If that fails, just skip the cache update.
> So I'm not sure about this one: we have no guarantee of the order every
> updater reaches the spinlock, and we want the 'freshest' updater to do
> the update. The trylock might cause us to drop the 'freshest' update
> erroneously - so this change would introduce a 'stale data' bug I think.
Er, no it wouldn't. If someone leaves the cache stale, they'll leave
vmap_info_cache_gen != vmap_info_gen and the next reader to come along
will see the cache is stale and refresh it.
If there's lock contention, there's a risk of more work, because the
callers fall back to calc_vmalloc_info rather than waiting.
But it's not a big risk. With the blocking code, if two readers
arrive simultaneously and see a stale cache, they'll both call
calc_vmalloc_info() and then line up to update the cache.
The second will get the lock but then *not* update the cache.
With trylock(), the second will just skip the update faster.
Same number of calc_vmalloc_info() calls.
The only inefficient case is if two readers arrive far enough apart that
vmap_info_gen is updated between them, yet close enough together that
the second arrives at the lock while the first is updating the cache.
In that case, the second reader will not update the cache and (assuming
no more changes to vmap_info_gen) some future third reader will have to
duplicate the effort of calling calc_vmalloc_info().
But that's such a tiny window I give preference to my fondness
for non-blocking code.
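For concreteness, here's a rough sketch of the trylock update path I'm
suggesting (the helper name is made up and it reuses the variables from
your patch; completely untested):

static void update_vmalloc_info_cache(const struct vmalloc_info *vmi,
				      unsigned int gen)
{
	/* If somebody else is already updating the cache, just skip it: */
	if (!spin_trylock(&vmap_info_lock))
		return;

	if ((int)(gen - vmap_info_cache_gen) > 0) {
		vmap_info_cache = *vmi;
		/* Publish the new data before the generation counter: */
		smp_wmb();
		vmap_info_cache_gen = gen;
	}
	spin_unlock(&vmap_info_lock);
}

The only behavioural difference from your version is the early return on
contention; as argued above, the worst case is that some later reader
redoes calc_vmalloc_info().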
> I added your Reviewed-by optimistically, saving a v7 submission hopefully ;-)
You did right. As I said, the non-blocking part is your preference.
I've done so much nasty stuff in interrupt handlers ($DAY_JOB more than
kernel) that I go for the non-blocking algorithm whenever possible.
Reviewed-by: George Spelvin <[email protected]>
On Sun, Aug 23, 2015 at 10:17:51AM +0200, Ingo Molnar wrote:
> +static u64 vmap_info_gen;
> +static u64 vmap_info_cache_gen;
> +void get_vmalloc_info(struct vmalloc_info *vmi)
> +{
> + u64 gen = READ_ONCE(vmap_info_gen);
> +
> + /*
> + * If the generation counter of the cache matches that of
> + * the vmalloc generation counter then return the cache:
> + */
> + if (READ_ONCE(vmap_info_cache_gen) == gen) {
Why are those things u64? It has the obvious down-side that you still
get split loads on 32bit machines.
On Tue, Aug 25, 2015 at 11:56:38AM +0200, Ingo Molnar wrote:
> +void get_vmalloc_info(struct vmalloc_info *vmi)
> +{
> + unsigned int cache_gen, gen;
I see you dropped the u64 thing, good, ignore that previous email.
> +
> + /*
> + * The common case is that the cache is valid, so first
> + * read it, then check its validity.
> + *
> + * The two read barriers make sure that we read
> + * 'cache_gen', 'vmap_info_cache' and 'gen' in
> + * precisely that order:
> + */
> + cache_gen = vmap_info_cache_gen;
> + smp_rmb();
> + *vmi = vmap_info_cache;
> + smp_rmb();
> + gen = vmap_info_gen;
> +
> + /*
> + * If the generation counter of the cache matches that of
> + * the vmalloc generation counter then return the cache:
> + */
> + if (cache_gen == gen)
> + return;
There is one aspect of READ_ONCE() that is not replaced with smp_rmb(),
and that is that READ_ONCE() should avoid split loads.
Without READ_ONCE() the compiler is free to turn the loads into separate
and out-of-order byte loads just because it's insane, thereby also making
the WRITE_ONCE() pointless.
Now I'm fairly sure it all doesn't matter much: the info can change the
moment we've copied it, so the read is inherently racy.
And by that same argument I feel this is all somewhat over engineered.
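Still, if you keep the barriers, wrapping the generation counter reads
would at least avoid the tearing; read side only, just a sketch:

	cache_gen = READ_ONCE(vmap_info_cache_gen);
	smp_rmb();
	*vmi = vmap_info_cache;
	smp_rmb();
	gen = READ_ONCE(vmap_info_gen);

(The struct copy itself can't be covered by READ_ONCE() anyway, which is
part of why the whole thing is inherently racy.)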
> +
> + /* Make sure 'gen' is read before the vmalloc info: */
> + smp_rmb();
> + calc_vmalloc_info(vmi);
> +
> + /*
> + * All updates to vmap_info_cache_gen go through this spinlock,
> + * so when the cache got invalidated, we'll only mark it valid
> + * again if we first fully write the new vmap_info_cache.
> + *
> + * This ensures that partial results won't be used and that the
> + * vmalloc info belonging to the freshest update is used:
> + */
> + spin_lock(&vmap_info_lock);
> + if ((int)(gen-vmap_info_cache_gen) > 0) {
> + vmap_info_cache = *vmi;
> + /*
> + * Make sure the new cached data is visible before
> + * the generation counter update:
> + */
> + smp_wmb();
> + vmap_info_cache_gen = gen;
> + }
> + spin_unlock(&vmap_info_lock);
> +}
> +
> +#endif /* CONFIG_PROC_FS */
On Tue, Aug 25 2015, Ingo Molnar <[email protected]> wrote:
> * George Spelvin <[email protected]> wrote:
>
>> (I hope I'm not annoying you by bikeshedding this too much, although I
>> think this is improving.)
>
> [ I don't mind, although I wish other, more critical parts of the kernel got this
> much attention as well ;-) ]
>
Since we're beating dead horses, let me point out one possibly
unintentional side-effect of initializing just one of vmap_info{,_cache}_gen:
$ nm -n vmlinux | grep -E 'vmap_info(_cache)?_gen'
ffffffff81e4e5e0 d vmap_info_gen
ffffffff820d5700 b vmap_info_cache_gen
[Up-thread, you wrote "I also moved the function-static cache next to the
flag and seqlock - this should further compress the cache footprint."]
One should probably ensure that they end up in the same cacheline if one
wants the fast-path to be as fast as possible - the easiest way to
ensure that is to put them in a small struct, and that might as well
contain the spinlock and the cache itself as well.
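Something like the below, for illustration (completely untested, names
invented here):

static struct {
	spinlock_t		lock;
	unsigned int		gen;
	unsigned int		cache_gen;
	struct vmalloc_info	cache;
} vmap_info ____cacheline_aligned_in_smp = {
	.lock	= __SPIN_LOCK_UNLOCKED(vmap_info.lock),
	.gen	= 1,
};

That keeps both generation counters, the lock and the cached data
together, so the common read-mostly path touches a single cacheline
(give or take sizeof(struct vmalloc_info)).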
It's been fun seeing this evolve, but overall, I tend to agree with
Peter: It's a lot of complexity for little gain. If we're not going to
just kill the Vmalloc* fields (which is probably too controversial)
I'd prefer Linus' simpler version.
Rasmus
>>> (I hope I'm not annoying you by bikeshedding this too much, although I
>>> think this is improving.)
>>
>> [ I don't mind, although I wish other, more critical parts of the kernel got this
>> much attention as well ;-) ]
That's the problem with small, understandable problems: people *aren't*
scared to mess with them.
> It's been fun seeing this evolve, but overall, I tend to agree with
> Peter: It's a lot of complexity for little gain. If we're not going to
> just kill the Vmalloc* fields (which is probably too controversial)
> I'd prefer Linus' simpler version.
Are you sure you're not being affected by the number of iterations?
The final version is not actually a lot of code (although yes, more than
Linus's), and offers the advantage of peace of mind: there's not some
nasty-smelling code you can't entirely trust left behind.
On Sun, Aug 23, 2015 at 2:56 PM, Rasmus Villemoes
<[email protected]> wrote:
>
> [Maybe one could just remove the fields and see if anybody actually
> notices/cares any longer. Or, if they are only used by kernel
> developers, put them in their own file.]
I'm actually inclined to try exactly that for 4.3, and then take
Ingo's patch as a fallback in case somebody actually notices.
I'm not convinced anybody actually uses those values, and they are
getting *less* relevant rather than more (on 64-bit, those values
really don't matter, since the vmalloc space isn't really a
limitation), so let's try removing them and seeing what happens. And
then we know what we can do if somebody does actually notice.
Linus
On Tue, Aug 25, 2015 at 9:39 AM, Linus Torvalds
<[email protected]> wrote:
>
> I'm not convinced anybody actually uses those values, and they are
> getting *less* relevant rather than more (on 64-bit, those values
> really don't matter, since the vmalloc space isn't really a
> limitation)
Side note: the people who actually care about "my vmalloc area is too
full, what's up?" would use /proc/vmallocinfo anyway, since that's
what shows things like fragmentation etc.
So I'm just talking about removing the /proc/meminfo part. First try
to remove it *all*, and if there is some script that hollers because
it wants to parse them, print out the values as zero.
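Schematically, that fallback is just keeping the lines in the meminfo
output but reporting zero, something like (sketch only):

	seq_printf(m,
		   "VmallocTotal:   %8lu kB\n"
		   "VmallocUsed:    %8lu kB\n"
		   "VmallocChunk:   %8lu kB\n",
		   0UL, 0UL, 0UL);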
Linus