2013-03-13 06:33:09

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 0/8] remove vm_struct list management

This patchset remove vm_struct list management after initializing vmalloc.
Adding and removing an entry to vmlist is linear time complexity, so
it is inefficient. If we maintain this list, overall time complexity of
adding and removing area to vmalloc space is O(N), although we use
rbtree for finding vacant place and it's time complexity is just O(logN).

And vmlist and vmlist_lock is used many places of outside of vmalloc.c.
It is preferable that we hide this raw data structure and provide
well-defined function for supporting them, because it makes that they
cannot mistake when manipulating theses structure and it makes us easily
maintain vmalloc layer.

For kexec and makedumpfile, I export vmap_area_list, instead of vmlist.
This comes from Atsushi's recommendation.
For more information, please refer below link.
https://lkml.org/lkml/2012/12/6/184

These are based on v3.9-rc2.

Changes from v1
5/8: skip areas for lazy_free
6/8: skip areas for lazy_free
7/8: export vmap_area_list for kexec, instead of vmlist

Joonsoo Kim (8):
mm, vmalloc: change iterating a vmlist to find_vm_area()
mm, vmalloc: move get_vmalloc_info() to vmalloc.c
mm, vmalloc: protect va->vm by vmap_area_lock
mm, vmalloc: iterate vmap_area_list, instead of vmlist in
vread/vwrite()
mm, vmalloc: iterate vmap_area_list in get_vmalloc_info()
mm, vmalloc: iterate vmap_area_list, instead of vmlist, in
vmallocinfo()
mm, vmalloc: export vmap_area_list, instead of vmlist
mm, vmalloc: remove list management of vmlist after initializing
vmalloc

arch/tile/mm/pgtable.c | 7 +-
arch/unicore32/mm/ioremap.c | 17 ++--
arch/x86/mm/ioremap.c | 7 +-
fs/proc/Makefile | 2 +-
fs/proc/internal.h | 18 ----
fs/proc/meminfo.c | 1 +
fs/proc/mmu.c | 60 -------------
include/linux/vmalloc.h | 21 ++++-
kernel/kexec.c | 2 +-
mm/nommu.c | 3 +-
mm/vmalloc.c | 207 +++++++++++++++++++++++++++++--------------
11 files changed, 170 insertions(+), 175 deletions(-)
delete mode 100644 fs/proc/mmu.c

--
1.7.9.5


2013-03-13 06:33:19

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 7/8] mm, vmalloc: export vmap_area_list, instead of vmlist

From: Joonsoo Kim <[email protected]>

Although our intention is to unexport internal structure entirely,
but there is one exception for kexec. kexec dumps address of vmlist
and makedumpfile uses this information.

We are about to remove vmlist, then another way to retrieve information
of vmalloc layer is needed for makedumpfile. For this purpose,
we export vmap_area_list, instead of vmlist.

Cc: Eric Biederman <[email protected]>
Cc: Dave Anderson <[email protected]>
Cc: Vivek Goyal <[email protected]>
Cc: Atsushi Kumagai <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 698b1e5..8a25f90 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -130,8 +130,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
/*
* Internals. Dont't use..
*/
-extern rwlock_t vmlist_lock;
-extern struct vm_struct *vmlist;
+extern struct list_head vmap_area_list;
extern __init void vm_area_add_early(struct vm_struct *vm);
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);

diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7..d9bfc6c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1489,7 +1489,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmlist);
+ VMCOREINFO_SYMBOL(vmap_area_list);

#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
diff --git a/mm/nommu.c b/mm/nommu.c
index e193280..ed82358 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -228,8 +228,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL(follow_pfn);

-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
+LIST_HEAD(vmap_area_list);

void vfree(const void *addr)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bda6cef..7e63984 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -261,7 +261,8 @@ struct vmap_area {
};

static DEFINE_SPINLOCK(vmap_area_lock);
-static LIST_HEAD(vmap_area_list);
+/* Export for kexec only */
+LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;

/* The vmap cache globals are protected by vmap_area_lock */
@@ -272,6 +273,10 @@ static unsigned long cached_align;

static unsigned long vmap_area_pcpu_hole;

+/*** Old vmalloc interfaces ***/
+static DEFINE_RWLOCK(vmlist_lock);
+static struct vm_struct *vmlist;
+
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
@@ -1283,10 +1288,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
}
EXPORT_SYMBOL_GPL(map_vm_area);

-/*** Old vmalloc interfaces ***/
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
-
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
--
1.7.9.5

2013-03-13 06:33:17

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 3/8] mm, vmalloc: protect va->vm by vmap_area_lock

From: Joonsoo Kim <[email protected]>

Inserting and removing an entry to vmlist is linear time complexity, so
it is inefficient. Following patches will try to remove vmlist entirely.
This patch is preparing step for it.

For removing vmlist, iterating vmlist codes should be changed to iterating
a vmap_area_list. Before implementing that, we should make sure that
when we iterate a vmap_area_list, accessing to va->vm doesn't cause a race
condition. This patch ensure that when iterating a vmap_area_list,
there is no race condition for accessing to vm_struct.

Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d9878b..1bf94ad 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1290,12 +1290,14 @@ struct vm_struct *vmlist;
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
+ spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
va->flags |= VM_VM_AREA;
+ spin_unlock(&vmap_area_lock);
}

static void insert_vmalloc_vmlist(struct vm_struct *vm)
@@ -1447,6 +1449,11 @@ struct vm_struct *remove_vm_area(const void *addr)
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;

+ spin_lock(&vmap_area_lock);
+ va->vm = NULL;
+ va->flags &= ~VM_VM_AREA;
+ spin_unlock(&vmap_area_lock);
+
if (!(vm->flags & VM_UNLIST)) {
struct vm_struct *tmp, **p;
/*
--
1.7.9.5

2013-03-13 06:33:15

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 4/8] mm, vmalloc: iterate vmap_area_list, instead of vmlist in vread/vwrite()

From: Joonsoo Kim <[email protected]>

Now, when we hold a vmap_area_lock, va->vm can't be discarded. So we can
safely access to va->vm when iterating a vmap_area_list with holding a
vmap_area_lock. With this property, change iterating vmlist codes in
vread/vwrite() to iterating vmap_area_list.

There is a little difference relate to lock, because vmlist_lock is mutex,
but, vmap_area_lock is spin_lock. It may introduce a spinning overhead
during vread/vwrite() is executing. But, these are debug-oriented
functions, so this overhead is not real problem for common case.

Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1bf94ad..59aa328 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2012,7 +2012,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)

long vread(char *buf, char *addr, unsigned long count)
{
- struct vm_struct *tmp;
+ struct vmap_area *va;
+ struct vm_struct *vm;
char *vaddr, *buf_start = buf;
unsigned long buflen = count;
unsigned long n;
@@ -2021,10 +2022,17 @@ long vread(char *buf, char *addr, unsigned long count)
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;

- read_lock(&vmlist_lock);
- for (tmp = vmlist; count && tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!(va->flags & VM_VM_AREA))
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + vm->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -2034,10 +2042,10 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + tmp->size - PAGE_SIZE - addr;
+ n = vaddr + vm->size - PAGE_SIZE - addr;
if (n > count)
n = count;
- if (!(tmp->flags & VM_IOREMAP))
+ if (!(vm->flags & VM_IOREMAP))
aligned_vread(buf, addr, n);
else /* IOREMAP area is treated as memory hole */
memset(buf, 0, n);
@@ -2046,7 +2054,7 @@ long vread(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- read_unlock(&vmlist_lock);
+ spin_unlock(&vmap_area_lock);

if (buf == buf_start)
return 0;
@@ -2085,7 +2093,8 @@ finished:

long vwrite(char *buf, char *addr, unsigned long count)
{
- struct vm_struct *tmp;
+ struct vmap_area *va;
+ struct vm_struct *vm;
char *vaddr;
unsigned long n, buflen;
int copied = 0;
@@ -2095,10 +2104,17 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
buflen = count;

- read_lock(&vmlist_lock);
- for (tmp = vmlist; count && tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!(va->flags & VM_VM_AREA))
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + vm->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -2107,10 +2123,10 @@ long vwrite(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + tmp->size - PAGE_SIZE - addr;
+ n = vaddr + vm->size - PAGE_SIZE - addr;
if (n > count)
n = count;
- if (!(tmp->flags & VM_IOREMAP)) {
+ if (!(vm->flags & VM_IOREMAP)) {
aligned_vwrite(buf, addr, n);
copied++;
}
@@ -2119,7 +2135,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- read_unlock(&vmlist_lock);
+ spin_unlock(&vmap_area_lock);
if (!copied)
return 0;
return buflen;
--
1.7.9.5

2013-03-13 06:33:13

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 5/8] mm, vmalloc: iterate vmap_area_list in get_vmalloc_info()

From: Joonsoo Kim <[email protected]>

This patch is preparing step for removing vmlist entirely.
For above purpose, we change iterating a vmap_list codes to iterating a
vmap_area_list. It is somewhat trivial change, but just one thing
should be noticed.

vmlist is lack of information about some areas in vmalloc address space.
For example, vm_map_ram() allocate area in vmalloc address space,
but it doesn't make a link with vmlist. To provide full information about
vmalloc address space is better idea, so we don't use va->vm and use
vmap_area directly.
This makes get_vmalloc_info() more precise.

Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 59aa328..aee1f61 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2671,46 +2671,50 @@ module_init(proc_vmalloc_init);

void get_vmalloc_info(struct vmalloc_info *vmi)
{
- struct vm_struct *vma;
+ struct vmap_area *va;
unsigned long free_area_size;
unsigned long prev_end;

vmi->used = 0;
+ vmi->largest_chunk = 0;

- if (!vmlist) {
- vmi->largest_chunk = VMALLOC_TOTAL;
- } else {
- vmi->largest_chunk = 0;
+ prev_end = VMALLOC_START;

- prev_end = VMALLOC_START;
-
- read_lock(&vmlist_lock);
+ spin_lock(&vmap_area_lock);

- for (vma = vmlist; vma; vma = vma->next) {
- unsigned long addr = (unsigned long) vma->addr;
+ if (list_empty(&vmap_area_list)) {
+ vmi->largest_chunk = VMALLOC_TOTAL;
+ goto out;
+ }

- /*
- * Some archs keep another range for modules in vmlist
- */
- if (addr < VMALLOC_START)
- continue;
- if (addr >= VMALLOC_END)
- break;
+ list_for_each_entry(va, &vmap_area_list, list) {
+ unsigned long addr = va->va_start;

- vmi->used += vma->size;
+ /*
+ * Some archs keep another range for modules in vmalloc space
+ */
+ if (addr < VMALLOC_START)
+ continue;
+ if (addr >= VMALLOC_END)
+ break;

- free_area_size = addr - prev_end;
- if (vmi->largest_chunk < free_area_size)
- vmi->largest_chunk = free_area_size;
+ if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
+ continue;

- prev_end = vma->size + addr;
- }
+ vmi->used += (va->va_end - va->va_start);

- if (VMALLOC_END - prev_end > vmi->largest_chunk)
- vmi->largest_chunk = VMALLOC_END - prev_end;
+ free_area_size = addr - prev_end;
+ if (vmi->largest_chunk < free_area_size)
+ vmi->largest_chunk = free_area_size;

- read_unlock(&vmlist_lock);
+ prev_end = va->va_end;
}
+
+ if (VMALLOC_END - prev_end > vmi->largest_chunk)
+ vmi->largest_chunk = VMALLOC_END - prev_end;
+
+out:
+ spin_unlock(&vmap_area_lock);
}
#endif

--
1.7.9.5

2013-03-13 06:34:32

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 8/8] mm, vmalloc: remove list management of vmlist after initializing vmalloc

From: Joonsoo Kim <[email protected]>

Now, there is no need to maintain vmlist after initializing vmalloc.
So remove related code and data structure.

Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7e63984..151da8a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -273,10 +273,6 @@ static unsigned long cached_align;

static unsigned long vmap_area_pcpu_hole;

-/*** Old vmalloc interfaces ***/
-static DEFINE_RWLOCK(vmlist_lock);
-static struct vm_struct *vmlist;
-
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
@@ -318,7 +314,7 @@ static void __insert_vmap_area(struct vmap_area *va)
rb_link_node(&va->rb_node, parent, p);
rb_insert_color(&va->rb_node, &vmap_area_root);

- /* address-sort this list so it is usable like the vmlist */
+ /* address-sort this list */
tmp = rb_prev(&va->rb_node);
if (tmp) {
struct vmap_area *prev;
@@ -1130,6 +1126,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
}
EXPORT_SYMBOL(vm_map_ram);

+static struct vm_struct *vmlist __initdata;
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1301,10 +1298,8 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
spin_unlock(&vmap_area_lock);
}

-static void insert_vmalloc_vmlist(struct vm_struct *vm)
+static void clear_vm_unlist(struct vm_struct *vm)
{
- struct vm_struct *tmp, **p;
-
/*
* Before removing VM_UNLIST,
* we should make sure that vm has proper values.
@@ -1312,22 +1307,13 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
*/
smp_wmb();
vm->flags &= ~VM_UNLIST;
-
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
- if (tmp->addr >= vm->addr)
- break;
- }
- vm->next = *p;
- *p = vm;
- write_unlock(&vmlist_lock);
}

static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
setup_vmalloc_vm(vm, va, flags, caller);
- insert_vmalloc_vmlist(vm);
+ clear_vm_unlist(vm);
}

static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1370,10 +1356,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,

/*
* When this function is called from __vmalloc_node_range,
- * we do not add vm_struct to vmlist here to avoid
- * accessing uninitialized members of vm_struct such as
- * pages and nr_pages fields. They will be set later.
- * To distinguish it from others, we use a VM_UNLIST flag.
+ * we add VM_UNLIST flag to avoid accessing uninitialized
+ * members of vm_struct such as pages and nr_pages fields.
+ * They will be set later.
*/
if (flags & VM_UNLIST)
setup_vmalloc_vm(area, va, flags, caller);
@@ -1462,20 +1447,6 @@ struct vm_struct *remove_vm_area(const void *addr)
va->flags &= ~VM_VM_AREA;
spin_unlock(&vmap_area_lock);

- if (!(vm->flags & VM_UNLIST)) {
- struct vm_struct *tmp, **p;
- /*
- * remove from list and disallow access to
- * this vm_struct before unmap. (address range
- * confliction is maintained by vmap.)
- */
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
- ;
- *p = tmp->next;
- write_unlock(&vmlist_lock);
- }
-
vmap_debug_free_range(va->va_start, va->va_end);
free_unmap_vmap_area(va);
vm->size -= PAGE_SIZE;
@@ -1695,10 +1666,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL;

/*
- * In this function, newly allocated vm_struct is not added
- * to vmlist at __get_vm_area_node(). so, it is added here.
+ * In this function, newly allocated vm_struct has VM_UNLIST flag.
+ * It means that vm_struct is not fully initialized.
+ * Now, it is fully initialized, so remove this flag here.
*/
- insert_vmalloc_vmlist(area);
+ clear_vm_unlist(area);

/*
* A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2594,7 +2566,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;

- /* Pair with smp_wmb() in insert_vmalloc_vmlist() */
+ /* Pair with smp_wmb() in clear_vm_unlist() */
smp_rmb();
if (v->flags & VM_UNLIST)
return;
--
1.7.9.5

2013-03-13 06:34:51

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 6/8] mm, vmalloc: iterate vmap_area_list, instead of vmlist, in vmallocinfo()

From: Joonsoo Kim <[email protected]>

This patch is preparing step for removing vmlist entirely.
For above purpose, we change iterating a vmap_list codes to iterating a
vmap_area_list. It is somewhat trivial change, but just one thing
should be noticed.

Using vmap_area_list in vmallocinfo() introduce ordering problem in SMP
system. In s_show(), we retrieve some values from vm_struct. vm_struct's
values is not fully setup when va->vm is assigned. Full setup is notified
by removing VM_UNLIST flag without holding a lock. When we see that
VM_UNLIST is removed, it is not ensured that vm_struct has proper values
in view of other CPUs. So we need smp_[rw]mb for ensuring that proper
values is assigned when we see that VM_UNLIST is removed.

Therefore, this patch not only change a iteration list, but also add a
appropriate smp_[rw]mb to right places.

Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aee1f61..bda6cef 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1304,7 +1304,14 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
{
struct vm_struct *tmp, **p;

+ /*
+ * Before removing VM_UNLIST,
+ * we should make sure that vm has proper values.
+ * Pair with smp_rmb() in show_numa_info().
+ */
+ smp_wmb();
vm->flags &= ~VM_UNLIST;
+
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= vm->addr)
@@ -2542,19 +2549,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)

#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
- __acquires(&vmlist_lock)
+ __acquires(&vmap_area_lock)
{
loff_t n = *pos;
- struct vm_struct *v;
+ struct vmap_area *va;

- read_lock(&vmlist_lock);
- v = vmlist;
- while (n > 0 && v) {
+ spin_lock(&vmap_area_lock);
+ va = list_entry((&vmap_area_list)->next, typeof(*va), list);
+ while (n > 0 && &va->list != &vmap_area_list) {
n--;
- v = v->next;
+ va = list_entry(va->list.next, typeof(*va), list);
}
- if (!n)
- return v;
+ if (!n && &va->list != &vmap_area_list)
+ return va;

return NULL;

@@ -2562,16 +2569,20 @@ static void *s_start(struct seq_file *m, loff_t *pos)

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct vm_struct *v = p;
+ struct vmap_area *va = p, *next;

++*pos;
- return v->next;
+ next = list_entry(va->list.next, typeof(*va), list);
+ if (&next->list != &vmap_area_list)
+ return next;
+
+ return NULL;
}

static void s_stop(struct seq_file *m, void *p)
- __releases(&vmlist_lock)
+ __releases(&vmap_area_lock)
{
- read_unlock(&vmlist_lock);
+ spin_unlock(&vmap_area_lock);
}

static void show_numa_info(struct seq_file *m, struct vm_struct *v)
@@ -2582,6 +2593,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;

+ /* Pair with smp_wmb() in insert_vmalloc_vmlist() */
+ smp_rmb();
+ if (v->flags & VM_UNLIST)
+ return;
+
memset(counters, 0, nr_node_ids * sizeof(unsigned int));

for (nr = 0; nr < v->nr_pages; nr++)
@@ -2595,7 +2611,20 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)

static int s_show(struct seq_file *m, void *p)
{
- struct vm_struct *v = p;
+ struct vmap_area *va = p;
+ struct vm_struct *v;
+
+ if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
+ return 0;
+
+ if (!(va->flags & VM_VM_AREA)) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ return 0;
+ }
+
+ v = va->vm;

seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
--
1.7.9.5

2013-03-13 06:35:36

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 1/8] mm, vmalloc: change iterating a vmlist to find_vm_area()

From: Joonsoo Kim <[email protected]>

The purpose of iterating a vmlist is finding vm area with specific
virtual address. find_vm_area() is provided for this purpose
and more efficient, because it uses a rbtree.
So change it.

Cc: Chris Metcalf <[email protected]>
Cc: Guan Xuetao <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Acked-by: Guan Xuetao <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
Acked-by: Chris Metcalf <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index b3b4972..dfd63ce 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -592,12 +592,7 @@ void iounmap(volatile void __iomem *addr_in)
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
- break;
- }
- read_unlock(&vmlist_lock);
+ p = find_vm_area((void *)addr);

if (!p) {
pr_err("iounmap: bad address %p\n", addr);
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c
index b7a6055..13068ee 100644
--- a/arch/unicore32/mm/ioremap.c
+++ b/arch/unicore32/mm/ioremap.c
@@ -235,7 +235,7 @@ EXPORT_SYMBOL(__uc32_ioremap_cached);
void __uc32_iounmap(volatile void __iomem *io_addr)
{
void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
- struct vm_struct **p, *tmp;
+ struct vm_struct *vm;

/*
* If this is a section based mapping we need to handle it
@@ -244,17 +244,10 @@ void __uc32_iounmap(volatile void __iomem *io_addr)
* all the mappings before the area can be reclaimed
* by someone else.
*/
- write_lock(&vmlist_lock);
- for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
- if ((tmp->flags & VM_IOREMAP) && (tmp->addr == addr)) {
- if (tmp->flags & VM_UNICORE_SECTION_MAPPING) {
- unmap_area_sections((unsigned long)tmp->addr,
- tmp->size);
- }
- break;
- }
- }
- write_unlock(&vmlist_lock);
+ vm = find_vm_area(addr);
+ if (vm && (vm->flags & VM_IOREMAP) &&
+ (vm->flags & VM_UNICORE_SECTION_MAPPING))
+ unmap_area_sections((unsigned long)vm->addr, vm->size);

vunmap(addr);
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 78fe3f1..9a1e658 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -282,12 +282,7 @@ void iounmap(volatile void __iomem *addr)
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == (void __force *)addr)
- break;
- }
- read_unlock(&vmlist_lock);
+ p = find_vm_area((void __force *)addr);

if (!p) {
printk(KERN_ERR "iounmap: bad address %p\n", addr);
--
1.7.9.5

2013-03-13 06:35:51

by Joonsoo Kim

[permalink] [raw]
Subject: [PATCH v2 2/8] mm, vmalloc: move get_vmalloc_info() to vmalloc.c

From: Joonsoo Kim <[email protected]>

Now get_vmalloc_info() is in fs/proc/mmu.c. There is no reason
that this code must be here and it's implementation needs vmlist_lock
and it iterate a vmlist which may be internal data structure for vmalloc.

It is preferable that vmlist_lock and vmlist is only used in vmalloc.c
for maintainability. So move the code to vmalloc.c

Signed-off-by: Joonsoo Kim <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 712f24d..ab30716 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
obj-y += proc.o

proc-y := nommu.o task_nommu.o
-proc-$(CONFIG_MMU) := mmu.o task_mmu.o
+proc-$(CONFIG_MMU) := task_mmu.o

proc-y += inode.o root.o base.o generic.o array.o \
fd.o
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 85ff3a4..7571035 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -30,24 +30,6 @@ extern int proc_net_init(void);
static inline int proc_net_init(void) { return 0; }
#endif

-struct vmalloc_info {
- unsigned long used;
- unsigned long largest_chunk;
-};
-
-#ifdef CONFIG_MMU
-#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
-extern void get_vmalloc_info(struct vmalloc_info *vmi);
-#else
-
-#define VMALLOC_TOTAL 0UL
-#define get_vmalloc_info(vmi) \
-do { \
- (vmi)->used = 0; \
- (vmi)->largest_chunk = 0; \
-} while(0)
-#endif
-
extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 1efaaa1..5aa847a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/vmstat.h>
#include <linux/atomic.h>
+#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "internal.h"
diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c
deleted file mode 100644
index 8ae221d..0000000
--- a/fs/proc/mmu.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* mmu.c: mmu memory info files
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells ([email protected])
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/spinlock.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <asm/pgtable.h>
-#include "internal.h"
-
-void get_vmalloc_info(struct vmalloc_info *vmi)
-{
- struct vm_struct *vma;
- unsigned long free_area_size;
- unsigned long prev_end;
-
- vmi->used = 0;
-
- if (!vmlist) {
- vmi->largest_chunk = VMALLOC_TOTAL;
- }
- else {
- vmi->largest_chunk = 0;
-
- prev_end = VMALLOC_START;
-
- read_lock(&vmlist_lock);
-
- for (vma = vmlist; vma; vma = vma->next) {
- unsigned long addr = (unsigned long) vma->addr;
-
- /*
- * Some archs keep another range for modules in vmlist
- */
- if (addr < VMALLOC_START)
- continue;
- if (addr >= VMALLOC_END)
- break;
-
- vmi->used += vma->size;
-
- free_area_size = addr - prev_end;
- if (vmi->largest_chunk < free_area_size)
- vmi->largest_chunk = free_area_size;
-
- prev_end = vma->size + addr;
- }
-
- if (VMALLOC_END - prev_end > vmi->largest_chunk)
- vmi->largest_chunk = VMALLOC_END - prev_end;
-
- read_unlock(&vmlist_lock);
- }
-}
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6071e91..698b1e5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -158,4 +158,22 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
# endif
#endif

+struct vmalloc_info {
+ unsigned long used;
+ unsigned long largest_chunk;
+};
+
+#ifdef CONFIG_MMU
+#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
+extern void get_vmalloc_info(struct vmalloc_info *vmi);
+#else
+
+#define VMALLOC_TOTAL 0UL
+#define get_vmalloc_info(vmi) \
+do { \
+ (vmi)->used = 0; \
+ (vmi)->largest_chunk = 0; \
+} while (0)
+#endif
+
#endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f751f2..1d9878b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2645,5 +2645,49 @@ static int __init proc_vmalloc_init(void)
return 0;
}
module_init(proc_vmalloc_init);
+
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ struct vm_struct *vma;
+ unsigned long free_area_size;
+ unsigned long prev_end;
+
+ vmi->used = 0;
+
+ if (!vmlist) {
+ vmi->largest_chunk = VMALLOC_TOTAL;
+ } else {
+ vmi->largest_chunk = 0;
+
+ prev_end = VMALLOC_START;
+
+ read_lock(&vmlist_lock);
+
+ for (vma = vmlist; vma; vma = vma->next) {
+ unsigned long addr = (unsigned long) vma->addr;
+
+ /*
+ * Some archs keep another range for modules in vmlist
+ */
+ if (addr < VMALLOC_START)
+ continue;
+ if (addr >= VMALLOC_END)
+ break;
+
+ vmi->used += vma->size;
+
+ free_area_size = addr - prev_end;
+ if (vmi->largest_chunk < free_area_size)
+ vmi->largest_chunk = free_area_size;
+
+ prev_end = vma->size + addr;
+ }
+
+ if (VMALLOC_END - prev_end > vmi->largest_chunk)
+ vmi->largest_chunk = VMALLOC_END - prev_end;
+
+ read_unlock(&vmlist_lock);
+ }
+}
#endif

--
1.7.9.5

2013-03-13 06:44:38

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v2 7/8] mm, vmalloc: export vmap_area_list, instead of vmlist

Joonsoo Kim <[email protected]> writes:

> From: Joonsoo Kim <[email protected]>
>
> Although our intention is to unexport internal structure entirely,
> but there is one exception for kexec. kexec dumps address of vmlist
> and makedumpfile uses this information.
>
> We are about to remove vmlist, then another way to retrieve information
> of vmalloc layer is needed for makedumpfile. For this purpose,
> we export vmap_area_list, instead of vmlist.

That seems entirely reasonable to me. Usage by kexec should not limit
the evoluion of the kernel especially usage by makedumpfile.

Atsushi Kumagai can you make makedumpfile work with this change?

Eric

> Cc: Eric Biederman <[email protected]>
> Cc: Dave Anderson <[email protected]>
> Cc: Vivek Goyal <[email protected]>
> Cc: Atsushi Kumagai <[email protected]>
> Signed-off-by: Joonsoo Kim <[email protected]>
> Signed-off-by: Joonsoo Kim <[email protected]>
>
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 698b1e5..8a25f90 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -130,8 +130,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
> /*
> * Internals. Dont't use..
> */
> -extern rwlock_t vmlist_lock;
> -extern struct vm_struct *vmlist;
> +extern struct list_head vmap_area_list;
> extern __init void vm_area_add_early(struct vm_struct *vm);
> extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
>
> diff --git a/kernel/kexec.c b/kernel/kexec.c
> index bddd3d7..d9bfc6c 100644
> --- a/kernel/kexec.c
> +++ b/kernel/kexec.c
> @@ -1489,7 +1489,7 @@ static int __init crash_save_vmcoreinfo_init(void)
> VMCOREINFO_SYMBOL(swapper_pg_dir);
> #endif
> VMCOREINFO_SYMBOL(_stext);
> - VMCOREINFO_SYMBOL(vmlist);
> + VMCOREINFO_SYMBOL(vmap_area_list);
>
> #ifndef CONFIG_NEED_MULTIPLE_NODES
> VMCOREINFO_SYMBOL(mem_map);
> diff --git a/mm/nommu.c b/mm/nommu.c
> index e193280..ed82358 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -228,8 +228,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
> }
> EXPORT_SYMBOL(follow_pfn);
>
> -DEFINE_RWLOCK(vmlist_lock);
> -struct vm_struct *vmlist;
> +LIST_HEAD(vmap_area_list);
>
> void vfree(const void *addr)
> {
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index bda6cef..7e63984 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -261,7 +261,8 @@ struct vmap_area {
> };
>
> static DEFINE_SPINLOCK(vmap_area_lock);
> -static LIST_HEAD(vmap_area_list);
> +/* Export for kexec only */
> +LIST_HEAD(vmap_area_list);
> static struct rb_root vmap_area_root = RB_ROOT;
>
> /* The vmap cache globals are protected by vmap_area_lock */
> @@ -272,6 +273,10 @@ static unsigned long cached_align;
>
> static unsigned long vmap_area_pcpu_hole;
>
> +/*** Old vmalloc interfaces ***/
> +static DEFINE_RWLOCK(vmlist_lock);
> +static struct vm_struct *vmlist;
> +
> static struct vmap_area *__find_vmap_area(unsigned long addr)
> {
> struct rb_node *n = vmap_area_root.rb_node;
> @@ -1283,10 +1288,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
> }
> EXPORT_SYMBOL_GPL(map_vm_area);
>
> -/*** Old vmalloc interfaces ***/
> -DEFINE_RWLOCK(vmlist_lock);
> -struct vm_struct *vmlist;
> -
> static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> unsigned long flags, const void *caller)
> {

2013-03-15 11:18:07

by Atsushi Kumagai

[permalink] [raw]
Subject: Re: [PATCH v2 7/8] mm, vmalloc: export vmap_area_list, instead of vmlist

Hello,

On Tue, 12 Mar 2013 23:43:48 -0700
[email protected] (Eric W. Biederman) wrote:

> Joonsoo Kim <[email protected]> writes:
>
> > From: Joonsoo Kim <[email protected]>
> >
> > Although our intention is to unexport internal structure entirely,
> > but there is one exception for kexec. kexec dumps address of vmlist
> > and makedumpfile uses this information.
> >
> > We are about to remove vmlist, then another way to retrieve information
> > of vmalloc layer is needed for makedumpfile. For this purpose,
> > we export vmap_area_list, instead of vmlist.
>
> That seems entirely reasonable to me. Usage by kexec should not limit
> the evoluion of the kernel especially usage by makedumpfile.
>
> Atsushi Kumagai can you make makedumpfile work with this change?

Sure! I'm going to work with this change in the next version.
But, I noticed that necessary information is missed in this patch,
and sorry for too late reply.

Both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list) are
necessary to get vmalloc_start value from vmap_area_list, but
they aren't exported in this patch.
I understand that the policy of this patch series "to unexport
internal structure entirely", although the information is necessary
for makedumpfile.

Additionally, OFFSET(vm_struct.addr) is no longer used, should be
removed. It was added for the same purpose as vmlist in the commit
below.

commit acd99dbf54020f5c80b9aa2f2ea86f43cb285b02
Author: Ken'ichi Ohmichi <[email protected]>
Date: Sat Oct 18 20:28:30 2008 -0700

kdump: add vmlist.addr to vmcoreinfo for x86 vmalloc translation.

To sum it up, I would like to push the patch below.

Thanks
Atsushi Kumagai

--
From: Atsushi Kumagai <[email protected]>
Date: Fri, 15 Mar 2013 14:19:28 +0900
Subject: [PATCH] kexec, vmalloc: Export additional information of
vmalloc layer.

Now, vmap_area_list is exported as VMCOREINFO for makedumpfile
to get the start address of vmalloc region (vmalloc_start).
The address which contains vmalloc_start value is represented as
below:

vmap_area_list.next - OFFSET(vmap_area.list) + OFFSET(vmap_area.va_start)

However, both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list)
aren't exported as VMCOREINFO.

So, this patch exports them externally with small cleanup.

Signed-off-by: Atsushi Kumagai <[email protected]>
---
include/linux/vmalloc.h | 12 ++++++++++++
kernel/kexec.c | 3 ++-
mm/vmalloc.c | 11 -----------
3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 8a25f90..62e0354 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/init.h>
#include <asm/page.h> /* pgprot_t */
+#include <linux/rbtree.h>

struct vm_area_struct; /* vma defining user mapping in mm_types.h */

@@ -35,6 +36,17 @@ struct vm_struct {
const void *caller;
};

+struct vmap_area {
+ unsigned long va_start;
+ unsigned long va_end;
+ unsigned long flags;
+ struct rb_node rb_node; /* address sorted rbtree */
+ struct list_head list; /* address sorted list */
+ struct list_head purge_list; /* "lazy purge" list */
+ struct vm_struct *vm;
+ struct rcu_head rcu_head;
+};
+
/*
* Highlevel APIs for driver use
*/
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d9bfc6c..5db0148 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1527,7 +1527,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(free_area, free_list);
VMCOREINFO_OFFSET(list_head, next);
VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vm_struct, addr);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 151da8a..72043d6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -249,17 +249,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04

-struct vmap_area {
- unsigned long va_start;
- unsigned long va_end;
- unsigned long flags;
- struct rb_node rb_node; /* address sorted rbtree */
- struct list_head list; /* address sorted list */
- struct list_head purge_list; /* "lazy purge" list */
- struct vm_struct *vm;
- struct rcu_head rcu_head;
-};
-
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
--
1.8.0.2