2006-03-24 18:17:58

by Jeff Dike

[permalink] [raw]
Subject: [PATCH 12/16] UML - Memory hotplug

This adds hotplug memory support to UML. The mconsole syntax is
config mem=[+-]n[KMG]
In other words, add or subtract some number of kilobytes, megabytes, or
gigabytes.

Unplugged pages are allocated and then dropped with madvise(MADV_REMOVE), which
is a currently experimental madvise extension. These pages are tracked so
they can be plugged back in later if the admin decides to give them back.
The first page to be unplugged is used to keep track of about 4M of other
pages. A list_head is the first thing on this page. The rest is filled
with addresses of other unplugged pages. This first page is not madvised,
obviously.
When this page is filled, the next page is used in a similar way and linked
onto a list with the first page. Etc.
This whole process reverses when pages are plugged back in. When a tracking
page no longer tracks any unplugged pages, then it is next in line for
plugging, which is done by freeing pages back to the kernel.

This patch also removes checking for /dev/anon on the host, which is obsoleted
by MADV_REMOVE.

Signed-off-by: Jeff Dike <[email protected]>

Index: linux-2.6.16/arch/um/drivers/mconsole_kern.c
===================================================================
--- linux-2.6.16.orig/arch/um/drivers/mconsole_kern.c 2006-03-23 17:15:05.000000000 -0500
+++ linux-2.6.16/arch/um/drivers/mconsole_kern.c 2006-03-23 17:39:21.000000000 -0500
@@ -20,6 +20,8 @@
#include "linux/namei.h"
#include "linux/proc_fs.h"
#include "linux/syscalls.h"
+#include "linux/list.h"
+#include "linux/mm.h"
#include "linux/console.h"
#include "asm/irq.h"
#include "asm/uaccess.h"
@@ -347,6 +349,139 @@ static struct mc_device *mconsole_find_d
return(NULL);
}

+#define UNPLUGGED_PER_PAGE \
+ ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(unsigned long))
+
+struct unplugged_pages {
+ struct list_head list;
+ void *pages[UNPLUGGED_PER_PAGE];
+};
+
+static unsigned long long unplugged_pages_count = 0;
+static struct list_head unplugged_pages = LIST_HEAD_INIT(unplugged_pages);
+static int unplug_index = UNPLUGGED_PER_PAGE;
+
+static int mem_config(char *str)
+{
+ unsigned long long diff;
+ int err = -EINVAL, i, add;
+ char *ret;
+
+ if(str[0] != '=')
+ goto out;
+
+ str++;
+ if(str[0] == '-')
+ add = 0;
+ else if(str[0] == '+'){
+ add = 1;
+ }
+ else goto out;
+
+ str++;
+ diff = memparse(str, &ret);
+ if(*ret != '\0')
+ goto out;
+
+ diff /= PAGE_SIZE;
+
+ for(i = 0; i < diff; i++){
+ struct unplugged_pages *unplugged;
+ void *addr;
+
+ if(add){
+ if(list_empty(&unplugged_pages))
+ break;
+
+ unplugged = list_entry(unplugged_pages.next,
+ struct unplugged_pages, list);
+ if(unplug_index > 0)
+ addr = unplugged->pages[--unplug_index];
+ else {
+ list_del(&unplugged->list);
+ addr = unplugged;
+ unplug_index = UNPLUGGED_PER_PAGE;
+ }
+
+ free_page((unsigned long) addr);
+ unplugged_pages_count--;
+ }
+ else {
+ struct page *page;
+
+ page = alloc_page(GFP_ATOMIC);
+ if(page == NULL)
+ break;
+
+ unplugged = page_address(page);
+ if(unplug_index == UNPLUGGED_PER_PAGE){
+ INIT_LIST_HEAD(&unplugged->list);
+ list_add(&unplugged->list, &unplugged_pages);
+ unplug_index = 0;
+ }
+ else {
+ struct list_head *entry = unplugged_pages.next;
+ addr = unplugged;
+
+ unplugged = list_entry(entry,
+ struct unplugged_pages,
+ list);
+ unplugged->pages[unplug_index++] = addr;
+ err = os_drop_memory(addr, PAGE_SIZE);
+ if(err)
+ printk("Failed to release memory - "
+ "errno = %d\n", err);
+ }
+
+ unplugged_pages_count++;
+ }
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+static int mem_get_config(char *name, char *str, int size, char **error_out)
+{
+ char buf[sizeof("18446744073709551615\0")];
+ int len = 0;
+
+ sprintf(buf, "%ld", uml_physmem);
+ CONFIG_CHUNK(str, size, len, buf, 1);
+
+ return len;
+}
+
+static int mem_id(char **str, int *start_out, int *end_out)
+{
+ *start_out = 0;
+ *end_out = 0;
+
+ return 0;
+}
+
+static int mem_remove(int n)
+{
+ return -EBUSY;
+}
+
+static struct mc_device mem_mc = {
+ .name = "mem",
+ .config = mem_config,
+ .get_config = mem_get_config,
+ .id = mem_id,
+ .remove = mem_remove,
+};
+
+static int mem_mc_init(void)
+{
+ mconsole_register_dev(&mem_mc);
+ return 0;
+}
+
+__initcall(mem_mc_init);
+
#define CONFIG_BUF_SIZE 64

static void mconsole_get_config(int (*get_config)(char *, char *, int,
Index: linux-2.6.16/arch/um/include/os.h
===================================================================
--- linux-2.6.16.orig/arch/um/include/os.h 2006-03-23 17:35:56.000000000 -0500
+++ linux-2.6.16/arch/um/include/os.h 2006-03-23 17:39:21.000000000 -0500
@@ -205,6 +205,7 @@ extern int os_map_memory(void *virt, int
extern int os_protect_memory(void *addr, unsigned long len,
int r, int w, int x);
extern int os_unmap_memory(void *addr, int len);
+extern int os_drop_memory(void *addr, int length);
extern void os_flush_stdout(void);

/* tt.c
Index: linux-2.6.16/arch/um/os-Linux/process.c
===================================================================
--- linux-2.6.16.orig/arch/um/os-Linux/process.c 2006-03-23 17:15:05.000000000 -0500
+++ linux-2.6.16/arch/um/os-Linux/process.c 2006-03-23 17:39:21.000000000 -0500
@@ -187,6 +187,20 @@ int os_unmap_memory(void *addr, int len)
return(0);
}

+#ifndef MADV_REMOVE
+#define MADV_REMOVE 0x5 /* remove these pages & resources */
+#endif
+
+int os_drop_memory(void *addr, int length)
+{
+ int err;
+
+ err = madvise(addr, length, MADV_REMOVE);
+ if(err < 0)
+ err = -errno;
+ return 0;
+}
+
void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int))
{
int flags = 0, pages;
Index: linux-2.6.16/arch/um/include/mem_user.h
===================================================================
--- linux-2.6.16.orig/arch/um/include/mem_user.h 2006-03-23 17:15:05.000000000 -0500
+++ linux-2.6.16/arch/um/include/mem_user.h 2006-03-23 17:39:21.000000000 -0500
@@ -49,7 +49,6 @@ extern int iomem_size;
extern unsigned long host_task_size;
extern unsigned long task_size;

-extern void check_devanon(void);
extern int init_mem_user(void);
extern void setup_memory(void *entry);
extern unsigned long find_iomem(char *driver, unsigned long *len_out);
Index: linux-2.6.16/arch/um/os-Linux/mem.c
===================================================================
--- linux-2.6.16.orig/arch/um/os-Linux/mem.c 2006-03-23 17:15:05.000000000 -0500
+++ linux-2.6.16/arch/um/os-Linux/mem.c 2006-03-23 17:39:21.000000000 -0500
@@ -121,36 +121,11 @@ int create_tmp_file(unsigned long long l
return(fd);
}

-static int create_anon_file(unsigned long long len)
-{
- void *addr;
- int fd;
-
- fd = open("/dev/anon", O_RDWR);
- if(fd < 0) {
- perror("opening /dev/anon");
- exit(1);
- }
-
- addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
- if(addr == MAP_FAILED){
- perror("mapping physmem file");
- exit(1);
- }
- munmap(addr, len);
-
- return(fd);
-}
-
-extern int have_devanon;
-
int create_mem_file(unsigned long long len)
{
int err, fd;

- if(have_devanon)
- fd = create_anon_file(len);
- else fd = create_tmp_file(len);
+ fd = create_tmp_file(len);

err = os_set_exec_close(fd, 1);
if(err < 0){
Index: linux-2.6.16/arch/um/os-Linux/start_up.c
===================================================================
--- linux-2.6.16.orig/arch/um/os-Linux/start_up.c 2006-03-23 17:23:54.000000000 -0500
+++ linux-2.6.16/arch/um/os-Linux/start_up.c 2006-03-23 17:39:21.000000000 -0500
@@ -470,25 +470,6 @@ int can_do_skas(void)
}
#endif

-int have_devanon = 0;
-
-/* Runs on boot kernel stack - already safe to use printk. */
-
-void check_devanon(void)
-{
- int fd;
-
- printk("Checking for /dev/anon on the host...");
- fd = open("/dev/anon", O_RDWR);
- if(fd < 0){
- printk("Not available (open failed with errno %d)\n", errno);
- return;
- }
-
- printk("OK\n");
- have_devanon = 1;
-}
-
int __init parse_iomem(char *str, int *add)
{
struct iomem_region *new;
@@ -664,6 +645,5 @@ void os_check_bugs(void)
{
check_ptrace();
check_sigio();
- check_devanon();
}



2006-03-24 22:43:25

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 12/16] UML - Memory hotplug

Jeff Dike <[email protected]> wrote:
>
> This adds hotplug memory support to UML. The mconsole syntax is
> config mem=[+-]n[KMG]
> In other words, add or subtract some number of kilobytes, megabytes, or
> gigabytes.
>
> Unplugged pages are allocated and then madvise(MADV_REMOVE), which is
> a currently experimental madvise extension. These pages are tracked so
> they can be plugged back in later if the admin decides to give them back.
> The first page to be unplugged is used to keep track of about 4M of other
> pages. A list_head is the first thing on this page. The rest is filled
> with addresses of other unplugged pages. This first page is not madvised,
> obviously.
> When this page is filled, the next page is used in a similar way and linked
> onto a list with the first page. Etc.
> This whole process reverses when pages are plugged back in. When a tracking
> page no longer tracks any unplugged pages, then it is next in line for
> plugging, which is done by freeing pages back to the kernel.
>
> This patch also removes checking for /dev/anon on the host, which is obsoleted
> by MADVISE_REMOVE.
>
> ...
>
> +static unsigned long long unplugged_pages_count = 0;

The `= 0;' causes this to consume space in vmlinux's .data. If we put it
in bss and let crt0.o take care of zeroing it, we save a little disk space.


> + page = alloc_page(GFP_ATOMIC);

That's potentially quite a few atomically-allocated pages. I guess UML is
more resistant to oom than normal kernels (?) but it'd be nice to be able to
run page reclaim here.

> + char buf[sizeof("18446744073709551615\0")];

rofl. We really ought to have a #define for "this architecture's maximum
length of an asciified int/long/s32/s64". Generally people do
guess-and-giggle-plus-20%, or they just get it wrong.

> +#ifndef MADV_REMOVE
> +#define MADV_REMOVE 0x5 /* remove these pages & resources */
> +#endif
> +
> +int os_drop_memory(void *addr, int length)
> +{
> + int err;
> +
> + err = madvise(addr, length, MADV_REMOVE);
> + if(err < 0)
> + err = -errno;
> + return 0;
> +}

* NOTE: Currently, only shmfs/tmpfs is supported for this operation.
* Other filesystems return -ENOSYS.

Are you expecting that this memory is backed by tmpfs?

2006-03-24 23:59:00

by Blaisorblade

[permalink] [raw]
Subject: Re: [uml-devel] Re: [PATCH 12/16] UML - Memory hotplug

On Friday 24 March 2006 23:45, Andrew Morton wrote:
> Jeff Dike <[email protected]> wrote:

> > Unplugged pages are allocated and then madvise(MADV_REMOVE),

> > This patch also removes checking for /dev/anon on the host, which is
> > obsoleted by MADVISE_REMOVE.

> * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
> * Other filesystems return -ENOSYS.

> Are you expecting that this memory is backed by tmpfs?

Yes, that's the recommended configuration, and we're going to move the default
position for the backing file to /dev/shm.

However, it's bogus to omit all error handling - possibly not returning err is
intended (dunno) because Jeff wants to pretend it succeeded anyway, but at
least a printk() to inform the user that the memory must reside on tmpfs is
required.

> +int os_drop_memory(void *addr, int length)
> +{
> + int err;
> +
> + err = madvise(addr, length, MADV_REMOVE);
> + if(err < 0)
> + err = -errno;

Jeff, did you mean the "return _0_" rather than "return err" below? It's
inconsistent with the existence of the "err" local.

> + return 0;
> +}

--
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade





___________________________________
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB
http://mail.yahoo.it

2006-03-25 01:04:24

by Jeff Dike

[permalink] [raw]
Subject: Re: [PATCH 12/16] UML - Memory hotplug

On Fri, Mar 24, 2006 at 02:45:35PM -0800, Andrew Morton wrote:
> The `= 0;' causes this to consume space in vmlinux's .data. If we put it
> in bss and let crt0.o take care of zeroing it, we save a little disk space.

Yup.

> > + page = alloc_page(GFP_ATOMIC);
>
> That's potentially quite a few atomically-allocated pages. I guess UML is
> more resistant to oom than normal kernels (?) but it'd be nice to be able to
> run page reclaim here.

This is the big question with this patch. How incestuous do I want to
get with the VM system in order to get it to free up pages? For now,
I decided to be fairly hands-off, allocate as many pages as I can get,
and return the total number to the host. The host, if it wasn't happy
with the results, can wait a bit while the UML notices that it is
really low on memory and frees some up, and then hit up the UML for
the remainder.

> > + char buf[sizeof("18446744073709551615\0")];
>
> rofl. We really ought to have a #define for "this architecture's maximum
> length of an asciified int/long/s32/s64". Generally people do
> guess-and-giggle-plus-20%, or they just get it wrong.

I can write one up. I did some quick grepping, and there are a good
number of constant over-estimates, plus some which might be in danger
with a large number of devices, plus one (kallsyms.c) which actually
does some sane-looking approximate math to get a reasonable number (which
is then doubled).

> * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
> * Other filesystems return -ENOSYS.
>
> Are you expecting that this memory is backed by tmpfs?

Yes, but there should be some checking of this beforehand.

Drop this version for now, and I'll send a new one to cover these
problems plus the one that BlaisorBlade pointed out.

Jeff

2006-03-25 01:17:57

by Jeff Dike

[permalink] [raw]
Subject: Re: [uml-devel] Re: [PATCH 12/16] UML - Memory hotplug

On Sat, Mar 25, 2006 at 12:58:57AM +0100, Blaisorblade wrote:
> > +int os_drop_memory(void *addr, int length)
> > +{
> > + int err;
> > +
> > + err = madvise(addr, length, MADV_REMOVE);
> > + if(err < 0)
> > + err = -errno;
>
> Jeff, did you mean the "return _0_" rather than "return err" below? It's
> incoherent with the existance of the "err" local.
>
> > + return 0;
> > +}

That's just a brain fart - will fix.

Jeff

2006-03-25 19:26:32

by Jan Engelhardt

[permalink] [raw]
Subject: Re: [PATCH 12/16] UML - Memory hotplug


>> + char buf[sizeof("18446744073709551615\0")];
>
>rofl. We really ought to have a #define for "this architecture's maximum
>length of an asciified int/long/s32/s64". Generally people do
>guess-and-giggle-plus-20%, or they just get it wrong.

And this one seems wrong[*] to me too (making it a rofl²).
It is two chars (or one[*]) too long.

Consider this test:

#include <stdio.h>
#include <string.h>
int main(void) {
printf("%d\n", sizeof("18446744073709551615\0"));
printf("%d\n", sizeof("18446744073709551615"));
printf("%d\n", strlen("18446744073709551615"));
}

Which will print, when executed,

22
21
20 (the "pure string" length)

[*] Depending on what the original author wanted.



Jan Engelhardt
--

2006-03-25 20:07:49

by Jeff Dike

[permalink] [raw]
Subject: Re: [PATCH 12/16] UML - Memory hotplug

On Sat, Mar 25, 2006 at 08:26:08PM +0100, Jan Engelhardt wrote:
>
> >> + char buf[sizeof("18446744073709551615\0")];
>
> And this one seems wrong[*] to me too (making it a rofl²).
> It is two chars (or one[*]) too long.

One - it needs to be NUL-terminated.
>
> Consider this test:
>
> #include <stdio.h>
> #include <string.h>
> int main(void) {
> printf("%d\n", sizeof("18446744073709551615\0"));
> printf("%d\n", sizeof("18446744073709551615"));
> printf("%d\n", strlen("18446744073709551615"));
> }
>
> Which will print, when executed,
>
> 22
> 21
> 20 (the "pure string" length)
>

Oops, I was basing this on a hazy (too hazy, apparently) recollection
that the C standard specified sizeof("literal string") as being the
pure string length. Now that I'm actually thinking about it, the
actual behavior makes much more sense.

Thanks for checking this out in time for me to fix it in my revised
patch.

Jeff