2002-10-10 01:10:06

by Matthew Dobson

Subject: [rfc][patch] Memory Binding API v0.3 2.5.41

diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/arch/i386/kernel/entry.S linux-2.5.41-memory_binding_api/arch/i386/kernel/entry.S
--- linux-2.5.41-vanilla/arch/i386/kernel/entry.S Mon Oct 7 11:23:58 2002
+++ linux-2.5.41-memory_binding_api/arch/i386/kernel/entry.S Wed Oct 9 17:54:31 2002
@@ -736,6 +736,8 @@
.long sys_alloc_hugepages /* 250 */
.long sys_free_hugepages
.long sys_exit_group
+ .long sys_mem_setbinding
+ .long sys_mem_getbinding

.rept NR_syscalls-(.-sys_call_table)/4
.long sys_ni_syscall
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/arch/i386/kernel/numaq.c linux-2.5.41-memory_binding_api/arch/i386/kernel/numaq.c
--- linux-2.5.41-vanilla/arch/i386/kernel/numaq.c Mon Oct 7 11:23:33 2002
+++ linux-2.5.41-memory_binding_api/arch/i386/kernel/numaq.c Wed Oct 9 17:54:16 2002
@@ -52,6 +52,10 @@
numnodes = 0;
for(node = 0; node < MAX_NUMNODES; node++) {
if(scd->quads_present31_0 & (1 << node)) {
+ if (test_and_set_bit(numnodes, &node_online_map)){
+ printk("smp_dump_qct: node alread counted?!?!\n");
+ BUG();
+ }
numnodes++;
eq = &scd->eq[node];
/* Convert to pages */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/arch/i386/kernel/smpboot.c linux-2.5.41-memory_binding_api/arch/i386/kernel/smpboot.c
--- linux-2.5.41-vanilla/arch/i386/kernel/smpboot.c Mon Oct 7 11:24:14 2002
+++ linux-2.5.41-memory_binding_api/arch/i386/kernel/smpboot.c Wed Oct 9 17:54:16 2002
@@ -61,6 +61,10 @@

/* Bitmask of currently online CPUs */
unsigned long cpu_online_map;
+/* Bitmask of currently online memory blocks */
+unsigned long memblk_online_map = 0UL;
+/* Bitmask of currently online nodes */
+unsigned long node_online_map = 0UL;

static volatile unsigned long cpu_callin_map;
volatile unsigned long cpu_callout_map;
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/asm-i386/smp.h linux-2.5.41-memory_binding_api/include/asm-i386/smp.h
--- linux-2.5.41-vanilla/include/asm-i386/smp.h Mon Oct 7 11:23:22 2002
+++ linux-2.5.41-memory_binding_api/include/asm-i386/smp.h Wed Oct 9 17:54:16 2002
@@ -54,6 +54,8 @@
extern void smp_alloc_memory(void);
extern unsigned long phys_cpu_present_map;
extern unsigned long cpu_online_map;
+extern unsigned long memblk_online_map;
+extern unsigned long node_online_map;
extern volatile unsigned long smp_invalidate_needed;
extern int pic_mode;
extern int smp_num_siblings;
@@ -102,6 +104,20 @@
return -1;
}

+#define memblk_online(memblk) (memblk_online_map & (1<<(memblk)))
+
+extern inline unsigned int num_online_memblks(void)
+{
+ return hweight32(memblk_online_map);
+}
+
+#define node_online(node) (node_online_map & (1<<(node)))
+
+extern inline unsigned int num_online_nodes(void)
+{
+ return hweight32(node_online_map);
+}
+
static __inline int hard_smp_processor_id(void)
{
/* we don't want to mark this access volatile - bad code generation */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/asm-i386/unistd.h linux-2.5.41-memory_binding_api/include/asm-i386/unistd.h
--- linux-2.5.41-vanilla/include/asm-i386/unistd.h Mon Oct 7 11:24:44 2002
+++ linux-2.5.41-memory_binding_api/include/asm-i386/unistd.h Wed Oct 9 17:54:31 2002
@@ -257,6 +257,8 @@
#define __NR_alloc_hugepages 250
#define __NR_free_hugepages 251
#define __NR_exit_group 252
+#define __NR_mem_setbinding 253
+#define __NR_mem_getbinding 254

/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */

diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/init_task.h linux-2.5.41-memory_binding_api/include/linux/init_task.h
--- linux-2.5.41-vanilla/include/linux/init_task.h Mon Oct 7 11:23:25 2002
+++ linux-2.5.41-memory_binding_api/include/linux/init_task.h Wed Oct 9 17:54:08 2002
@@ -76,6 +76,10 @@
.children = LIST_HEAD_INIT(tsk.children), \
.sibling = LIST_HEAD_INIT(tsk.sibling), \
.group_leader = &tsk, \
+ .memblk_binding = { \
+ .bitmask = MEMBLK_NO_BINDING, \
+ .behavior = MPOL_STRICT, \
+ }, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
.real_timer = { \
.function = it_real_fn \
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/membind.h linux-2.5.41-memory_binding_api/include/linux/membind.h
--- linux-2.5.41-vanilla/include/linux/membind.h Wed Dec 31 16:00:00 1969
+++ linux-2.5.41-memory_binding_api/include/linux/membind.h Wed Oct 9 17:54:08 2002
@@ -0,0 +1,50 @@
+/*
+ * linux/include/linux/membind.h
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <[email protected]>
+ */
+#ifndef _LINUX_MEMBIND_H
+#define _LINUX_MEMBIND_H
+
+#include <linux/types.h>
+
+#define MEMBLK_NO_BINDING (~0UL)
+
+typedef struct memblk_list {
+ unsigned long bitmask;
+ int behavior;
+} memblk_list_t;
+
+
+#define is_valid_memblk_behavior(x) (1) /* for now */
+#define is_memblk_subset(x, y) (!(~(x) & (y))) /* test whether y is a subset of x */
+
+#define MPOL_STRICT 0 /* Memory MUST be allocated according to binding */
+#define MPOL_LOOSE 1 /* Memory will be allocated according to binding, but
+ can fall back to other memory blocks if necessary. */
+#define MPOL_FIRST 2 /* UNUSED FOR NOW */
+#define MPOL_STRIPE 4 /* UNUSED FOR NOW */
+#define MPOL_RR 8 /* UNUSED FOR NOW */
+
+#endif /* _LINUX_MEMBIND_H */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/mmzone.h linux-2.5.41-memory_binding_api/include/linux/mmzone.h
--- linux-2.5.41-vanilla/include/linux/mmzone.h Mon Oct 7 11:22:55 2002
+++ linux-2.5.41-memory_binding_api/include/linux/mmzone.h Wed Oct 9 17:54:25 2002
@@ -167,8 +167,9 @@
unsigned long node_start_pfn;
unsigned long node_size;
int node_id;
+ int memblk_id; /* A unique ID for each memory block */
struct pglist_data *pgdat_next;
- wait_queue_head_t kswapd_wait;
+ wait_queue_head_t kswapd_wait;
} pg_data_t;

extern int numnodes;
@@ -249,6 +250,26 @@
#define for_each_zone(zone) \
for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))

+/**
+ * for_each_valid_zone - helper macro to iterate over all memory zones
+ * in a zonelist
+ * @zone - pointer to struct zone variable
+ * @zonelist - pointer to struct zonelist variable
+ *
+ * for_each_valid_zone() is basically an easier to read version of this
+ * piece of code:
+ *
+ * for (i = 0; zonelist->zones[i] != NULL; i++) {
+ * struct zone *z = zonelist->zones[i];
+ * ...
+ * }
+ *
+ * Useful for several loops in __alloc_pages.
+ */
+#define for_each_valid_zone(zone, zonelist) \
+ for (zone = *zonelist->zones; zone; zone++) \
+ if (current->memblk_binding.bitmask & (1 << zone->zone_pgdat->memblk_id))
+
#ifdef CONFIG_NUMA
#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */
#else /* !CONFIG_NUMA */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/sched.h linux-2.5.41-memory_binding_api/include/linux/sched.h
--- linux-2.5.41-vanilla/include/linux/sched.h Mon Oct 7 11:23:25 2002
+++ linux-2.5.41-memory_binding_api/include/linux/sched.h Wed Oct 9 17:54:08 2002
@@ -29,6 +29,7 @@
#include <linux/compiler.h>
#include <linux/completion.h>
#include <linux/pid.h>
+#include <linux/membind.h>

struct exec_domain;

@@ -335,6 +336,9 @@
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];

+ /* additional Memory Binding stuff */
+ memblk_list_t memblk_binding;
+
wait_queue_head_t wait_chldexit; /* for wait4() */
struct completion *vfork_done; /* for vfork() */
int *user_tid; /* for CLONE_CLEARTID */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/smp.h linux-2.5.41-memory_binding_api/include/linux/smp.h
--- linux-2.5.41-vanilla/include/linux/smp.h Mon Oct 7 11:24:39 2002
+++ linux-2.5.41-memory_binding_api/include/linux/smp.h Wed Oct 9 17:54:16 2002
@@ -94,7 +94,13 @@
#define cpu_online(cpu) ({ BUG_ON((cpu) != 0); 1; })
#define num_online_cpus() 1
#define num_booting_cpus() 1
-#define cpu_possible(cpu) ({ BUG_ON((cpu) != 0); 1; })
+#define cpu_possible(cpu) ({ BUG_ON((cpu) != 0); 1; })
+#define memblk_online_map 1
+#define memblk_online(memblk) ({ BUG_ON((memblk) != 0); 1; })
+#define num_online_memblks() 1
+#define node_online_map 1
+#define node_online(node) ({ BUG_ON((node) != 0); 1; })
+#define num_online_nodes() 1

struct notifier_block;

diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/kernel/sys.c linux-2.5.41-memory_binding_api/kernel/sys.c
--- linux-2.5.41-vanilla/kernel/sys.c Mon Oct 7 11:23:25 2002
+++ linux-2.5.41-memory_binding_api/kernel/sys.c Wed Oct 9 17:54:31 2002
@@ -1303,6 +1303,96 @@
return mask;
}

+/**
+ * sys_mem_setbinding - set the memory binding of a process
+ * @pid: pid of the process
+ * @memblks: new bitmask of memory blocks
+ * @behavior: new behavior
+ */
+asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
+ unsigned int behavior)
+{
+ long ret;
+ struct task_struct *p;
+
+ /*
+ * Make sure that at least one of the memblks in the
+ * new mask is online.
+ */
+ memblks &= memblk_online_map;
+ if (!memblks)
+ return -EINVAL;
+
+ /*
+ * Test to make sure the behavior argument is valid.
+ */
+ if (!is_valid_memblk_behavior(behavior))
+ return -EINVAL;
+
+ read_lock(&tasklist_lock);
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * The caller must either own the process or have CAP_SYS_NICE.
+ */
+ ret = -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;
+
+ ret = 0;
+ current->memblk_binding.bitmask = memblks;
+ current->memblk_binding.behavior = behavior;
+
+out_unlock:
+ put_task_struct(p);
+ return ret;
+}
+
+/**
+ * sys_mem_getbinding - get the memory binding of a process
+ * @pid: pid of the process
+ * @user_bitmask: bitmask of memory blocks
+ * @user_behavior: behavior
+ */
+asmlinkage long sys_mem_getbinding(pid_t pid, unsigned long *user_bitmask,
+ unsigned int *user_behavior)
+{
+ long ret;
+ unsigned long bitmask;
+ unsigned int behavior;
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+
+ ret = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ ret = 0;
+ bitmask = p->memblk_binding.bitmask;
+ behavior = p->memblk_binding.behavior;
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ if (ret)
+ return ret;
+ if (copy_to_user(user_bitmask, &bitmask, sizeof(unsigned long)))
+ return -EFAULT;
+ if (copy_to_user(user_behavior, &behavior, sizeof(unsigned int)))
+ return -EFAULT;
+ return ret;
+}
+
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/mm/numa.c linux-2.5.41-memory_binding_api/mm/numa.c
--- linux-2.5.41-vanilla/mm/numa.c Mon Oct 7 11:24:50 2002
+++ linux-2.5.41-memory_binding_api/mm/numa.c Wed Oct 9 17:54:16 2002
@@ -8,6 +8,7 @@
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>
+#include <linux/membind.h>

int numnodes = 1; /* Initialized for UMA platforms */

@@ -29,6 +30,7 @@

pgdat = &contig_page_data;
contig_page_data.node_id = 0;
+ contig_page_data.memblk_id = 0;
contig_page_data.node_start_pfn = node_start_pfn;
calculate_totalpages (&contig_page_data, zones_size, zholes_size);
if (pmap == (struct page *)0) {
@@ -37,6 +39,7 @@
}
contig_page_data.node_mem_map = pmap;
free_area_init_core(&contig_page_data, zones_size, zholes_size);
+ memblk_online_map = 1UL;
mem_map = contig_page_data.node_mem_map;
}

@@ -66,6 +69,7 @@
unsigned long size;

pgdat->node_id = nid;
+ pgdat->memblk_id = __node_to_memblk(nid);
pgdat->node_start_pfn = node_start_pfn;
calculate_totalpages (pgdat, zones_size, zholes_size);
if (pmap == (struct page *)0) {
@@ -74,6 +78,10 @@
}
pgdat->node_mem_map = pmap;
free_area_init_core(pgdat, zones_size, zholes_size);
+ if (test_and_set_bit(num_online_memblks(), &memblk_online_map)){
+ printk("free_area_init_core: memblk alread counted?!?!\n");
+ BUG();
+ }

/*
* Get space for the valid bitmap.
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/mm/page_alloc.c linux-2.5.41-memory_binding_api/mm/page_alloc.c
--- linux-2.5.41-vanilla/mm/page_alloc.c Mon Oct 7 11:23:24 2002
+++ linux-2.5.41-memory_binding_api/mm/page_alloc.c Wed Oct 9 17:54:25 2002
@@ -318,57 +318,46 @@
struct zonelist *zonelist)
{
unsigned long min;
- struct zone **zones, *classzone;
+ struct zone *classzone, *zone;
struct page * page;
- int freed, i;
+ int freed;

if (gfp_mask & __GFP_WAIT)
might_sleep();

- mod_page_state(pgalloc, 1<<order);
+ mod_page_state(pgalloc, 1UL << order);

- zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
- classzone = zones[0];
+ classzone = zonelist->zones[0];
if (classzone == NULL) /* no zones in the zonelist */
return NULL;

/* Go through the zonelist once, looking for a zone with enough free */
min = 1UL << order;
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
-
+ for_each_valid_zone(zone, zonelist) {
/* the incremental min is allegedly to discourage fallback */
- min += z->pages_low;
- if (z->free_pages > min || z->free_pages >= z->pages_high) {
- page = rmqueue(z, order);
- if (page)
+ min += zone->pages_low;
+ if (zone->free_pages > min || zone->free_pages >= zone->pages_high)
+ if (page = rmqueue(zone, order))
return page;
- }
}

/* we're somewhat low on memory, failed to find what we needed */
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
- if (z->free_pages <= z->pages_low &&
- waitqueue_active(&z->zone_pgdat->kswapd_wait))
- wake_up_interruptible(&z->zone_pgdat->kswapd_wait);
+ for_each_valid_zone(zone, zonelist) {
+ if (zone->free_pages <= zone->pages_low &&
+ waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+ wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
}

/* Go through the zonelist again, taking __GFP_HIGH into account */
min = 1UL << order;
- for (i = 0; zones[i] != NULL; i++) {
- unsigned long local_min;
- struct zone *z = zones[i];
-
- local_min = z->pages_min;
+ for_each_valid_zone(zone, zonelist) {
if (gfp_mask & __GFP_HIGH)
- local_min >>= 2;
- min += local_min;
- if (z->free_pages > min || z->free_pages >= z->pages_high) {
- page = rmqueue(z, order);
- if (page)
+ min += zone->pages_min >> 2;
+ else
+ min += zone->pages_min;
+ if (zone->free_pages > min || zone->free_pages >= zone->pages_high)
+ if (page = rmqueue(zone, order))
return page;
- }
}

/* here we're in the low on memory slow path */
@@ -376,13 +365,9 @@
rebalance:
if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
-
- page = rmqueue(z, order);
- if (page)
+ for_each_valid_zone(zone, zonelist)
+ if (page = rmqueue(zone, order))
return page;
- }
nopage:
if (!(current->flags & PF_NOWARN)) {
printk("%s: page allocation failure."
@@ -403,15 +388,11 @@

/* go through the zonelist yet one more time */
min = 1UL << order;
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
-
- min += z->pages_min;
- if (z->free_pages > min || z->free_pages >= z->pages_high) {
- page = rmqueue(z, order);
- if (page)
+ for_each_valid_zone(zone, zonelist) {
+ min += zone->pages_min;
+ if (zone->free_pages > min || zone->free_pages >= zone->pages_high)
+ if (page = rmqueue(zone, order))
return page;
- }
}

/* Don't let big-order allocations loop */



2002-10-10 02:59:41

by Andrew Morton

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Matthew Dobson wrote:
>
> Greetings & Salutations,
> Here's a wonderful patch that I know you're all dying for... Memory
> Binding!


Seems reasonable to me.

Could you tell us a bit about the operator's view of this?

I assume that a typical usage scenario would be to bind a process
to a bunch of CPUs and to then bind that process to a bunch of
memblks as well?

If so, then how does the operator know how to identify those
memblks? To perform the (cpu list) <-> (memblk list) mapping?

Also, what advantage does this provide over the current node-local
allocation policy? I'd have thought that once you'd bound a
process to a CPU (or to a node's CPUs) that as long as the zone
fallback list was right, that process would be getting local memory
pretty much all the time anyway?

Last but not least: you got some benchmark numbers for this?

Thanks.

2002-10-10 04:03:26

by Martin J. Bligh

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41


> +#define for_each_valid_zone(zone, zonelist) \
> + for (zone = *zonelist->zones; zone; zone++) \
> + if (current->memblk_binding.bitmask & (1 << zone->zone_pgdat->memblk_id))

Does the compiler optimise the last bit away on non-NUMA?
Want to wrap it in #ifdef CONFIG_NUMA_MEMBIND or something?
Not sure what the speed impact of this would be, but I'd
rather it was optional, even on NUMA boxen.

Other than that, looks pretty good.

M.


2002-10-10 08:54:20

by Arjan van de Ven

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

On Thu, 2002-10-10 at 03:12, Matthew Dobson wrote:
> Greetings & Salutations,
> Here's a wonderful patch that I know you're all dying for... Memory
> Binding! It works just like CPU Affinity (binding) except that it binds
> a processes memory allocations (just buddy allocator for now) to
> specific memory blocks.

If the VM works right just doing CPU binding ought to be enough, surely?



2002-10-10 10:00:40

by Arjan van de Ven

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41


> +/**
> + * sys_mem_setbinding - set the memory binding of a process
> + * @pid: pid of the process
> + * @memblks: new bitmask of memory blocks
> + * @behavior: new behavior
> + */
> +asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
> + unsigned int behavior)
> +{

Do you really think exposing low level internals as memory layout / zone
split up to userspace is a good idea ? (and worth it given that the VM
already has a cpu locality preference?)

I'd much rather see the VM have an arch-specified "cost" for getting
memory from not-the-prefered zones than exposing all this stuff to
userspace and depending on userspace to do the right thing.... it's the
kernel's task to abstract the low level details of the hardware after
all.

Greetings,
Arjan van de Ven



2002-10-10 11:06:28

by Alan

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

On Thu, 2002-10-10 at 11:06, Arjan van de Ven wrote:
>
> > +/**
> > + * sys_mem_setbinding - set the memory binding of a process
> > + * @pid: pid of the process
> > + * @memblks: new bitmask of memory blocks
> > + * @behavior: new behavior
> > + */
> > +asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
> > + unsigned int behavior)
> > +{
>
> Do you really think exposing low level internals as memory layout / zone
> split up to userspace is a good idea ? (and worth it given that the VM
> already has a cpu locality preference?)

At least in the embedded world that level is a good idea. I'm not sure
about the syscall interface. An "unsigned long" mask of blocks sounds
like a good way to ensure a broken syscall in the future

2002-10-10 11:26:21

by William Lee Irwin III

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

At some point in the past, Matthew Dobson wrote:
>>> +asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
>>> + unsigned int behavior)

On Thu, 2002-10-10 at 11:06, Arjan van de Ven wrote:
>> Do you really think exposing low level internals as memory layout / zone
>> split up to userspace is a good idea ? (and worth it given that the VM
>> already has a cpu locality preference?)

On Thu, Oct 10, 2002 at 12:22:51PM +0100, Alan Cox wrote:
> At least in the embedded world that level is a good idea. I'm not sure
> about the syscall interface. An "unsigned long" mask of blocks sounds
> like a good way to ensure a broken syscall in the future

Seconded wrt. memblk bitmask interface.

IMHO this level of topology exposure is not inappropriate. These kinds
of details are already exported (unidirectionally) by /proc/, if not
dmesg(1) and in my mind there is neither an aesthetic nor practical
barrier to referring to these machine features in userspace API's
in an advisory way (and absolutely not any kind of reliance). It's
simply another kind of request, and one which several high-performance
applications would like to make. I would also be interested in hearing
more of how embedded applications would make use of this, as my direct
experience in embedded systems is somewhat lacking.

Also, I've already privately replied with some of my stylistic concerns,
including things like the separability of the for_each_in_zonelist()
cleanup bundled into the patch and a typedef or so.


Bill

2002-10-10 18:26:50

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Andrew Morton wrote:
> Matthew Dobson wrote:
>>Greetings & Salutations,
>> Here's a wonderful patch that I know you're all dying for... Memory
>>Binding!
> Seems reasonable to me.
Good news!

> Could you tell us a bit about the operator's view of this?
>
> I assume that a typical usage scenario would be to bind a process
> to a bunch of CPUs and to then bind that process to a bunch of
> memblks as well?
>
> If so, then how does the operator know how to identify those
> memblks? To perform the (cpu list) <-> (memblk list) mapping?
Well, that's what the super-duper In-Kernel topology API is for! ;) If
the operator wanted to ensure that all the process's memory was *only*
allocated from the memblks closest to her bound CPUs, she'd loop over
her cpu binding, and for each set bit, she'd:
bitmask |= 1 << __node_to_memblk(__cpu_to_node(cpu));
I suppose that I could include a macro to do this in the patch, but I
was a bit afraid (and still am) that it already may be a bit large for
people's tastes. I've got some suggestions on how to split it up/pare
it down, so we'll see.

> Also, what advantage does this provide over the current node-local
> allocation policy? I'd have thought that once you'd bound a
> process to a CPU (or to a node's CPUs) that as long as the zone
> fallback list was right, that process would be getting local memory
> pretty much all the time anyway?
Very true... This is to specifically allow for processes that want to
do something *different* than the default policy. Again, akin to CPU
affinity, this is not something that the average process is going to
ever use, or even care about. The majority of processes don't
specifically bind themselves to certain CPUs or groups of CPUs, because
for them the default scheduler policies are fine. For the majority of
processes, the default memory allocation policy works just dandy. This
is for processes that want to do something different: Testing/debugging
apps on a large (likely NUMA) system, high-end databases, and who knows
what else? There is also a plan to add a function call to bind specific
regions of a process's memory to certain memblks, and this would allow
for efficient shared memory for process groups spread across a large system.
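
For the curious, a rough user-space sketch (glibc has no wrapper, so this
goes through syscall(); the syscall number and MPOL_STRICT value are from
the patch, everything else is purely illustrative):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define __NR_mem_setbinding 253
	#define MPOL_STRICT 0

	/* strictly bind this process to memory blocks 0 and 1 */
	if (syscall(__NR_mem_setbinding, getpid(),
		    (1UL << 0) | (1UL << 1), MPOL_STRICT) < 0)
		perror("mem_setbinding");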

> Last but not least: you got some benchmark numbers for this?
Nope.. It is not something that is going to (on average) improve
benchmark numbers for something like a kernel compile... As you
mentioned above, the default policy is to allocate from the local memory
block anyway. This API is more useful for something where you want to
pin your memory close to a particular process that your process is
working with, or pin your memory to a different node than the one you're
executing on to purposely test/debug something. If you'd like, I can do
some kernbench runs or something to come up with some numbers to show
that it doesn't negatively affect performance, but I don't know of any
benchmarking suites offhand that would show positive numbers.

> Thanks.
My pleasure! ;)

Cheers!

-Matt

2002-10-10 18:41:56

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Martin J. Bligh wrote:
>>+#define for_each_valid_zone(zone, zonelist) \
>>+ for (zone = *zonelist->zones; zone; zone++) \
>>+ if (current->memblk_binding.bitmask & (1 << zone->zone_pgdat->memblk_id))
>
> Does the compiler optimise the last bit away on non-NUMA?
Nope.

> Want to wrap it in #ifdef CONFIG_NUMA_MEMBIND or something?
Not a problem... I've got some free time this afternoon... Should only
take me a few hours to retool the patch to include this change. ;)
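
Presumably something like this (untested, and taking CONFIG_NUMA_MEMBIND
as your straw-man name):

	#ifdef CONFIG_NUMA_MEMBIND
	#define for_each_valid_zone(zone, zonelist) \
		for (zone = *zonelist->zones; zone; zone++) \
			if (current->memblk_binding.bitmask & \
			    (1 << zone->zone_pgdat->memblk_id))
	#else
	#define for_each_valid_zone(zone, zonelist) \
		for (zone = *zonelist->zones; zone; zone++)
	#endif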

> Not sure what the speed impact of this would be, but I'd
> rather it was optional, even on NUMA boxen.
Sounds reasonable... It'll be in the next iteration.

> Other than that, looks pretty good.
Glad to hear!

> M.

2002-10-10 18:53:57

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Arjan van de Ven wrote:
> On Thu, 2002-10-10 at 03:12, Matthew Dobson wrote:
>
>>Greetings & Salutations,
>> Here's a wonderful patch that I know you're all dying for... Memory
>>Binding! It works just like CPU Affinity (binding) except that it binds
>>a process's memory allocations (just buddy allocator for now) to
>>specific memory blocks.
>
> If the VM works right just doing CPU binding ought to be enough, surely?
You'll have to look at the response I wrote to Andrew's question along
the same
lines... This patch is for processes who feel that the VM *isn't* doing
quite what they want, and want different behavior.

Cheers!

-Matt

2002-10-10 18:59:47

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Arjan van de Ven wrote:
>>+/**
>>+ * sys_mem_setbinding - set the memory binding of a process
>>+ * @pid: pid of the process
>>+ * @memblks: new bitmask of memory blocks
>>+ * @behavior: new behavior
>>+ */
>>+asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
>>+ unsigned int behavior)
>>+{
>
> Do you really think exposing low level internals as memory layout / zone
> split up to userspace is a good idea ? (and worth it given that the VM
> already has a cpu locality preference?)
Yes, I actually do. If userspace processes/users don't care about the
memory/zone layout, they don't have to look. But if they *do* care,
they should be able to find out, and not be left in the proverbial dark.
Embedded systems will care, as will really large NUMA/Discontig
systems. As with some other patches, this functionality will not affect
the average user on the average computer... They are useful for really
small systems, and really large systems, however.

> I'd much rather see the VM have an arch-specified "cost" for getting
> memory from not-the-prefered zones than exposing all this stuff to
> userspace and depending on userspace to do the right thing.... it's the
> kernel's task to abstract the low level details of the hardware after
> all.
The arch-specified 'cost' is also a good idea. Some of the topology
stuff I'm working on will allow that sort of interface as well. And
this patch does not 'depend' on userspace to do the right thing. The
patch does not alter the default VM behavior unless userspace
*specifically* asks to alter it.

Cheers!

-Matt

2002-10-10 19:10:14

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

William Lee Irwin III wrote:
> At some point in the past, Matthew Dobson wrote:
>
>>>>+asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
>>>>+ unsigned int behavior)
>>>
>
> On Thu, 2002-10-10 at 11:06, Arjan van de Ven wrote:
>
>>>Do you really think exposing low level internals as memory layout / zone
>>>split up to userspace is a good idea ? (and worth it given that the VM
>>>already has a cpu locality preference?)
>>
>
> On Thu, Oct 10, 2002 at 12:22:51PM +0100, Alan Cox wrote:
>
>>At least in the embedded world that level is a good idea. I'm not sure
>>about the syscall interface. An "unsigned long" mask of blocks sounds
>>like a good way to ensure a broken syscall in the future
> Seconded wrt. memblk bitmask interface.
Glad to have your support! :)

> Also, I've already privately replied with some of my stylistic concerns,
> including things like the separability of the for_each_in_zonelist()
> cleanup bundled into the patch and a typedef or so.
Some very good points in your email. Most (if not all) will be
incorporated in v0.4 (later today or tomorrow).

Cheers!

-Matt

2002-10-10 19:04:27

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Alan Cox wrote:
> On Thu, 2002-10-10 at 11:06, Arjan van de Ven wrote:
>
>>>+/**
>>>+ * sys_mem_setbinding - set the memory binding of a process
>>>+ * @pid: pid of the process
>>>+ * @memblks: new bitmask of memory blocks
>>>+ * @behavior: new behavior
>>>+ */
>>>+asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
>>>+ unsigned int behavior)
>>>+{
>>
>>Do you really think exposing low level internals as memory layout / zone
>>split up to userspace is a good idea ? (and worth it given that the VM
>>already has a cpu locality preference?)
>
> At least in the embedded world that level is a good idea. I'm not sure
> about the syscall interface. An "unsigned long" mask of blocks sounds
> like a good way to ensure a broken syscall in the future
Agreed. This is a first pass (well, 3rd, but the first two were long ago),
and I'll probably imitate the sys_sched_(s|g)etaffinity calls (even more
than I already have ;) and add a 'length' argument in the next iteration.
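
i.e. the prototype would presumably end up looking like
sys_sched_setaffinity()'s (sketch only, of course):

	asmlinkage long sys_mem_setbinding(pid_t pid, unsigned int len,
					   unsigned long *user_mask_ptr,
					   unsigned int behavior);

with the mask copied in from user space, so the mask size can grow later
without breaking the syscall.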

Cheers!

-Matt

2002-10-13 22:17:40

by Eric W. Biederman

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Matthew Dobson <[email protected]> writes:

> Greetings & Salutations,
> Here's a wonderful patch that I know you're all dying for... Memory
> Binding! It works just like CPU Affinity (binding) except that it binds a
> process's memory allocations (just buddy allocator for now) to specific memory
> blocks.
> I've sent this out in the past, but haven't touched it in months. Since
> the feature freeze is rapidly approaching, I want to get this out there again
> and see if anyone has any interest in it.
> It's a fairly large patch, mostly because it includes a few odds and
> ends that are topology related, and don't strictly belong in this patch, but are
> pre-requisites for it (ie: the [memblk|node]_online_map stuff, and some of the
> cleanups to page_alloc). I'll probably try and break it up into more discrete
> parts very soon.

Do we want this per numa area or simply per zone? My suspicion is that
internally at least we want this per zone.

> Questions, comments, flames, and indifferent shrugs are all welcome.
>
> btw, It applies (mostly) cleanly to mm1 as well. The mm/page_alloc.c changes
> fail, but if anyone is interested, they'll clean up easily, and I'll send you a
> patch.

The API doesn't make much sense at the moment.

1) You are operating on tasks and not mm's, or preferably vmas.
2) sys_mem_setbinding does not move the mm to the new binding.
3) You specify a pid and then change current task instead of
the specified one.

4) An ordered zone list is probably the more natural mapping.
5) mprotect is the more natural model rather than set_cpu_affinity.
6) The code belongs in mm/* not kernel/*

Eric

2002-10-15 00:12:30

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Eric W. Biederman wrote:
> Matthew Dobson <[email protected]> writes:
>>Greetings & Salutations,
>> Here's a wonderful patch that I know you're all dying for... Memory
>>Binding! It works just like CPU Affinity (binding) except that it binds a
>>process's memory allocations (just buddy allocator for now) to specific memory
>>blocks.
> Do we want this per numa area or simply per zone? My suspicion is that
> internally at least we want this per zone.
I think that per memory block is better. We already have a method for
allocating from specific zones (GFP_* flags). Also, using per zone
binding would involve setting up some way of enumerating the zones,
which would not be immediately obvious to the users of the API. The
memory block already has a straightforward definition and an easy way
for users to get the appropriate number for the appropriate block
(in-kernel topology). I'm not fanatically opposed to per zone binding,
though, and if there is a general agreement that it would be better that
way, I don't think it would be unreasonably difficult to change it.
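
(For instance, in-kernel code that wants ZONE_DMA pages can already just
say

	struct page *page = alloc_pages(GFP_KERNEL | GFP_DMA, 0);

so the zone axis is covered today; it's the memblk axis that has no
interface.)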

> The API doesn't make much sense at the moment.
Hmm.. That is unfortunate, I'd aimed to make it as simple as possible.

> 1) You are operating on tasks and not mm's, or preferably vmas.
Correct. There are plans (somewhere inside my cranium) to allow binding
at that granularity. For now, per task seemed an appropriate level.

> 2) sys_mem_setbinding does not move the mm to the new binding.
Also correct. A task may wish to allocate several large data structures
from one memory area, rebind, do more allocations, rebind, ad nauseam.
There are plans to have a flag that, if set, would force relocation of
all currently allocated memory.

> 3) You specify a pid and then change current task instead of
> the specified one.
Yep... That was definitely a typo... fixed.

> 4) An ordered zone list is probably the more natural mapping.
See my comments above about per zone/memblk. And you reemphasize my
point, how do we order the zone lists in such a way that a user of the
API can easily know/find out what zone #5 is?

> 5) mprotect is the more natural model rather than set_cpu_affinity.
Well, I think that may be true for the API you are imagining (per zone,
per mm/vma, etc), not the one that I've written.

> 6) The code belongs in mm/* not kernel/*
Possibly... I just stuck it in with the vast majority of other syscalls
in kernel/sys.c. As those changes are just code additions, they can
easily be moved if it is deemed appropriate.

Cheers!

-Matt

2002-10-15 00:17:14

by Martin J. Bligh

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41


>> 4) An ordered zone list is probably the more natural mapping.
> See my comments above about per zone/memblk. And you reemphasize my point, how do we order the zone lists in such a way that a user of the API can easily know/find out what zone #5 is?

Could you explain how that problem is different from finding out
what memblk #5 is ... I don't see the difference?

M.

2002-10-15 00:36:14

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Martin J. Bligh wrote:
>>>4) An ordered zone list is probably the more natural mapping.
>>
>>See my comments above about per zone/memblk. And you reemphasize my point, how do we order the zone lists in such a way that a user of the API can easily know/find out what zone #5 is?
> Could you explain how that problem is different from finding out
> what memblk #5 is ... I don't see the difference?
Errm... __memblk_to_node(5)

I"m not saying that we couldn't add a similar interface for zones...
something along the lines of:
__memblk_and_zone_to_flat_zone_number(5, DMA)
or some such. It just isn't there now...

Also, right now, memblks map to nodes in a straightforward manner (1-1
on NUMA-Q, the only architecture that has defined them). It will likely
look the same on most architectures, too.

Cheers!

-Matt

2002-10-15 00:39:21

by Martin J. Bligh

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

>>>> 4) An ordered zone list is probably the more natural mapping.
>>>
>>> See my comments above about per zone/memblk. And you reemphasize my point, how do we order the zone lists in such a way that a user of the API can easily know/find out what zone #5 is?
>> Could you explain how that problem is different from finding out
>> what memblk #5 is ... I don't see the difference?
> Errm... __memblk_to_node(5)

As opposed to creating __zone_to_node(5) ?

> I"m not saying that we couldn't add a similar interface for zones... something along the lines of:
> __memblk_and_zone_to_flat_zone_number(5, DMA)
> or some such. It just isn't there now...

Surely this would dispose of the need for memblks? If not, then
I'd agree it's probably just adding more complication.

M.

2002-10-15 00:49:00

by Matthew Dobson

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Martin J. Bligh wrote:
>>>>>4) An ordered zone list is probably the more natural mapping.
>>>>
>>>>See my comments above about per zone/memblk. And you reemphasize my point, how do we order the zone lists in such a way that a user of the API can easily know/find out what zone #5 is?
>>>
>>>Could you explain how that problem is different from finding out
>>>what memblk #5 is ... I don't see the difference?
>>
>>Errm... __memblk_to_node(5)
>
> As opposed to creating __zone_to_node(5) ?
>
>>I"m not saying that we couldn't add a similar interface for zones... something along the lines of:
>> __memblk_and_zone_to_flat_zone_number(5, DMA)
>>or some such. It just isn't there now...
>
> Surely this would dispose of the need for memblks? If not, then
> I'd agree it's probably just adding more complication.
Well, since each node's memory (or memblk in the parlance of my head ;)
has several 'zones' in it (DMA, HIGHMEM, etc), this conversion function
will need 2 parameters. It may well be called
__node_and_zone_type_to_flat_zone_number(node, DMA|NORMAL|HIGHMEM).

Or, we could have:
__zone_to_node(5) = node #
and
__zone_to_zone_type(5) = DMA|NORMAL|HIGHMEM.

But either way, we would need to specify both pieces.

Cheers!

-Matt

2002-10-15 01:00:11

by john stultz

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

On Mon, 2002-10-14 at 17:38, Matthew Dobson wrote:
> Also, right now, memblks map to nodes in a straightforward manner (1-1
> on NUMA-Q, the only architecture that has defined them). It will likely
> look the same on most architectures, too.

Just an FYI: I believe the x440 breaks this assumption.

There are 2 chunks on the first CEC. The current discontig patch for it
has to drop the second chunk (anything over 3.5G on the first CEC) in
order to work w/ the existing code. However, that will probably need to
be addressed at some point, so be aware that this might affect you as
well.

thanks
-john

2002-10-15 00:59:12

by William Lee Irwin III

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

On Mon, Oct 14, 2002 at 05:51:39PM -0700, Matthew Dobson wrote:
> Well, since each node's memory (or memblk in the parlance of my head ;)
> has several 'zones' in it (DMA, HIGHMEM, etc), this conversion function
> will need 2 parameters. It may well be called
> __node_and_zone_type_to_flat_zone_number(node, DMA|NORMAL|HIGHMEM).
> Or, we could have:
> __zone_to_node(5) = node #
> and
> __zone_to_zone_type(5) = DMA|NORMAL|HIGHMEM.
> But either way, we would need to specify both pieces.
> Cheers!
> -Matt

Zone "type" can be found in (page->flags >> ZONE_SHIFT) & 0x3UL and
similarly node ID can be found in page_zone(page)->zone_pgdat->node_id
and these are from the page.

zone->zone_pgdat->node_id does the zone to node conversion;
zone - zone->zone_pgdat->node_zones does the zone to zone type conversion.

Node and zone type to flat zone number would be
NODE_DATA(nid)->node_zones[type]

Basically there's a number written in page->flags that should be easy
to decode if you can go on arithmetic alone, and if you need details,
there's a zone_table[] you can get at the zones (and hence pgdats) with.
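
Concretely, those conversions are one-liners (helper names invented for
illustration only):

	static inline int zone_to_node(struct zone *zone)
	{
		return zone->zone_pgdat->node_id;
	}

	static inline int zone_to_zone_type(struct zone *zone)
	{
		return zone - zone->zone_pgdat->node_zones;
	}

	static inline struct zone *node_and_type_to_zone(int nid, int type)
	{
		return NODE_DATA(nid)->node_zones + type;
	}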


Bill

2002-10-15 01:06:50

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

On Mon, 2002-10-14 at 17:38, Matthew Dobson wrote:
>> Also, right now, memblks map to nodes in a straightforward manner (1-1
>> on NUMA-Q, the only architecture that has defined them). It will likely
>> look the same on most architectures, too.

On Mon, Oct 14, 2002 at 05:55:53PM -0700, john stultz wrote:
> Just an FYI: I believe the x440 breaks this assumption.
> There are 2 chunks on the first CEC. The current discontig patch for it
> has to drop the second chunk (anything over 3.5G on the first CEC) in
> order to work w/ the existing code. However, that will probably need to
> be addressed at some point, so be aware that this might affect you as
> well.

MAP_NR_DENSE()-based zone-relative pfn to zone->zone_mem_map index
remapping is designed to handle this (and actually more severe
situations). The only constraint is that pfn's must be monotonically
increasing with ->zone_mem_map index. Some non-i386 architectures
virtually remap physical memory to provide the illusion of contiguity
of kernel virtual memory, but in a mature port (e.g. i386) there's high
risk of breaking numerous preexisting drivers.


Bill

2002-10-15 01:13:03

by Martin J. Bligh

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

> MAP_NR_DENSE()-based zone-relative pfn to zone->zone_mem_map index
> remapping is designed to handle this (and actually more severe
> situations). The only constraint is that pfn's must be monotonically
> increasing with ->zone_mem_map index. Some non-i386 architectures
> virtually remap physical memory to provide the illusion of contiguity
> of kernel virtual memory, but in a mature port (e.g. i386) there's high
> risk of breaking numerous preexisting drivers.

As long as you don't need a hole between 0 and 896Mb (s/896/
appropriate defines/) I don't see that would be a problem.
I purged direct usage of mem_map already, and made people use the
macro wrappers.

Basically a "mini config_nonlinear".

M.

2002-10-15 01:12:39

by Martin J. Bligh

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

>> Also, right now, memblks map to nodes in a straightforward manner (1-1
>> on NUMA-Q, the only architecture that has defined them). It will likely
>> look the same on most architectures, too.
>
> Just an FYI: I believe the x440 breaks this assumption.
>
> There are 2 chunks on the first CEC. The current discontig patch for it
> has to drop the second chunk (anything over 3.5G on the first CEC) in
> order to work w/ the existing code. However, that will probably need to
> be addressed at some point, so be aware that this might affect you as
> well.

No, the NUMA code in the kernel doesn't support that anyway.
You have to use zholes_size, and waste some struct pages,
or config_nonlinear. Either way you get 1 memblk.

M.

2002-10-15 01:18:06

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

At some point in the past, jstultz wrote:
>> Just an FYI: I believe the x440 breaks this assumption.
>> There are 2 chunks on the first CEC. The current discontig patch for it
>> has to drop the second chunk (anything over 3.5G on the first CEC) in
>> order to work w/ the existing code. However, that will probably need to
>> be addressed at some point, so be aware that this might affect you as
>> well.

On Mon, Oct 14, 2002 at 06:08:56PM -0700, Martin J. Bligh wrote:
> No, the NUMA code in the kernel doesn't support that anyway.
> You have to use zholes_size, and waste some struct pages,
> or config_nonlinear. Either way you get 1 memblk.

I thought zholes stuff freed the struct pages. Maybe that was done
by hand.


Bill

2002-10-15 01:26:15

by Martin J. Bligh

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

>> No, the NUMA code in the kernel doesn't support that anyway.
>> You have to use zholes_size, and waste some struct pages,
>> or config_nonlinear. Either way you get 1 memblk.
>
> I thought zholes stuff freed the struct pages. Maybe that was done
> by hand.

The only place I see that used in generic code is
calculate_zone_totalpages, free_area_init_core, free_area_init_node,
none of which seem to do that. But cscope might be borked again, I
guess. Must be done in each arch if at all ... which arch did you
think did it?

M.

2002-10-15 01:38:13

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

At some point in the past, I wrote:
>> I thought zholes stuff freed the struct pages. Maybe that was done
>> by hand.

On Mon, Oct 14, 2002 at 06:29:49PM -0700, Martin J. Bligh wrote:
> The only place I see that used in generic code is
> calculate_zone_totalpages, free_area_init_core, free_area_init_node,
> none of which seem to do that. But cscope might be borked again, I
> guess. Must be done in each arch if at all ... which arch did you
> think did it?

Not sure, ISTR something about this going on but I don't see any extant
examples. At any rate, it should be easy to do it by hand, just make
sure there are struct pages tracking the holes in mem_map and free the
space in mem_map that would track the holes.


Bill

2002-10-15 01:55:16

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: [rfc][patch] Memory Binding API v0.3 2.5.41

On Mon, Oct 14, 2002 at 06:29:49PM -0700, Martin J. Bligh wrote:
>> The only place I see that used in generic code is
>> calculate_zone_totalpages, free_area_init_core, free_area_init_node,
>> none of which seem to do that. But cscope might be borked again, I
>> guess. Must be done in each arch if at all ... which arch did you
>> think did it?

On Mon, Oct 14, 2002 at 06:40:23PM -0700, William Lee Irwin III wrote:
> Not sure, ISTR something about this going on but I don't see any extant
> examples. At any rate, it should be easy to do it by hand, just make
> sure there are struct pages tracking the holes in mem_map and free the
> space in mem_map that would track the holes.

And buddy bitmaps and ->node_valid_addr too. Which stands a good chance
of explaining why it broke, if it ever worked.

Bill

2002-10-15 17:17:05

by Eric W. Biederman

Subject: Re: [rfc][patch] Memory Binding API v0.3 2.5.41

Matthew Dobson <[email protected]> writes:

> Eric W. Biederman wrote:
> > Matthew Dobson <[email protected]> writes:
> >>Greetings & Salutations,
> >> Here's a wonderful patch that I know you're all dying for... Memory
> >>Binding! It works just like CPU Affinity (binding) except that it binds a
> >>process's memory allocations (just buddy allocator for now) to specific memory
> >>blocks.
> > Do we want this per numa area or simply per zone? My suspicion is that
> > internally at least we want this per zone.
> I think that per memory block is better.
[snip]
> I'm not fanatically
> opposed to per zone binding, though, and if there is a general agreement that it
> would be better that way, I don't think it would be unreasonably difficult to
> change it.

My only feeling with zones is that it could be useful in the non numa cases,
if it was per zone.

But unless this API is a pure hint, we need at least one specifier that
says writing to swap is o.k.

> > The API doesn't make much sense at the moment.
> Hmm.. That is unfortunate, I'd aimed to make it as simple as possible.

Simple is good only if the proper pieces are connected.

> > 1) You are operating on tasks and not mm's, or preferably vmas.
> Correct. There are plans (somewhere inside my cranium) to allow binding at that
> granularity. For now, per task seemed an appropriate level.

It makes it terribly unpredictable. If you have two threads each bound
to a different location there are race conditions over which area the
memory is allocated from.

> > 2) sys_mem_setbinding does not move the mm to the new binding.
> Also correct. A task may wish to allocate several large data structures from
> one memory area, rebind, do more allocations, rebind, ad nauseam. There are
> plans to have a flag that, if set, would force relocation of all currently
> allocated memory.

Actually the bindings need to stick to the vma or to the struct address_space.
Otherwise you are talking about an allocation hint, as swapping can trivially
undo it and nothing happens when the actual call is made. A hint is a very
different thing from a binding.

And if we stick this to struct address_space for the non-anonymous cases
having a fmem_setbinding(struct fd) that works on files would be a useful
thing as well.
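
Roughly (signature illustrative only):

	asmlinkage long sys_fmem_setbinding(int fd, unsigned long memblks,
					    unsigned int behavior);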

> > 5) mprotect is the more natural model rather than set_cpu_affinity.
> Well, I think that may be true for the API you are imagining (per zone, per
> mm/vma, etc), not the one that I've written.

For a binding with respect to memory I imagine things like mlock(). For
anything else you are talking a future hint to the memory allocators, which
feels less much useful.

Eric