2008-02-25 15:36:00

by David Rientjes

Subject: [patch 1/6] mempolicy: convert MPOL constants to enum

The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
and MPOL_INTERLEAVE, are better declared as part of an enum since they
are sequentially numbered and cannot be combined.

The policy member of struct mempolicy is also converted from type short
to type unsigned short. A negative policy does not have any legitimate
meaning, so it is possible to change its type in preparation for adding
optional mode flags later.

The equivalent member of struct shmem_sb_info is also changed from int
to unsigned short.

For compatibility, the 'policy' formal parameter to get_mempolicy()
remains a pointer to an int:

int get_mempolicy(int *policy, unsigned long *nmask,
		  unsigned long maxnode, unsigned long addr,
		  unsigned long flags);

although the only possible values are within the range of type unsigned short.
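
For illustration only (not part of this patch), a minimal user-space
sketch of reading the mode back through that int, assuming libnuma's
<numaif.h> wrapper for the syscall:

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	int policy;

	/* query the calling task's policy; the mode fits in a short */
	if (get_mempolicy(&policy, NULL, 0, NULL, 0) == 0)
		printf("current mode: %d\n", policy);
	return 0;
}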

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
include/linux/mempolicy.h | 19 ++++++++++---------
include/linux/shmem_fs.h | 2 +-
mm/mempolicy.c | 27 ++++++++++++++++-----------
mm/shmem.c | 9 +++++----
4 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -9,12 +9,13 @@
*/

/* Policies */
-#define MPOL_DEFAULT 0
-#define MPOL_PREFERRED 1
-#define MPOL_BIND 2
-#define MPOL_INTERLEAVE 3
-
-#define MPOL_MAX MPOL_INTERLEAVE
+enum {
+ MPOL_DEFAULT,
+ MPOL_PREFERRED,
+ MPOL_BIND,
+ MPOL_INTERLEAVE,
+ MPOL_MAX, /* always last member of enum */
+};

/* Flags for get_mem_policy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
@@ -62,7 +63,7 @@ struct mm_struct;
*/
struct mempolicy {
atomic_t refcnt;
- short policy; /* See MPOL_* above */
+ unsigned short policy; /* See MPOL_* above */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
@@ -133,7 +134,7 @@ struct shared_policy {
spinlock_t lock;
};

-void mpol_shared_policy_init(struct shared_policy *info, int policy,
+void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
nodemask_t *nodes);
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
@@ -200,7 +201,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
}

static inline void mpol_shared_policy_init(struct shared_policy *info,
- int policy, nodemask_t *nodes)
+ unsigned short policy, nodemask_t *nodes)
{
}

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -34,7 +34,7 @@ struct shmem_sb_info {
uid_t uid; /* Mount uid for root directory */
gid_t gid; /* Mount gid for root directory */
mode_t mode; /* Mount mode for root directory */
- int policy; /* Default NUMA memory alloc policy */
+ unsigned short policy; /* Default NUMA memory alloc policy */
nodemask_t policy_nodes; /* nodemask for preferred and bind */
};

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -114,7 +114,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
const nodemask_t *newmask);

/* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, nodemask_t *nodes)
+static int mpol_check_policy(unsigned short mode, nodemask_t *nodes)
{
int was_empty, is_empty;

@@ -159,6 +159,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
if (!was_empty && is_empty)
return -EINVAL;
break;
+ default:
+ BUG();
}
return 0;
}
@@ -201,7 +203,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
}

/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
{
struct mempolicy *policy;

@@ -235,6 +237,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
return error_code;
}
break;
+ default:
+ BUG();
}
policy->policy = mode;
policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
@@ -479,7 +483,7 @@ static void mpol_set_task_struct_flag(void)
}

/* Set the process memory policy */
-static long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes)
{
struct mempolicy *new;

@@ -781,7 +785,7 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
#endif

static long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask,
+ unsigned short mode, nodemask_t *nmask,
unsigned long flags)
{
struct vm_area_struct *vma;
@@ -791,9 +795,8 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);

- if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+ if (flags & ~(unsigned long)(MPOL_MF_STRICT |
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- || mode > MPOL_MAX)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -826,7 +829,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;

- pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
mode, nmask ? nodes_addr(*nmask)[0] : -1);

down_write(&mm->mmap_sem);
@@ -927,6 +930,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
nodemask_t nodes;
int err;

+ if (mode >= MPOL_MAX)
+ return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
@@ -940,7 +945,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
int err;
nodemask_t nodes;

- if (mode < 0 || mode > MPOL_MAX)
+ if (mode < 0 || mode >= MPOL_MAX)
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
@@ -1206,7 +1211,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
*/
unsigned slab_node(struct mempolicy *policy)
{
- int pol = policy ? policy->policy : MPOL_DEFAULT;
+ unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;

switch (pol) {
case MPOL_INTERLEAVE:
@@ -1634,7 +1639,7 @@ restart:
return 0;
}

-void mpol_shared_policy_init(struct shared_policy *info, int policy,
+void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
nodemask_t *policy_nodes)
{
info->root = RB_ROOT;
@@ -1853,7 +1858,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
char *p = buffer;
int l;
nodemask_t nodes;
- int mode = pol ? pol->policy : MPOL_DEFAULT;
+ unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;

switch (mode) {
case MPOL_DEFAULT:
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1082,7 +1082,8 @@ redirty:

#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static int shmem_parse_mpol(char *value, unsigned short *policy,
+ nodemask_t *policy_nodes)
{
char *nodelist = strchr(value, ':');
int err = 1;
@@ -1131,7 +1132,7 @@ out:
return err;
}

-static void shmem_show_mpol(struct seq_file *seq, int policy,
+static void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
const nodemask_t policy_nodes)
{
char *policy_string;
@@ -1200,13 +1201,13 @@ static struct page *shmem_alloc_page(gfp_t gfp,
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
-static inline int shmem_parse_mpol(char *value, int *policy,
+static inline int shmem_parse_mpol(char *value, unsigned short *policy,
nodemask_t *policy_nodes)
{
return 1;
}

-static inline void shmem_show_mpol(struct seq_file *seq, int policy,
+static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
const nodemask_t policy_nodes)
{
}


2008-02-25 15:35:37

by David Rientjes

Subject: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses
the node remap when the policy is rebound.

Adds another member to struct mempolicy, nodemask_t user_nodemask, as
part of a union with cpuset_mems_allowed:

struct mempolicy {
	...
	union {
		nodemask_t cpuset_mems_allowed;
		nodemask_t user_nodemask;
	} w;
}

that stores the nodemask that the user passed when he or she created
the mempolicy via set_mempolicy() or mbind(). When using
MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's
passed nodemask intersected with the VMA or task's allowed nodes is always
used when determining the preferred node, building the MPOL_BIND zonelist,
or creating the interleave nodemask. This happens whenever the policy is
rebound, including when a task's cpuset assignment changes or the cpuset's
mems are changed.
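
For example (a hypothetical user-space sketch, not part of this patch;
the flag value is as defined by this series), a task could request an
interleave policy whose nodemask survives cpuset rebinds unmodified:

#include <numaif.h>

#ifndef MPOL_F_STATIC_NODES
#define MPOL_F_STATIC_NODES	(1 << 15)	/* from this series */
#endif

int set_static_interleave(void)
{
	unsigned long nmask = 0x0c;	/* nodes 2-3 */

	/*
	 * Interleave over nodes 2-3; on rebind, only the intersection
	 * of this static nodemask with the new mems_allowed is used.
	 */
	return set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
			     &nmask, 8);
}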

This creates an interesting side-effect in that it allows the mempolicy
"intent" to lie dormant and uneffected until it has access to the node(s)
that it desires. For example, if you currently ask for an interleaved
policy over a set of nodes that you do not have access to, the mempolicy
is not created and the task continues to use the previous policy. With
this change, however, it is possible to create the same mempolicy; it
only takes effect when access to nodes in the nodemask is acquired.

It is also possible to mount tmpfs with the static nodemask behavior when
specifying a node or nodemask. To do this, simply add "=static"
immediately following the mempolicy mode at mount time:

mount -o remount,mpol=interleave=static:1-3

Also removes mpol_check_policy() and folds its logic into mpol_new() since
it is now obsolete. The unused vma_mpol_equal() is also removed.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
include/linux/mempolicy.h | 11 ++-
mm/mempolicy.c | 208 ++++++++++++++++++++++++---------------------
mm/shmem.c | 2 +
3 files changed, 119 insertions(+), 102 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -24,11 +24,13 @@ enum {
};

/* Flags for set_mempolicy */
+#define MPOL_F_STATIC_NODES (1 << 15)
+
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
-#define MPOL_MODE_FLAGS (0)
+#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES)

/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
@@ -84,7 +86,10 @@ struct mempolicy {
nodemask_t nodes; /* interleave */
/* undefined for default */
} v;
- nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
+ union {
+ nodemask_t cpuset_mems_allowed; /* relative to these nodes */
+ nodemask_t user_nodemask; /* nodemask passed by user */
+ } w;
};

/*
@@ -123,7 +128,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
return 1;
return __mpol_equal(a, b);
}
-#define vma_mpol_equal(a,b) mpol_equal(vma_policy(a), vma_policy(b))

/* Could later add inheritance of the process policy here. */

@@ -188,7 +192,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
return 1;
}
-#define vma_mpol_equal(a,b) 1

#define mpol_set_vma_default(vma) do {} while(0)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,58 +113,6 @@ struct mempolicy default_policy = {
static void mpol_rebind_policy(struct mempolicy *pol,
const nodemask_t *newmask);

-/* Do sanity checking on a policy */
-static int mpol_check_policy(unsigned short mode, nodemask_t *nodes)
-{
- int was_empty, is_empty;
-
- if (!nodes)
- return 0;
-
- /*
- * "Contextualize" the in-coming nodemast for cpusets:
- * Remember whether in-coming nodemask was empty, If not,
- * restrict the nodes to the allowed nodes in the cpuset.
- * This is guaranteed to be a subset of nodes with memory.
- */
- cpuset_update_task_memory_state();
- is_empty = was_empty = nodes_empty(*nodes);
- if (!was_empty) {
- nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
- is_empty = nodes_empty(*nodes); /* after "contextualization" */
- }
-
- switch (mode) {
- case MPOL_DEFAULT:
- /*
- * require caller to specify an empty nodemask
- * before "contextualization"
- */
- if (!was_empty)
- return -EINVAL;
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- /*
- * require at least 1 valid node after "contextualization"
- */
- if (is_empty)
- return -EINVAL;
- break;
- case MPOL_PREFERRED:
- /*
- * Did caller specify invalid nodes?
- * Don't silently accept this as "local allocation".
- */
- if (!was_empty && is_empty)
- return -EINVAL;
- break;
- default:
- BUG();
- }
- return 0;
-}
-
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
@@ -202,40 +150,48 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
return zl;
}

+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
+{
+ return !!(pol->flags & MPOL_F_STATIC_NODES);
+}
+
/* Create a new policy */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
+ nodemask_t cpuset_context_nmask;
+ void *error = ERR_PTR(-EINVAL);

pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

if (mode == MPOL_DEFAULT)
- return NULL;
+ return (nodes && nodes_weight(*nodes)) ? error : NULL;
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
+ cpuset_update_task_memory_state();
+ nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
switch (mode) {
case MPOL_INTERLEAVE:
- policy->v.nodes = *nodes;
- if (nodes_weight(policy->v.nodes) == 0) {
- kmem_cache_free(policy_cache, policy);
- return ERR_PTR(-EINVAL);
- }
+ if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
+ goto free;
+ policy->v.nodes = cpuset_context_nmask;
break;
case MPOL_PREFERRED:
- policy->v.preferred_node = first_node(*nodes);
+ policy->v.preferred_node = first_node(cpuset_context_nmask);
if (policy->v.preferred_node >= MAX_NUMNODES)
- policy->v.preferred_node = -1;
+ goto free;
break;
case MPOL_BIND:
- policy->v.zonelist = bind_zonelist(nodes);
+ if (nodes_empty(*nodes))
+ goto free;
+ policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
if (IS_ERR(policy->v.zonelist)) {
- void *error_code = policy->v.zonelist;
- kmem_cache_free(policy_cache, policy);
- return error_code;
+ error = policy->v.zonelist;
+ goto free;
}
break;
default:
@@ -243,8 +199,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
}
policy->policy = mode;
policy->flags = flags;
- policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
+ if (mpol_store_user_nodemask(policy))
+ policy->w.user_nodemask = *nodes;
+ else
+ policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
+
+free:
+ kmem_cache_free(policy_cache, policy);
+ return error;
}

static void gather_stats(struct page *, void *, int pte_dirty);
@@ -490,15 +453,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
{
struct mempolicy *new;

- if (mpol_check_policy(mode, nodes))
- return -EINVAL;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
mpol_set_task_struct_flag();
- if (new && new->policy == MPOL_INTERLEAVE)
+ if (new && new->policy == MPOL_INTERLEAVE &&
+ nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
return 0;
}
@@ -818,9 +780,6 @@ static long do_mbind(unsigned long start, unsigned long len,
if (end == start)
return 0;

- if (mpol_check_policy(mode, nmask))
- return -EINVAL;
-
new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
@@ -1211,7 +1170,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
next = next_node(nid, policy->v.nodes);
if (next >= MAX_NUMNODES)
next = first_node(policy->v.nodes);
- me->il_next = next;
+ if (next < MAX_NUMNODES)
+ me->il_next = next;
return nid;
}

@@ -1249,10 +1209,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
- unsigned target = (unsigned)off % nnodes;
+ unsigned target;
int c;
int nid = -1;

+ if (!nnodes)
+ return numa_node_id();
+ target = (unsigned int)off % nnodes;
c = 0;
do {
nid = next_node(nid, pol->v.nodes);
@@ -1457,6 +1420,16 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
return new;
}

+static int mpol_match_intent(const struct mempolicy *a,
+ const struct mempolicy *b)
+{
+ if (a->flags != b->flags)
+ return 0;
+ if (!mpol_store_user_nodemask(a))
+ return 1;
+ return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
+}
+
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -1464,6 +1437,8 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return 0;
if (a->policy != b->policy)
return 0;
+ if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
+ return 0;
switch (a->policy) {
case MPOL_DEFAULT:
return 1;
@@ -1770,54 +1745,79 @@ void numa_default_policy(void)
static void mpol_rebind_policy(struct mempolicy *pol,
const nodemask_t *newmask)
{
- nodemask_t *mpolmask;
nodemask_t tmp;
+ int static_nodes;

if (!pol)
return;
- mpolmask = &pol->cpuset_mems_allowed;
- if (nodes_equal(*mpolmask, *newmask))
+ static_nodes = pol->flags & MPOL_F_STATIC_NODES;
+ if (!mpol_store_user_nodemask(pol) &&
+ nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;

switch (pol->policy) {
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
- nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
+ if (static_nodes)
+ nodes_and(tmp, pol->w.user_nodemask, *newmask);
+ else {
+ nodes_remap(tmp, pol->v.nodes,
+ pol->w.cpuset_mems_allowed, *newmask);
+ pol->w.cpuset_mems_allowed = *newmask;
+ }
pol->v.nodes = tmp;
- *mpolmask = *newmask;
- current->il_next = node_remap(current->il_next,
- *mpolmask, *newmask);
+ if (!node_isset(current->il_next, tmp)) {
+ current->il_next = next_node(current->il_next, tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = first_node(tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = numa_node_id();
+ }
break;
case MPOL_PREFERRED:
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- *mpolmask, *newmask);
- *mpolmask = *newmask;
+ if (static_nodes) {
+ int node = first_node(pol->w.user_nodemask);
+
+ if (node_isset(node, *newmask))
+ pol->v.preferred_node = node;
+ else
+ pol->v.preferred_node = -1;
+ } else {
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ pol->w.cpuset_mems_allowed, *newmask);
+ pol->w.cpuset_mems_allowed = *newmask;
+ }
break;
case MPOL_BIND: {
- nodemask_t nodes;
- struct zone **z;
struct zonelist *zonelist;

- nodes_clear(nodes);
- for (z = pol->v.zonelist->zones; *z; z++)
- node_set(zone_to_nid(*z), nodes);
- nodes_remap(tmp, nodes, *mpolmask, *newmask);
- nodes = tmp;
-
- zonelist = bind_zonelist(&nodes);
+ if (static_nodes)
+ nodes_and(tmp, pol->w.user_nodemask, *newmask);
+ else {
+ nodemask_t nodes;
+ struct zone **z;
+
+ nodes_clear(nodes);
+ for (z = pol->v.zonelist->zones; *z; z++)
+ node_set(zone_to_nid(*z), nodes);
+ nodes_remap(tmp, nodes, pol->w.cpuset_mems_allowed,
+ *newmask);
+ pol->w.cpuset_mems_allowed = *newmask;
+ }

/* If no mem, then zonelist is NULL and we keep old zonelist.
* If that old zonelist has no remaining mems_allowed nodes,
* then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
*/
-
- if (!IS_ERR(zonelist)) {
- /* Good - got mem - substitute new zonelist */
- kfree(pol->v.zonelist);
- pol->v.zonelist = zonelist;
+ if (nodes_weight(tmp)) {
+ zonelist = bind_zonelist(&tmp);
+ if (!IS_ERR(zonelist)) {
+ /* Good - got mem - substitute new zonelist */
+ kfree(pol->v.zonelist);
+ pol->v.zonelist = zonelist;
+ }
}
- *mpolmask = *newmask;
break;
}
default:
@@ -1870,6 +1870,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
int l;
nodemask_t nodes;
unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
+ unsigned short flags = pol ? pol->flags : 0;

switch (mode) {
case MPOL_DEFAULT:
@@ -1901,6 +1902,17 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
strcpy(p, policy_types[mode]);
p += l;

+ if (flags) {
+ int need_bar = 0;
+
+ if (buffer + maxlen < p + 2)
+ return -ENOSPC;
+ *p++ = '=';
+
+ if (flags & MPOL_F_STATIC_NODES)
+ p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
+ }
+
if (!nodes_empty(nodes)) {
if (buffer + maxlen < p + 2)
return -ENOSPC;
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1129,6 +1129,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
err = 0;
}
if (flags) {
+ if (!strcmp(flags, "static"))
+ *mode_flags |= MPOL_F_STATIC_NODES;
}
out:
/* Restore string for error message */

2008-02-25 15:36:25

by David Rientjes

Subject: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
nodemasks passed via set_mempolicy() or mbind() should be considered
relative to the current task's mems_allowed.

When the mempolicy is created, the passed nodemask is folded and mapped
onto the current task's mems_allowed. For example, consider a task
using set_mempolicy() to pass MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
with a nodemask of 1-3. If current's mems_allowed is 4-7, the resulting
nodemask is 5-7 (the second, third, and fourth nodes of mems_allowed).
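
That example could be requested as follows (a hypothetical user-space
sketch, not part of this patch; the flag value is as defined by this
series):

#include <numaif.h>

#ifndef MPOL_F_RELATIVE_NODES
#define MPOL_F_RELATIVE_NODES	(1 << 14)	/* from this series */
#endif

int set_relative_interleave(void)
{
	unsigned long nmask = 0x0e;	/* relative nodes 1-3 */

	/*
	 * With mems_allowed of 4-7, this interleaves over nodes 5-7
	 * (the second, third, and fourth allowed nodes).
	 */
	return set_mempolicy(MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES,
			     &nmask, 8);
}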

If the same task is attached to a cpuset, the mempolicy nodemask is
rebound each time the mems are changed. Some possible rebinds and
results are:

	mems		result
	1-3		1-3
	1-7		2-4
	1,5-6		1,5-6
	1,5-7		5-7

Likewise, the zonelist built for MPOL_BIND acts on the set of zones
assigned to the resultant nodemask from the relative remap.

In the MPOL_PREFERRED case, the preferred node is remapped from the
currently effective nodemask to the relative nodemask.

This mempolicy mode flag was conceived of by Paul Jackson <[email protected]>.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
include/linux/mempolicy.h | 3 ++-
mm/mempolicy.c | 40 ++++++++++++++++++++++++++++++++++++++--
mm/shmem.c | 6 ++++++
3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -25,12 +25,13 @@ enum {

/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
+#define MPOL_F_RELATIVE_NODES (1 << 14)

/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
-#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES)
+#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)

/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -152,7 +152,18 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
- return !!(pol->flags & MPOL_F_STATIC_NODES);
+ return !!(pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES));
+}
+
+static nodemask_t mpol_relative_nodemask(const nodemask_t *orig,
+ const nodemask_t *rel)
+{
+ nodemask_t ret;
+ nodemask_t tmp;
+
+ nodes_fold(tmp, *orig, nodes_weight(*rel));
+ nodes_onto(ret, tmp, *rel);
+ return ret;
}

/* Create a new policy */
@@ -173,7 +184,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
cpuset_update_task_memory_state();
- nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
+ if (flags & MPOL_F_RELATIVE_NODES)
+ cpuset_context_nmask = mpol_relative_nodemask(nodes,
+ &cpuset_current_mems_allowed);
+ else
+ nodes_and(cpuset_context_nmask, *nodes,
+ cpuset_current_mems_allowed);
switch (mode) {
case MPOL_INTERLEAVE:
if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
@@ -898,6 +914,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
mode &= ~MPOL_MODE_FLAGS;
if (mode >= MPOL_MAX)
return -EINVAL;
+ if ((mode_flags & MPOL_F_STATIC_NODES) &&
+ (mode_flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
@@ -916,6 +935,8 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
mode &= ~MPOL_MODE_FLAGS;
if ((unsigned int)mode >= MPOL_MAX)
return -EINVAL;
+ if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
@@ -1747,10 +1768,12 @@ static void mpol_rebind_policy(struct mempolicy *pol,
{
nodemask_t tmp;
int static_nodes;
+ int relative_nodes;

if (!pol)
return;
static_nodes = pol->flags & MPOL_F_STATIC_NODES;
+ relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES;
if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -1761,6 +1784,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,
case MPOL_INTERLEAVE:
if (static_nodes)
nodes_and(tmp, pol->w.user_nodemask, *newmask);
+ else if (relative_nodes)
+ tmp = mpol_relative_nodemask(&pol->w.user_nodemask,
+ newmask);
else {
nodes_remap(tmp, pol->v.nodes,
pol->w.cpuset_mems_allowed, *newmask);
@@ -1783,6 +1809,11 @@ static void mpol_rebind_policy(struct mempolicy *pol,
pol->v.preferred_node = node;
else
pol->v.preferred_node = -1;
+ } else if (relative_nodes) {
+ tmp = mpol_relative_nodemask(&pol->w.user_nodemask,
+ newmask);
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ pol->w.cpuset_mems_allowed, tmp);
} else {
pol->v.preferred_node = node_remap(pol->v.preferred_node,
pol->w.cpuset_mems_allowed, *newmask);
@@ -1794,6 +1825,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,

if (static_nodes)
nodes_and(tmp, pol->w.user_nodemask, *newmask);
+ else if (relative_nodes)
+ tmp = mpol_relative_nodemask(&pol->w.user_nodemask,
+ newmask);
else {
nodemask_t nodes;
struct zone **z;
@@ -1911,6 +1945,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)

if (flags & MPOL_F_STATIC_NODES)
p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
+ if (flags & MPOL_F_RELATIVE_NODES)
+ p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
}

if (!nodes_empty(nodes)) {
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1131,6 +1131,12 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
if (flags) {
if (!strcmp(flags, "static"))
*mode_flags |= MPOL_F_STATIC_NODES;
+ if (!strcmp(flags, "relative"))
+ *mode_flags |= MPOL_F_RELATIVE_NODES;
+
+ if ((*mode_flags & MPOL_F_STATIC_NODES) &&
+ (*mode_flags & MPOL_F_RELATIVE_NODES))
+ err = 1;
}
out:
/* Restore string for error message */

2008-02-25 15:36:41

by David Rientjes

Subject: [patch 2/6] mempolicy: support optional mode flags

With the evolution of mempolicies, it is necessary to support mempolicy
mode flags that specify how the policy shall behave in certain
circumstances. The most immediate need for mode flag support is to
suppress remapping the nodemask of a policy at the time of rebind.

Both the mempolicy mode and flags are passed by the user in the 'int
policy' formal of either the set_mempolicy() or mbind() syscall. A new
constant, MPOL_MODE_FLAGS, represents the union of legal optional flags
that may be passed as part of this int. Mempolicies that include illegal
flags as part of their policy are rejected as invalid.

An additional member to struct mempolicy is added to support the mode
flags:

struct mempolicy {
	...
	unsigned short policy;
	unsigned short flags;
}

The splitting of the 'int' actual passed by the user is done in
sys_set_mempolicy() and sys_mbind() for their respective syscalls. This
is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the
syscall if there are additional flags, and storing it in the new 'flags'
member of struct mempolicy. The intersection of the actual with
~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all
current users of pol->policy remain unchanged.

The union of the policy mode and optional mode flags is passed back to
the user in get_mempolicy().

This combination of mode and flags within the same actual does not break
userspace code that relies on get_mempolicy(&policy, ...) and either

switch (policy) {
case MPOL_BIND:
	...
case MPOL_INTERLEAVE:
	...
};

statements or

if (policy == MPOL_INTERLEAVE) {
	...
}

statements. Such statements only stop working if the application itself
begins passing optional mode flags to set_mempolicy() or mbind(). An
application that does start using optional mode flags will need to mask
them off the returned policy in switch and conditional statements that
test only the mode.
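
For example (a hypothetical user-space sketch; the flag value is as
defined later in this series), such an application would mask the flag
back off before testing the mode:

#include <numaif.h>

#ifndef MPOL_F_STATIC_NODES
#define MPOL_F_STATIC_NODES	(1 << 15)	/* from this series */
#endif

void check_mode(void)
{
	int policy;

	if (get_mempolicy(&policy, NULL, 0, NULL, 0))
		return;
	/* strip the optional mode flags before testing the mode */
	switch (policy & ~MPOL_F_STATIC_NODES) {
	case MPOL_INTERLEAVE:
		/* ... */
		break;
	}
}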

An additional member is also added to struct shmem_sb_info to store the
optional mode flags.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
fs/hugetlbfs/inode.c | 2 +-
include/linux/mempolicy.h | 20 +++++++++++++++--
include/linux/shmem_fs.h | 1 +
mm/mempolicy.c | 51 +++++++++++++++++++++++++++-----------------
mm/shmem.c | 24 ++++++++++++++-------
5 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
INIT_LIST_HEAD(&inode->i_mapping->private_list);
info = HUGETLBFS_I(inode);
- mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
+ mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -8,6 +8,12 @@
* Copyright 2003,2004 Andi Kleen SuSE Labs
*/

+/*
+ * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
+ * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
+ * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
+ */
+
/* Policies */
enum {
MPOL_DEFAULT,
@@ -17,7 +23,14 @@ enum {
MPOL_MAX, /* always last member of enum */
};

-/* Flags for get_mem_policy */
+/* Flags for set_mempolicy */
+/*
+ * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
+ * either set_mempolicy() or mbind().
+ */
+#define MPOL_MODE_FLAGS (0)
+
+/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
@@ -64,6 +77,7 @@ struct mm_struct;
struct mempolicy {
atomic_t refcnt;
unsigned short policy; /* See MPOL_* above */
+ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
@@ -135,7 +149,7 @@ struct shared_policy {
};

void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
- nodemask_t *nodes);
+ unsigned short flags, nodemask_t *nodes);
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
struct mempolicy *new);
@@ -201,7 +215,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
}

static inline void mpol_shared_policy_init(struct shared_policy *info,
- unsigned short policy, nodemask_t *nodes)
+ unsigned short flags, unsigned short policy, nodemask_t *nodes)
{
}

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -35,6 +35,7 @@ struct shmem_sb_info {
gid_t gid; /* Mount gid for root directory */
mode_t mode; /* Mount mode for root directory */
unsigned short policy; /* Default NUMA memory alloc policy */
+ unsigned short flags; /* Optional mempolicy flags */
nodemask_t policy_nodes; /* nodemask for preferred and bind */
};

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -203,12 +203,13 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
}

/* Create a new policy */
-static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
{
struct mempolicy *policy;

- pr_debug("setting mode %d nodes[0] %lx\n",
- mode, nodes ? nodes_addr(*nodes)[0] : -1);
+ pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

if (mode == MPOL_DEFAULT)
return NULL;
@@ -241,6 +242,7 @@ static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
BUG();
}
policy->policy = mode;
+ policy->flags = flags;
policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
}
@@ -483,13 +485,14 @@ static void mpol_set_task_struct_flag(void)
}

/* Set the process memory policy */
-static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
{
struct mempolicy *new;

if (mpol_check_policy(mode, nodes))
return -EINVAL;
- new = mpol_new(mode, nodes);
+ new = mpol_new(mode, flags, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
@@ -595,7 +598,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
goto out;
}
} else
- *policy = pol->policy;
+ *policy = pol->policy | pol->flags;

if (vma) {
up_read(&current->mm->mmap_sem);
@@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
#endif

static long do_mbind(unsigned long start, unsigned long len,
- unsigned short mode, nodemask_t *nmask,
- unsigned long flags)
+ unsigned short mode, unsigned short mode_flags,
+ nodemask_t *nmask, unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
@@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (mpol_check_policy(mode, nmask))
return -EINVAL;

- new = mpol_new(mode, nmask);
+ new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);

@@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;

- pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
- mode, nmask ? nodes_addr(*nmask)[0] : -1);
+ pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+ start, start + len, mode, mode_flags,
+ nmask ? nodes_addr(*nmask)[0] : -1);

down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask,
@@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
{
nodemask_t nodes;
int err;
+ unsigned short mode_flags;

+ mode_flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
if (mode >= MPOL_MAX)
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, &nodes, flags);
+ return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
@@ -944,13 +951,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
{
int err;
nodemask_t nodes;
+ unsigned short flags;

- if (mode < 0 || mode >= MPOL_MAX)
+ flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if ((unsigned int)mode >= MPOL_MAX)
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, &nodes);
+ return do_set_mempolicy(mode, flags, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1640,7 +1650,7 @@ restart:
}

void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
- nodemask_t *policy_nodes)
+ unsigned short flags, nodemask_t *policy_nodes)
{
info->root = RB_ROOT;
spin_lock_init(&info->lock);
@@ -1649,7 +1659,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
struct mempolicy *newpol;

/* Falls back to MPOL_DEFAULT on any error */
- newpol = mpol_new(policy, policy_nodes);
+ newpol = mpol_new(policy, flags, policy_nodes);
if (!IS_ERR(newpol)) {
/* Create pseudo-vma that contains just the policy */
struct vm_area_struct pvma;
@@ -1670,9 +1680,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);

- pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
+ pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
vma->vm_pgoff,
- sz, npol? npol->policy : -1,
+ sz, npol ? npol->policy : -1,
+ npol ? npol->flags : -1,
npol ? nodes_addr(npol->v.nodes)[0] : -1);

if (npol) {
@@ -1745,14 +1756,14 @@ void __init numa_policy_init(void)
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);

- if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
+ if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
- do_set_mempolicy(MPOL_DEFAULT, NULL);
+ do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/* Migrate a policy to a different set of nodes */
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1083,9 +1083,10 @@ redirty:
#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
static int shmem_parse_mpol(char *value, unsigned short *policy,
- nodemask_t *policy_nodes)
+ unsigned short *mode_flags, nodemask_t *policy_nodes)
{
char *nodelist = strchr(value, ':');
+ char *flags = strchr(value, '=');
int err = 1;

if (nodelist) {
@@ -1096,6 +1097,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
goto out;
}
+ if (flags)
+ *flags++ = '\0';
if (!strcmp(value, "default")) {
*policy = MPOL_DEFAULT;
/* Don't allow a nodelist */
@@ -1125,6 +1128,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
*policy_nodes = node_states[N_HIGH_MEMORY];
err = 0;
}
+ if (flags) {
+ }
out:
/* Restore string for error message */
if (nodelist)
@@ -1133,7 +1138,7 @@ out:
}

static void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
- const nodemask_t policy_nodes)
+ unsigned short flags, const nodemask_t policy_nodes)
{
char *policy_string;

@@ -1202,13 +1207,13 @@ static struct page *shmem_alloc_page(gfp_t gfp,
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
static inline int shmem_parse_mpol(char *value, unsigned short *policy,
- nodemask_t *policy_nodes)
+ unsigned short *flags, nodemask_t *policy_nodes)
{
return 1;
}

static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
- const nodemask_t policy_nodes)
+ unsigned short flags, const nodemask_t policy_nodes)
{
}
#endif /* CONFIG_TMPFS */
@@ -1578,7 +1583,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
mpol_shared_policy_init(&info->policy, sbinfo->policy,
- &sbinfo->policy_nodes);
+ sbinfo->flags, &sbinfo->policy_nodes);
break;
case S_IFDIR:
inc_nlink(inode);
@@ -1592,7 +1597,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
* Must not load anything in the rbtree,
* mpol_free_shared_policy will not be called.
*/
- mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+ mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
NULL);
break;
}
@@ -2209,7 +2214,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
if (shmem_parse_mpol(value, &sbinfo->policy,
- &sbinfo->policy_nodes))
+ &sbinfo->flags, &sbinfo->policy_nodes))
goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2261,6 +2266,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
sbinfo->policy = config.policy;
+ sbinfo->flags = config.flags;
sbinfo->policy_nodes = config.policy_nodes;
out:
spin_unlock(&sbinfo->stat_lock);
@@ -2282,7 +2288,8 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_printf(seq, ",uid=%u", sbinfo->uid);
if (sbinfo->gid != 0)
seq_printf(seq, ",gid=%u", sbinfo->gid);
- shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes);
+ shmem_show_mpol(seq, sbinfo->policy, sbinfo->flags,
+ sbinfo->policy_nodes);
return 0;
}
#endif /* CONFIG_TMPFS */
@@ -2313,6 +2320,7 @@ static int shmem_fill_super(struct super_block *sb,
sbinfo->uid = current->fsuid;
sbinfo->gid = current->fsgid;
sbinfo->policy = MPOL_DEFAULT;
+ sbinfo->flags = 0;
sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
sb->s_fs_info = sbinfo;

2008-02-25 15:36:57

by David Rientjes

Subject: [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations

From: Paul Jackson <[email protected]>

The following adds two more bitmap operators, bitmap_onto() and
bitmap_fold(), with the usual cpumask and nodemask wrappers.

The bitmap_onto() operator computes one bitmap relative to
another. If the n-th bit in the origin mask is set, then the
m-th bit of the destination mask will be set, where m is
the position of the n-th set bit in the relative mask.

The bitmap_fold() operator folds a bitmap into a second that
has bit m set iff the input bitmap has some bit n set, where
m == n mod sz, for the specified sz value.

There are two substantive changes between this patch and its
predecessor bitmap_relative:
1) Renamed bitmap_relative() to be bitmap_onto().
2) Added bitmap_fold().

The essential motivation for bitmap_onto() is to provide
a mechanism for converting a cpuset-relative CPU or Node
mask to an absolute mask. Cpuset relative masks are written
as if the current task were in a cpuset whose CPUs or
Nodes were just the consecutive ones numbered 0..N-1, for
some N. The bitmap_onto() operator is provided in anticipation
of adding support for the first such cpuset relative mask,
by the mbind() and set_mempolicy() system calls, using a
planned flag of MPOL_F_RELATIVE_NODES. These bitmap operators
(and their nodemask wrappers, in particular) will be used in
code that converts the user specified cpuset relative memory
policy to a specific system node numbered policy, given the
current mems_allowed of the task's cpuset.

Such cpuset relative mempolicies will address two deficiencies
of the existing interface between cpusets and mempolicies:
1) A task cannot at present reliably establish a cpuset
relative mempolicy because there is an essential race
condition, in that the task's cpuset may be changed in
between the time the task can query its cpuset placement,
and the time the task can issue the applicable mbind or
set_mempolicy system call.
2) A task cannot at present establish what cpuset relative
mempolicy it would like to have, if it is in a smaller
cpuset than it might have mempolicy preferences for,
because the existing interface only allows specifying
mempolicies for nodes currently allowed by the cpuset.

Cpuset relative mempolicies are useful for tasks that don't
distinguish particularly between one CPU or Node and another,
but only between how many of each are allowed, and the proper
placement of threads and memory pages on the various CPUs and
Nodes available.

The motivation for the added bitmap_fold() can be seen in
the following example.

Let's say an application has specified some mempolicies
that presume 16 memory nodes, including say a mempolicy that
specified MPOL_F_RELATIVE_NODES (cpuset relative) nodes 12-15.
Then let's say that application is crammed into a cpuset that only
has 8 memory nodes, 0-7. If one just uses bitmap_onto(), this
mempolicy, mapped to that cpuset, would ignore the requested
relative nodes above 7, leaving it empty of nodes. That's not
good; better to fold the higher nodes down, so that some nodes
are included in the resulting mapped mempolicy. In this case,
the mempolicy nodes 12-15 are taken modulo 8 (the weight of the
mems_allowed of the confining cpuset), resulting in a mempolicy
specifying nodes 4-7.
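
In kernel terms, the example above is just the two new nodemask
wrappers composed (a sketch; user_nodes and mems_allowed stand in for
the policy's stored nodemask and the cpuset's memory placement):

#include <linux/nodemask.h>

static void map_relative(const nodemask_t *user_nodes,
			 const nodemask_t *mems_allowed, nodemask_t *result)
{
	nodemask_t tmp;

	/* fold relative nodes 12-15 modulo the 8 allowed nodes -> 4-7 */
	nodes_fold(tmp, *user_nodes, nodes_weight(*mems_allowed));
	/* map ordinals 4-7 onto the 5th..8th set bits of mems_allowed */
	nodes_onto(*result, tmp, *mems_allowed);
}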

Signed-off-by: Paul Jackson <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
include/linux/bitmap.h | 6 ++
include/linux/cpumask.h | 22 ++++++-
include/linux/nodemask.h | 22 ++++++-
lib/bitmap.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 206 insertions(+), 2 deletions(-)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -46,6 +46,8 @@
* bitmap_shift_left(dst, src, n, nbits) *dst = *src << n
* bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
* bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit)
+ * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap
+ * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz
* bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf
* bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf
@@ -120,6 +122,10 @@ extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new, int bits);
extern int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
+extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+ const unsigned long *relmap, int bits);
+extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+ int sz, int bits);
extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -14,6 +14,8 @@
* bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
* For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c
* For details of cpus_remap(), see bitmap_remap in lib/bitmap.c.
+ * For details of cpus_onto(), see bitmap_onto in lib/bitmap.c.
+ * For details of cpus_fold(), see bitmap_fold in lib/bitmap.c.
*
* The available cpumask operations are:
*
@@ -53,7 +55,9 @@
* int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
* int cpulist_parse(buf, map) Parse ascii string as cpulist
* int cpu_remap(oldbit, old, new) newbit = map(old, new)(oldbit)
- * int cpus_remap(dst, src, old, new) *dst = map(old, new)(src)
+ * void cpus_remap(dst, src, old, new) *dst = map(old, new)(src)
+ * void cpus_onto(dst, orig, relmap) *dst = orig relative to relmap
+ * void cpus_fold(dst, orig, sz) dst bits = orig bits mod sz
*
* for_each_cpu_mask(cpu, mask) for-loop cpu over mask
*
@@ -311,6 +315,22 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
}

+#define cpus_onto(dst, orig, relmap) \
+ __cpus_onto(&(dst), &(orig), &(relmap), NR_CPUS)
+static inline void __cpus_onto(cpumask_t *dstp, const cpumask_t *origp,
+ const cpumask_t *relmapp, int nbits)
+{
+ bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
+}
+
+#define cpus_fold(dst, orig, sz) \
+ __cpus_fold(&(dst), &(orig), sz, NR_CPUS)
+static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
+ int sz, int nbits)
+{
+ bitmap_fold(dstp->bits, origp->bits, sz, nbits);
+}
+
#if NR_CPUS > 1
#define for_each_cpu_mask(cpu, mask) \
for ((cpu) = first_cpu(mask); \
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -14,6 +14,8 @@
* bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
* For details of node_remap(), see bitmap_bitremap in lib/bitmap.c.
* For details of nodes_remap(), see bitmap_remap in lib/bitmap.c.
+ * For details of nodes_onto(), see bitmap_onto in lib/bitmap.c.
+ * For details of nodes_fold(), see bitmap_fold in lib/bitmap.c.
*
* The available nodemask operations are:
*
@@ -55,7 +57,9 @@
* int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
* int nodelist_parse(buf, map) Parse ascii string as nodelist
* int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit)
- * int nodes_remap(dst, src, old, new) *dst = map(old, new)(dst)
+ * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src)
+ * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap
+ * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz
*
* for_each_node_mask(node, mask) for-loop node over mask
*
@@ -326,6 +330,22 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
}

+#define nodes_onto(dst, orig, relmap) \
+ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
+static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
+ const nodemask_t *relmapp, int nbits)
+{
+ bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
+}
+
+#define nodes_fold(dst, orig, sz) \
+ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES)
+static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
+ int sz, int nbits)
+{
+ bitmap_fold(dstp->bits, origp->bits, sz, nbits);
+}
+
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask) \
for ((node) = first_node(mask); \
diff --git a/lib/bitmap.c b/lib/bitmap.c
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -698,6 +698,164 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
}
EXPORT_SYMBOL(bitmap_bitremap);

+/**
+ * bitmap_onto - translate one bitmap relative to another
+ * @dst: resulting translated bitmap
+ * @orig: original untranslated bitmap
+ * @relmap: bitmap relative to which translated
+ * @bits: number of bits in each of these bitmaps
+ *
+ * Set the n-th bit of @dst iff there exists some m such that the
+ * n-th bit of @relmap is set, the m-th bit of @orig is set, and
+ * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
+ * (If you understood the previous sentence the first time you
+ * read it, you're overqualified for your current job.)
+ *
+ * In other words, @orig is mapped onto (surjectively) @dst,
+ * using the map { <n, m> | the n-th bit of @relmap is the
+ * m-th set bit of @relmap }.
+ *
+ * Any set bits in @orig above bit number W, where W is the
+ * weight of (number of set bits in) @relmap are mapped nowhere.
+ * In particular, if for all bits m set in @orig, m >= W, then
+ * @dst will end up empty. In situations where the possibility
+ * of such an empty result is not desired, one way to avoid it is
+ * to use the bitmap_fold() operator, below, to first fold the
+ * @orig bitmap over itself so that all its set bits x are in the
+ * range 0 <= x < W. The bitmap_fold() operator does this by
+ * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
+ *
+ * Example [1] for bitmap_onto():
+ * Let's say @relmap has bits 30-39 set, and @orig has bits
+ * 1, 3, 5, 7, 9 and 11 set. Then on return from this routine,
+ * @dst will have bits 31, 33, 35, 37 and 39 set.
+ *
+ * When bit 0 is set in @orig, it means turn on the bit in
+ * @dst corresponding to whatever is the first bit (if any)
+ * that is turned on in @relmap. Since bit 0 was off in the
+ * above example, we leave off that bit (bit 30) in @dst.
+ *
+ * When bit 1 is set in @orig (as in the above example), it
+ * means turn on the bit in @dst corresponding to whatever
+ * is the second bit that is turned on in @relmap. The second
+ * bit in @relmap that was turned on in the above example was
+ * bit 31, so we turned on bit 31 in @dst.
+ *
+ * Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
+ * because they were the 4th, 6th, 8th and 10th set bits
+ * set in @relmap, and the 4th, 6th, 8th and 10th bits of
+ * @orig (i.e. bits 3, 5, 7 and 9) were also set.
+ *
+ * When bit 11 is set in @orig, it means turn on the bit in
+ * @dst corresponding to whatever is the twelfth bit that is
+ * turned on in @relmap. In the above example, there were
+ * only ten bits turned on in @relmap (30..39), so the fact that
+ * bit 11 was set in @orig had no effect on @dst.
+ *
+ * Example [2] for bitmap_fold() + bitmap_onto():
+ * Let's say @relmap has these ten bits set:
+ * 40 41 42 43 45 48 53 61 74 95
+ * (for the curious, that's 40 plus the first ten terms of the
+ * Fibonacci sequence.)
+ *
+ * Further, let's say we use the following code, invoking
+ * bitmap_fold() then bitmap_onto(), as suggested above to
+ * avoid the possibility of an empty @dst result:
+ *
+ * unsigned long *tmp; // a temporary bitmap's bits
+ *
+ * bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
+ * bitmap_onto(dst, tmp, relmap, bits);
+ *
+ * Then this table shows what various values of @dst would be, for
+ * various @orig's. I list the zero-based positions of each set bit.
+ * The tmp column shows the intermediate result, as computed by
+ * using bitmap_fold() to fold the @orig bitmap modulo ten
+ * (the weight of @relmap).
+ *
+ * @orig tmp @dst
+ * 0 0 40
+ * 1 1 41
+ * 9 9 95
+ * 10 0 40 (*)
+ * 1 3 5 7 1 3 5 7 41 43 48 61
+ * 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45
+ * 0 9 18 27 0 9 8 7 40 61 74 95
+ * 0 10 20 30 0 40
+ * 0 11 22 33 0 1 2 3 40 41 42 43
+ * 0 12 24 36 0 2 4 6 40 42 45 53
+ * 78 102 211 1 2 8 41 42 74 (*)
+ *
+ * (*) For these marked lines, if we hadn't first done bitmap_fold()
+ * into tmp, then the @dst result would have been empty.
+ *
+ * If either of @orig or @relmap is empty (no set bits), then @dst
+ * will be returned empty.
+ *
+ * If (as explained above) the only set bits in @orig are in positions
+ * m where m >= W, (where W is the weight of @relmap) then @dst will
+ * once again be returned empty.
+ *
+ * All bits in @dst not set by the above rule are cleared.
+ */
+void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+ const unsigned long *relmap, int bits)
+{
+ int n, m; /* same meaning as in above comment */
+
+ if (dst == orig) /* following doesn't handle inplace mappings */
+ return;
+ bitmap_zero(dst, bits);
+
+ /*
+ * The following code is a more efficient, but less
+ * obvious, equivalent to the loop:
+ * for (m = 0; m < bitmap_weight(relmap, bits); m++) {
+ * n = bitmap_ord_to_pos(orig, m, bits);
+ * if (test_bit(m, orig))
+ * set_bit(n, dst);
+ * }
+ */
+
+ m = 0;
+ for (n = find_first_bit(relmap, bits);
+ n < bits;
+ n = find_next_bit(relmap, bits, n + 1)) {
+ /* m == bitmap_pos_to_ord(relmap, n, bits) */
+ if (test_bit(m, orig))
+ set_bit(n, dst);
+ m++;
+ }
+}
+EXPORT_SYMBOL(bitmap_onto);
+
+/**
+ * bitmap_fold - fold larger bitmap into smaller, modulo specified size
+ * @dst: resulting smaller bitmap
+ * @orig: original larger bitmap
+ * @sz: specified size
+ * @bits: number of bits in each of these bitmaps
+ *
+ * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
+ * Clear all other bits in @dst. See further the comment and
+ * Example [2] for bitmap_onto() for why and how to use this.
+ */
+void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+ int sz, int bits)
+{
+ int oldbit;
+
+ if (dst == orig) /* following doesn't handle inplace mappings */
+ return;
+ bitmap_zero(dst, bits);
+
+ for (oldbit = find_first_bit(orig, bits);
+ oldbit < bits;
+ oldbit = find_next_bit(orig, bits, oldbit + 1))
+ set_bit(oldbit % sz, dst);
+}
+EXPORT_SYMBOL(bitmap_fold);
+
/*
* Common code for bitmap_*_region() routines.
* bitmap: array of unsigned longs corresponding to the bitmap

2008-02-25 15:37:20

by David Rientjes

Subject: [patch 6/6] mempolicy: update NUMA memory policy documentation

Updates Documentation/vm/numa_memory_policy.txt and
Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
flags.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Randy Dunlap <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
Documentation/filesystems/tmpfs.txt | 19 +++++++++++
Documentation/vm/numa_memory_policy.txt | 54 ++++++++++++++++++++++++++++--
2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -92,6 +92,25 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
a range being two hyphen-separated decimal numbers, the smallest and
largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15

+It is possible to specify a static NodeList by appending '=static' to
+the memory policy mode in the mpol= argument. Tasks or VMAs restricted
+to a subset of allowed nodes may then only effect the memory policy
+over those nodes; the NodeList is not remapped when the policy is
+rebound, as it would be by default. For example:
+
+mpol=bind=static:NodeList will only allocate from each node in
+ the NodeList without remapping the
+ NodeList if the policy is rebound
+
+It is also possible to specify a relative NodeList by appending
+'=relative' to the memory policy mode in the mpol= argument. When the
+allowed nodes of a task or VMA change, the mempolicy nodemask is
+rebound to maintain the same context as the previously bound nodemask.
+For example, consider a relative mempolicy nodemask of 1-3 for a task
+that is allowed access to nodes 0-4. If those permissions change to
+allow access to 3-7 instead, the mempolicy nodemask becomes 4-6.
+
Note that trying to mount a tmpfs with an mpol option will fail if the
running kernel does not support NUMA; and will fail if its nodelist
specifies a node which is not online. If your system relies on that
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -135,9 +135,11 @@ most general to most specific:

Components of Memory Policies

- A Linux memory policy is a tuple consisting of a "mode" and an optional set
- of nodes. The mode determine the behavior of the policy, while the
- optional set of nodes can be viewed as the arguments to the behavior.
+ A Linux memory policy consists of a "mode", optional mode flags, and an
+ optional set of nodes. The mode determines the behavior of the policy,
+ the optional mode flags determine the behavior of the mode, and the
+ optional set of nodes can be viewed as the arguments to the policy
+ behavior.

Internally, memory policies are implemented by a reference counted
structure, struct mempolicy. Details of this structure will be discussed
@@ -231,6 +233,48 @@ Components of Memory Policies
the temporary interleaved system default policy works in this
mode.

+ Linux memory policy supports the following optional mode flags:
+
+ MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by
+ the user should not be remapped if the task or VMA's set of accessible
+ nodes changes after the memory policy has been defined.
+
+ Without this flag, anytime a mempolicy is rebound because of a
+ change in the set of accessible nodes, the node (Preferred) or
+ nodemask (Bind, Interleave) is remapped to the new set of
+ accessible nodes. This may result in nodes being used that were
+ previously undesired. With this flag, the policy is either
+ effected over the user's specified nodemask or the Default
+ behavior is used.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 1-3 that sets an Interleave policy over the same set. If
+ the cpuset's mems change to 3-5, the Interleave will now occur
+ over nodes 3, 4, and 5. With this flag, however, since only
+ node 3 is accessible from the user's nodemask, the "interleave"
+ only occurs over that node. If no nodes from the user's
+ nodemask are now accessible, the Default behavior is used.
+
+ MPOL_F_RELATIVE_NODES: This flag specifies that the nodemask passed
+ by the user should remain in the same context as it is for the
+ current task or VMA's set of accessible nodes after the memory
+ policy has been defined.
+
+ Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+ mempolicy is rebound because of a change in the set of
+ accessible nodes, the node (Preferred) or nodemask (Bind,
+ Interleave) is remapped to the new set of accessible nodes.
+ With this flag, the remap is done to ensure the context of the
+ previous nodemask with its set of allowed mems is preserved.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 1-3 that sets an Interleave policy over the same set. If
+ the cpuset's mems change to 3-7, the Interleave will now occur
+ over nodes 3, 4, and 5. With this flag, however, since a
+ nodemask of 1-3 represents the contextually second, third, and
+ fourth nodes of the allowed mems, the Interleave now occurs
+ over nodes 4-6.
+
MEMORY POLICY APIs

Linux supports 3 system calls for controlling memory policy. These APIs
@@ -251,7 +295,9 @@ Set [Task] Memory Policy:
Sets the calling task's "task/process memory policy" to mode
specified by the 'mode' argument and the set of nodes defined
by 'nmask'. 'nmask' points to a bit mask of node ids containing
- at least 'maxnode' ids.
+ at least 'maxnode' ids. Optional mode flags may be passed by
+ combining the 'mode' argument with the flag (for example:
+ MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).

See the set_mempolicy(2) man page for more details
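As a hedged illustration of such a combined call from userspace (build with
-lnuma for the set_mempolicy() wrapper; MPOL_F_STATIC_NODES is defined by
hand here with the value this patchset reserves, since pre-patch numaif.h
headers lack it):

#include <stdio.h>
#include <numaif.h>		/* set_mempolicy(), MPOL_INTERLEAVE */

#ifndef MPOL_F_STATIC_NODES
#define MPOL_F_STATIC_NODES (1 << 15)	/* value per this patchset */
#endif

int main(void)
{
	/* interleave over nodes 1-3, never remapped on cpuset changes */
	unsigned long nodemask = (1UL << 1) | (1UL << 2) | (1UL << 3);

	if (set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
			  &nodemask, sizeof(nodemask) * 8) < 0) {
		perror("set_mempolicy");
		return 1;
	}
	return 0;
}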

2008-02-26 03:21:12

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/6] mempolicy: convert MPOL constants to enum

David wrote:

+enum {
+ MPOL_DEFAULT,
+ MPOL_PREFERRED,
+ MPOL_BIND,
+ MPOL_INTERLEAVE,
+ MPOL_MAX, /* always last member of enum */

Aren't the values that these constants take part of the
user visible kernel API?

In other words, if someone added another MPOL_* in the middle
of this enum, it would break mbind/set_mempolicy/get_mempolicy
users, right:

+enum {
+ MPOL_DEFAULT,
+ MPOL_PREFERRED,
+ MPOL_YET_ANOTHER_FLAG, /* <== added flag ... oops */
+ MPOL_BIND,
+ MPOL_INTERLEAVE,
+ MPOL_MAX, /* always last member of enum */

I'm thinking that we should still specify the specific value
of each of these flags, by way of documenting these necessary
values, as in:

+enum {
+ MPOL_DEFAULT = 0,
+ MPOL_PREFERRED = 1,
+ MPOL_BIND = 2,
+ MPOL_INTERLEAVE = 3,
+ MPOL_MAX, /* always last member of enum */


--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 03:36:24

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/6] mempolicy: convert MPOL constants to enum

On Mon, 25 Feb 2008, Paul Jackson wrote:

> +enum {
> + MPOL_DEFAULT,
> + MPOL_PREFERRED,
> + MPOL_BIND,
> + MPOL_INTERLEAVE,
> + MPOL_MAX, /* always last member of enum */
>
> Aren't the values that these constants take part of the
> user visible kernel API?
>
> In other words, if someone added another MPOL_* in the middle
> of this enum, it would break mbind/set_mempolicy/get_mempolicy
> users, right:
>
> +enum {
> + MPOL_DEFAULT,
> + MPOL_PREFERRED,
> + MPOL_YET_ANOTHER_FLAG, /* <== added flag ... oops */
> + MPOL_BIND,
> + MPOL_INTERLEAVE,
> + MPOL_MAX, /* always last member of enum */
>

I don't suspect that a kernel developer is going to make such an egregious
error. The user would need to be using a new linux/mempolicy.h with an
old kernel to get the wrong behavior.

> I'm thinking that we should still specify the specific value
> of each of these flags, by way of documenting these necessary
> values, as in:
>
> +enum {
> + MPOL_DEFAULT = 0,
> + MPOL_PREFERRED = 1,
> + MPOL_BIND = 2,
> + MPOL_INTERLEAVE = 3,
> + MPOL_MAX, /* always last member of enum */
>

That looks overly redundant to me, and it doesn't protect against adding
MPOL_YET_ANOTHER_FLAG between preferred and bind, which would give two
modes the same int value of 2.
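To illustrate, a hypothetical snippet the compiler accepts silently, since C
permits duplicate enumerator values:

enum {
	MPOL_DEFAULT = 0,
	MPOL_PREFERRED = 1,
	MPOL_YET_ANOTHER_FLAG,	/* implicitly 2 ... */
	MPOL_BIND = 2,		/* ... explicitly 2: a silent collision */
	MPOL_INTERLEAVE = 3,
	MPOL_MAX,		/* still 4, so nothing flags the bug */
};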

David

2008-02-26 04:02:56

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/6] mempolicy: convert MPOL constants to enum

David wrote:
> I don't suspect that a kernel developer is going
> to make such an egregious error.

Eh ... each bit of added clarity to the code reduces
errors.

Looking around the kernel, it really seems to me to be
a common coding practice to explicitly spell out enum values
when for some reason they actually matter.

Like most coding mechanisms, nothing guarantees anything.

It just nicely represents one particular detail, that
the values of these MPOL_* terms are not arbitrary.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 04:22:49

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/6] mempolicy: convert MPOL constants to enum

On Mon, 25 Feb 2008, Paul Jackson wrote:

> Eh ... each bit of added clarity to the code reduces
> errors.
>
> Looking around the kernel, it really seems to me to be
> a common coding to explicitly spell out enum values
> when for some reason they actually matter.
>
> Like most coding mechanisms, nothing guarantees anything.
>
> It just nicely represents one particular detail, that
> the values of these MPOL_* terms are not arbitrary.
>

Of course the MPOL_* modes aren't arbitrary; they are defined in an enum,
and the way enumerators are mapped to int values is well defined and
explicit in the C standard.

I have more mempolicy patches that add additional behavior and cleanups so
I can queue the following for a later posting. I'd like to avoid
respinning this set unless there are actual design or implementation
concerns that are raised.
---
include/linux/mempolicy.h | 4 +++-
1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -20,7 +20,9 @@ enum {
 	MPOL_PREFERRED,
 	MPOL_BIND,
 	MPOL_INTERLEAVE,
-	MPOL_MAX,	/* always last member of enum */
+	/* add additional MPOL_* modes here */
+
+	MPOL_MAX,
 };

/* Flags for set_mempolicy */

2008-02-26 04:47:03

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/6] mempolicy: convert MPOL constants to enum

David wrote:
+ /* add additional MPOL_* modes here */

That doesn't explicitly say what I was most worried about saying, which
is that those MPOL_* terms have values visible in the kernel's public
API. It also says more than might be true, and it doesn't explain
why it says what it says.

It kinda looks like an ugly "maybe this will shut Paul up patch
<grin>". I'd rather leave the code the way it was than add that
comment ... sorry.

> I'd like to avoid respinning this set

Ah - now we get to the real issue ?;).

There would be no need to respin; one could do just as you proposed
doing with the above change, queue a little add-on patch to the
existing set.

Really ... look around the kernel. I believe you'll see many instances
of enum values being spelled out, even ones that count 0, 1, 2, ...,
in situations where the values are constrained by outside forces.
People really do avoid relying on the default enum behaviour having any
particular numbering.

Whatever ... do as you will.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 05:46:31

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

David wrote:
+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
+{
+ return !!(pol->flags & MPOL_F_STATIC_NODES);
+}

Why the double-negative? As best as I can tell, the return value of
mpol_store_user_nodemask() is only used in conditional contexts:

$ grep mpol_store_user_nodemask mm/mempolicy.c
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
if (mpol_store_user_nodemask(policy))
if (!mpol_store_user_nodemask(a))
if (!mpol_store_user_nodemask(pol) &&

So I see no need to waste the instructions needed (in the three copies
of this code, since it's static inline) to convert a non-zero value to
exactly the value 1.
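Something like the following sketch would do, relying on C's usual
nonzero-is-true semantics ('flags' being the unsigned short member the
series adds):

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_F_STATIC_NODES;	/* any nonzero value is true */
}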

Hmmm ... speaking of static inline ... I can knock 600 bytes (that's
IA64 bytes, so equivalent to about 300 x86 bytes) off the kernel text
size by not inlining the mm/mempolicy.c routines check_pgd_range() and
interleave_nid(). I wonder if that would be worth doing. Perhaps
those two routines are in sufficiently tight corners that the duplicate
copies of them are needed.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 06:12:43

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

David wrote:
+static nodemask_t mpol_relative_nodemask(const nodemask_t *orig,
+ const nodemask_t *rel)
+{
+ nodemask_t ret;
+ nodemask_t tmp;

Could you avoid needing the nodemask_t 'ret' on the stack, by passing
in a "nodemask_t *" pointer to where you want the resulting nodemask_t
written, rather than by returning it by value?

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
const nodemask_t *rel)
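A sketch of how the body might then look, assuming the nodes_fold() and
nodes_onto() wrappers this patchset layers over bitmap_fold() and
bitmap_onto():

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;

	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}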

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 06:45:56

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

On Tue, 26 Feb 2008, Paul Jackson wrote:

> David wrote:
> +static nodemask_t mpol_relative_nodemask(const nodemask_t *orig,
> + const nodemask_t *rel)
> +{
> + nodemask_t ret;
> + nodemask_t tmp;
>
> Could you avoid needing the nodemask_t 'ret' on the stack, by passing
> in a "nodemask_t *" pointer to where you want the resulting nodemask_t
> written, rather than by returning it by value?
>
> static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
> const nodemask_t *rel)
>

Done, thanks.

2008-02-26 06:54:20

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

On Mon, 25 Feb 2008, Paul Jackson wrote:

> $ grep mpol_store_user_nodemask mm/mempolicy.c
> static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
> if (mpol_store_user_nodemask(policy))
> if (!mpol_store_user_nodemask(a))
> if (!mpol_store_user_nodemask(pol) &&
>
> So I see no need to waste the instructions needed (in the three copies
> of this code, since it's static inline) to convert a non-zero value to
> exactly the value 1.
>

Done, thanks.

> Hmmm ... speaking of static inline ... I can knock 600 bytes (that's
> IA64 bytes, so equivalent to about 300 x86 bytes) off the kernel text
> size by not inlining the mm/mempolicy.c routines check_pgd_range() and
> interleave_nid(). I wonder if that would be worth doing. Perhaps
> those two routines are in sufficiently tight corners that the duplicate
> copies of them is needed.
>

It seems like a worthwhile change to me even though gcc will pass the
actuals to check_pgd_range() on the stack. The callers to check_range()
shouldn't be in any fast paths: migrate_to_node() can take a long time
depending on the length of the page list and do_mbind() sleeps on
mmap_sem.

   text    data     bss     dec     hex filename
  11695      24      24   11743    2ddf mm/mempolicy.o.before
  11215      24      24   11263    2bff mm/mempolicy.o.after

2008-02-26 17:34:54

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 6/6] mempolicy: update NUMA memory policy documentation

+ MPOL_F_RELATIVE_NODES: This flag specifies that the nodemask passed
+ by the user should remain in the same context as it is for the
+ current task or VMA's set of accessible nodes after the memory
+ policy has been defined.
+
+ Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+ mempolicy is rebound because of a change in the set of
+ accessible nodes, the node (Preferred) or nodemask (Bind,
+ Interleave) is remapped to the new set of accessible nodes.
+ With this flag, the remap is done to ensure the context of the
+ previous nodemask with its set of allowed mems is preserved.

Hmmm ... I've read this several times now ... still can't figure out
what it's saying ;). And it doesn't really explain key aspects of
MPOL_F_RELATIVE_NODES, such as that it provides cpuset relative
numbering (use nodes 0..N-1, regardless of your current cpuset, to
refer to the first N nodes in whatever is your current cpuset -- so a
relative nodemask of 0-1 means nodes 5 and 9 in a cpuset whose mems
are 5,9, but nodes 2 and 3 in a cpuset whose mems are 2-4).

Perhaps we'd be further ahead of the game if you started with the
documentation changes to Documentation/vm/numa_memory_policy.txt,
in my patch:
Date: Sun, 23 Dec 2007 22:24:02 -0600
From: Paul Jackson <[email protected]>
To: David Rientjes <[email protected]>
Cc: [email protected], [email protected]
Subject: Re: [RFC] cpuset relative memory policies - second choice
Message-Id: <[email protected]>

Change MPOL_MASK_REL to MPOL_F_RELATIVE_NODES and similar changes.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 17:44:27

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

David,

Perhaps I missed it, but could you elaborate on what sort of testing
these patches for MPOL_F_RELATIVE_NODES and MPOL_F_STATIC_NODES have
received?

The main reason I didn't push my version of these patches in December
was I figured it would take a week or three of obsessive-compulsive
testing to verify that we didn't break various corner cases of the
mbind/mempolicy system call interface.

In particular, do we know that Oracle works with this? At least in
years past, when Andi was the primary developer here, he had some
good and detailed awareness of what it took to keep Oracle happy
with this NUMA memory policy apparatus. I don't know if we still
have that awareness.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 17:56:38

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

> return do_mbind(start, len, mode, mode_flags, &nodes, flags);

The intermingling of 'flags', 'mode' and 'mode_flags' to refer to the
low bits, the high bits or all the bits of the flags field is handled
fairly carefully in your patch, but can still be a bit difficult to
keep track of which is which when reading.

I'll wager not many readers can immediately say what the 'mode',
'mode_flags' and 'flags' refer to, in the above code snippet, for
example.

Do you have any suggestions on how to further improve the clarity of
this code?

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 21:03:23

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 26 Feb 2008, Paul Jackson wrote:

> > return do_mbind(start, len, mode, mode_flags, &nodes, flags);
>
> The intermingling of 'flags', 'mode' and 'mode_flags' to refer to the
> low bits, the high bits or all the bits of the flags field is handled
> fairly carefully in your patch, but can still be a bit difficult to
> keep track of which is which when reading.
>
> I'll wager not many readers can immediately say what the 'mode',
> 'mode_flags' and 'flags' refer to, in the above code snippet, for
> example.
>
> Do you have any suggestions on how to further improve the clarity of
> this code?
>

This is a natural implementation detail to accommodate your insistence that
the mode and flags be passed as separate actuals throughout many of the
mm/mempolicy.c functions.

No reader is going to understand immediately what 'mode', 'mode_flags',
and 'flags' are if you only provide a single line of the code like that.

It becomes rather obvious what they represent when you read the entire
sys_mbind() implementation, which is serving a syscall that provides its
own formal for passing flags. The name 'mode_flags' is exactly what it
is: flags for the mempolicy mode.

David

2008-02-26 21:18:33

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

On Tue, 26 Feb 2008, Paul Jackson wrote:

> Perhaps I missed it, but could you elaborate on what sort of testing
> these patches for MPOL_F_RELATIVE_NODES and MPOL_F_STATIC_NODES have
> received?
>

Both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES only really change the
implementation when a new mempolicy is created and on rebind.

So it's possible to test both of these areas by booting with numa=fake=8,
setting up a cpuset, hacking numactl to allow |static or |relative options
to be passed, changing the cpuset's mems, and checking the result. Then
you can repeat that process with a different set of initial cpuset mems
and a different syscall nodemask.

# mount -t cpuset none /dev/cpuset
# mkdir /dev/cpuset/test
# cd /dev/cpuset/test
# echo 0-3 > cpus
# echo 1-3 > mems
# echo $$ > tasks
# numactl --interleave|static=2-4 bash
# numactl --show
[should be interleave over 2-3]
# echo 4-6 > mems
# numactl --show
[should be "interleave" over 4]

> In particular, do we know that Oracle works with this? At least in
> years past, when Andi was the primary developer here, he had some
> good and detailed awareness of what it took to keep Oracle happy
> with this NUMA memory policy apparatus. I don't know if we still
> have that awareness.
>

I don't understand what Oracle has to do with anything here; it is up to
the programmer to determine whether to use MPOL_F_STATIC_NODES or
MPOL_F_RELATIVE_NODES for the benefit of his or her application. There's
now a method for doing that, specifically with optional mempolicy mode
flags.

If you have any examples of unexpected behavior after applying my
patchset, please post them and I'll fix them rather quickly.

David

2008-02-26 21:25:21

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 6/6] mempolicy: update NUMA memory policy documentation

On Tue, 26 Feb 2008, Paul Jackson wrote:

> Hmmm ... I've read this several times now ... still can't figure out
> what it's saying ;). And it doesn't really explain key aspects of
> MPOL_F_RELATIVE_NODES, such as that it provides cpuset relative
> numbering (use nodes 0..N-1, regardless of your current cpuset, to
> refer to the first N nodes in whatever is your current cpuset.)
>

I'll elaborate more on the MPOL_F_RELATIVE_NODES effect.

2008-02-26 21:27:17

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

On Tue, 2008-02-26 at 11:44 -0600, Paul Jackson wrote:
> David,
>
> Perhaps I missed it, but could you elaborate on what sort of testing
> these patches for MPOL_F_RELATIVE_NODES and MPOL_F_STATIC_NODES have
> received?
>
> The main reason I didn't push my version of these patches in December
> was I figured it would take a week or three of obsessive-compulsive
> testing to verify that we didn't break various corner cases of the
> mbind/mempolicy system call interface.
>
> In particular, do we know that Oracle works with this? At least in
> years past, when Andi was the primary developer here, he had some
> good and detailed awareness of what it took to keep Oracle happy
> with this NUMA memory policy apparatus. I don't know if we still
> have that awareness.
>

I've recently been "playing with" Andi's numactl/libnuma/mempolicy
regression tests on 2.6.25-rc2-mm1. I found a couple of problems caused by
changes in behavior on newer kernels--not in the mempolicy area, but in
a couple of the tests themselves and in numastat, used by the primary
mempolicy regression test. I haven't seen any discussion of these
issues on the mailing lists, or I missed it. So, I've attempted my own
fixes.

I have placed a tarball containing patches against numactl-1.0.2 sources
at:
http://free.linux.hp.com/~lts/Patches/Numactl/numactl-1.0.2-patches-080226.tar.gz

See the patch descriptions for more info. The individual patches and
series live in the same Numactl directory for easy browsing.

The numactl-1.0.2 sources are located in Andi's repository at:

ftp://ftp.suse.com/pub/people/ak/numa/numactl-1.0.2.tar.gz

With [3(1) of] these patches, the regression tests pass on my x86_64
NUMA test platform [4 socket/node, dual core, 32GB] on 2.6.25-rc2-mm1
with and without Mel Gorman's "two zonelist" patch series. I haven't
gotten around to checking out David's latest series yet, nor have I
tried Andi's tests with cpusets. I don't know whether the tests will
pass in arbitrary cpusets--I expect not. It's on my list. In any case,
you can grab the patches if you want to regression test your own
patches.

Regards,
Lee

(1) patches required for regression tests:

numastat-01-fix-report-order-for-new-kernels.patch
regress2-01-fix-checkaffinity-numnodes-calc.patch
regress2-02-checktopology-numnodes-calc.patch

regress-01-reorg-and-annotate.patch is just a "beautification" [in the
eye of the beholder, of course] patch so that I could figure out what
was going on.



2008-02-26 21:31:10

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

David wrote:
> I don't understand what Oracle has to do with anything here ...

I'm figuring that the risk is high that we broke the existing calls,
when the new MPOL_F_*_NODES flags are not used.

> If you have any examples of unexpected behavior after applying my
> patchset, please post it and I'll fix it rather quickly.

That won't cut it for Oracle, or other such major applications
using NUMA mempolicy calls. If we wait for them to report breakage,
it will be a year or two down the road from now, and it would be
another year or two before they could presume the fix was in the
kernel version they were supporting.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 21:32:53

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 2008-02-26 at 13:02 -0800, David Rientjes wrote:
> On Tue, 26 Feb 2008, Paul Jackson wrote:
>
> > > return do_mbind(start, len, mode, mode_flags, &nodes, flags);
> >
> > The intermingling of 'flags', 'mode' and 'mode_flags' to refer to the
> > low bits, the high bits or all the bits of the flags field is handled
> > fairly carefully in your patch, but can still be a bit difficult to
> > keep track of which is which when reading.
> >
> > I'll wager not many readers can immediately say what the 'mode',
> > 'mode_flags' and 'flags' refer to, in the above code snippet, for
> > example.
> >
> > Do you have any suggestions on how to further improve the clarity of
> > this code?
> >
>
> > This is a natural implementation detail to accommodate your insistence that
> the mode and flags be passed as separate actuals throughout many of the
> mm/mempolicy.c functions.

[:-(]

>
> No reader is going to understand immediately what 'mode', 'mode_flags',
> and 'flags' are if you only provide a single line of the code like that.
>
> It becomes rather obvious what they represent when you read the entire
> sys_mbind() implementation, which is serving a syscall that provides its
> own formal for passing flags. The name 'mode_flags' is exactly what it
> is: flags for the mempolicy mode.

Not to be confused with the MPOL_MF_* flags which are MemPOLicy Mbind
Flags passed via the flags parameter. Nor the other MPOL_F_* flags
which are get_mempolicy() flags, also passed via the flags arg.

:-)

Later,
Lee
>
> David

2008-02-26 21:39:55

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

David wrote:
> No reader is going to understand immediately what 'mode', 'mode_flags',
> and 'flags' are if you only provide a single line of the code like that.

Duh - of course not. I only presented that line to demonstrate that all
three words 'mode', 'mode_flags', and 'flags' appeared in the same context.

> It becomes rather obvious what they represent when you read the entire
> sys_mbind() implementation,

How about picking less vacuous names than variations of mode and flag,
so that it's easier for the reader to distinguish without having to
read and understand an entire chunk of code first.

> This is a natural implementation detail to accomodate your insistance

My patch for this had explicit bit fields, which made the names clearer:

+ int mask_opts_rel:1; /* c.original_mask is relative */
+ int mask_opts_abs:1; /* c.original_mask is absolute */
+ /* if neither rel nor abs, then use c.latest_cpuset_mask */

You chose not to do that, which obfuscates what's going on.
Don't blame me that the result is obfuscated.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-26 21:54:57

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 26 Feb 2008, Lee Schermerhorn wrote:

> > This is a natural implementation detail to accommodate your insistence that
> > the mode and flags be passed as separate actuals throughout many of the
> > mm/mempolicy.c functions.
>
> [:-(]
>

I know, I know :)

It became such an elaborate discussion here that I really didn't want the
entire patchset, which adds a lot of power for cpuset-constrained
applications using mempolicies, to stall on it.

I'm not opposed to adding comments in certain places or changing the name
of an automatic variable, but that's been the degree of review that these
patches have had.

> > It becomes rather obvious what they represent when you read the entire
> > sys_mbind() implementation, which is serving a syscall that provides its
> > own formal for passing flags. The name 'mode_flags' is exactly what it
> > is: flags for the mempolicy mode.
>
> Not to be confused with the MPOL_MF_* flags which are MemPOLicy Mbind
> Flags passed via the flags parameter. Nor the other MPOL_F_* flags
> which are get_mempolicy() flags, also passed via the flags arg.
>

Notice in patch 2 of the series where I add the flags member to struct
mempolicy:

+ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */

I had to explicitly say they were for "set_mempolicy() MPOL_F_*" flags in
an attempt to separate them from the various get_mempolicy() MPOL_F_*
flags that already exist. Instead of simply choosing a different format,
I felt that MPOL_F_STATIC_NODES, for example, is exactly what userspace
should use and corresponds well with the existing get_mempolicy() flags.

David

2008-02-26 22:09:16

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag

Clearly enough, David, my sense of what makes for clear code differs
from your sense. Not much either of us can (or would want to) do about
that.

I was reluctant to push my version of this patch publicly, because I
couldn't afford to test it adequately, by my definition of adequately.

So, fortunately, you stepped up to push your version.

There's plenty enough good stuff in your version that I definitely
want to see it succeed.

So, after a little bit of spitting at each other, you will take whatever
of my suggestions make sense (though try to avoid doing things to placate
me that neither one of us actually prefers ;), and you will discard the
remainder of my suggestions.

We've done that. Good luck with this.

Acked-by: Paul Jackson <[email protected]>

I still look forward to some further improvements in the Documentation
changes.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-27 01:17:29

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

On Mon, 25 Feb 2008, David Rientjes wrote:

> Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
> nodemasks passed via set_mempolicy() or mbind() should be considered
> relative to the current task's mems_allowed.
>

Here's some examples of the functional changes between the default
actions of the various mempolicy modes and the new behavior with
MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.

To read this, the logical order follows from the left-most column to the
right-most:

- "mems" is the task's mems_allowed as constrained by its attached
cpuset,

- "nodemask" is the mask passed with the set_mempolicy() or mbind() call
for that particular policy,

- the first "result" is the nodemask that the policy is effected over,

- "rebind" is the nodemask of a subsequent change to the cpuset's mems,
and

- the second "result" is the nodemask that the policy is now effected
over.

MPOL_INTERLEAVE
---------------
mems   nodemask   result   rebind   result
1-3    0-2        1-2[*]   4-6      4-5
1-3    1-2        1-2      0-2      0-1
1-3    1-3        1-3      4-7      4-6
1-3    2-4        2-3      0-2      1-2
1-3    2-6        2-3      4-7      5-6
1-3    4-7        EINVAL
1-3    0-7        1-3      4-7      4-6

MPOL_PREFERRED
--------------
mems   nodemask   result   rebind   result
1-3    0          EINVAL
1-3    2          2        4-7      5
1-3    5          EINVAL

MPOL_BIND
---------
mems   nodemask   result   rebind   result
1-3    0-2        1-2      0-2      0-1
1-3    1-2        1-2      2-7      2-3
1-3    1-3        1-3      0-1      0-1
1-3    2-4        2-3      3-6      4-5
1-3    2-6        2-3      5        5
1-3    4-7        EINVAL
1-3    0-7        1-3      1-3      1-3

[*] Notice how the resulting nodemask for all of these examples when
creating the mempolicy is intersected with mems_allowed. This is
the current behavior, with contextualize_policy(), and is identical
to the initial result of the MPOL_F_STATIC_NODES case.

Perhaps it would make more sense to remap the nodemask when it is
created, as well, in the ~MPOL_F_STATIC_NODES case. For example, in
this case, the "result" would be 1-3 instead.

That is a departure from what is currently implemented in HEAD (and,
thus, can be used as ample justification for the above behavior) but
makes more sense. Thoughts?

MPOL_INTERLEAVE | MPOL_F_STATIC_NODES
-------------------------------------
mems   nodemask   result   rebind   result
1-3    0-2        1-2      4-6      nil
1-3    1-2        1-2      0-2      1-2
1-3    1-3        1-3      4-7      nil
1-3    2-4        2-3      0-2      2
1-3    2-6        2-3      4-7      4-6
1-3    4-7        EINVAL
1-3    0-7        1-3      4-7      4-7

MPOL_PREFERRED | MPOL_F_STATIC_NODES
------------------------------------
mems   nodemask   result   rebind   result
1-3    0          EINVAL
1-3    2          2        4-7      -1[**]
1-3    5          EINVAL

[**] Upon further rebind with a nodemask of 2, the preferred node would
again be 2.

MPOL_BIND | MPOL_F_STATIC_NODES
-------------------------------
mems   nodemask   result   rebind   result
1-3    0-2        1-2      0-2      0-2
1-3    1-2        1-2      2-7      2
1-3    1-3        1-3      0-1      1
1-3    2-4        2-3      3-6      3-4
1-3    2-6        2-3      5        5
1-3    4-7        EINVAL
1-3    0-7        1-3      1-3      1-3

MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
---------------------------------------
mems   nodemask   result   rebind   result
1-3    0-2        1-3      4-6      4-6
1-3    1-2        2-3      0-2      1-2
1-3    1-3        1-3      4-7      5-7
1-3    2-4        1-3      0-2      0-2
1-3    2-6        1-3      4-7      4-7
1-3    4-7        1-3      0-1,5    0-1,5
1-3    0-7        1-3      4-7      4-7

MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
--------------------------------------
mems   nodemask   result   rebind   result[***]
1-3    0          1        0        1
1-3    2          3        4-7      3
1-3    5          3        0-7      3

[***] All of these results are wrong and will be corrected in the next
posting of the patchset. They change the preferred node in some
cases to be a node that is expressly excluded from being accessed
by the cpuset mems change.

MPOL_BIND | MPOL_F_RELATIVE_NODES
---------------------------------
mems   nodemask   result   rebind   result
1-3    0-2        1-3      0-2      0-2
1-3    1-2        2-3      2-7      3-4
1-3    1-3        1-3      0-1      0-1
1-3    2-4        1-3      3-6      3,5-6
1-3    2-6        1-3      5        5
1-3    4-7        1-3      0-3,6    0-2,6
1-3    0-7        1-3      1-3      1-3

2008-02-27 01:31:31

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

On Tue, 26 Feb 2008, David Rientjes wrote:

> MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
> --------------------------------------
> mems nodemask result rebind result[***]
> 1-3 0 1 0 1
> 1-3 2 3 4-7 3
> 1-3 5 3 0-7 3
>

I'll fold the following patch into my next posting that corrects this
error as follows:

MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
--------------------------------------
mems   nodemask   result   rebind   result
1-3    0          1        0        0
1-3    2          3        4-7      6
1-3    5          3        0-7      5
---
mm/mempolicy.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1809,8 +1809,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	} else if (relative_nodes) {
 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
 				       newmask);
-		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-				pol->w.cpuset_mems_allowed, tmp);
+		pol->v.preferred_node = first_node(tmp);
 	} else {
 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
 				pol->w.cpuset_mems_allowed, *newmask);

2008-02-27 02:31:04

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

Nice presentation ... thanks.

I agree with your patch to fix the MPOL_PREFERRED | MPOL_F_RELATIVE_NODES case.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-27 15:37:47

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

On Tue, 2008-02-26 at 17:17 -0800, David Rientjes wrote:
> On Mon, 25 Feb 2008, David Rientjes wrote:
>
> > Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
> > nodemasks passed via set_mempolicy() or mbind() should be considered
> > relative to the current task's mems_allowed.
> >
>
> Here's some examples of the functional changes between the default
> actions of the various mempolicy modes and the new behavior with
> MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.

Nice work. Would you consider adding this [with the corrections you
note below] to the memory policy doc under the "interaction with
cpusets" section?

>
> To read this, the logical order follows from the left-most column to the
> right-most:
>
> - "mems" is the task's mems_allowed as constrained by its attached
> cpuset,
>
> - "nodemask" is the mask passed with the set_mempolicy() or mbind() call
> for that particular policy,
>
> - the first "result" is the nodemask that the policy is effected over,
>
> - "rebind" is the nodemask of a subsequent change to the cpuset's mems,
> and
>
> - the second "result" is the nodemask that the policy is now effected
> over.
>
> MPOL_INTERLEAVE
> ---------------
> mems nodemask result rebind result
> 1-3 0-2 1-2[*] 4-6 4-5
> 1-3 1-2 1-2 0-2 0-1
> 1-3 1-3 1-3 4-7 4-6
> 1-3 2-4 2-3 0-2 1-2
> 1-3 2-6 2-3 4-7 5-6
> 1-3 4-7 EINVAL
> 1-3 0-7 1-3 4-7 4-6
>
> MPOL_PREFERRED
> --------------
> mems nodemask result rebind result
> 1-3 0 EINVAL
> 1-3 2 2 4-7 5
> 1-3 5 EINVAL
>
> MPOL_BIND
> ---------
> mems nodemask result rebind result
> 1-3 0-2 1-2 0-2 0-1
> 1-3 1-2 1-2 2-7 2-3
> 1-3 1-3 1-3 0-1 0-1
> 1-3 2-4 2-3 3-6 4-5
> 1-3 2-6 2-3 5 5
> 1-3 4-7 EINVAL
> 1-3 0-7 1-3 1-3 1-3

Just a note here: If you had used the same set of "rebind targets" for
_BIND as you did for _INTERLEAVE, I would expect the same results,
because we're just remapping bit masks in both cases. Do you agree?

>
> [*] Notice how the resulting nodemask for all of these examples when
> creating the mempolicy is intersected with mems_allowed. This is
> the current behavior, with contextualize_policy(), and is identical
> to the initial result of the MPOL_F_STATIC_NODES case.
>
> Perhaps it would make more sense to remap the nodemask when it is
> created, as well, in the ~MPOL_F_STATIC_NODES case. For example, in
> this case, the "result" would be 1-3 instead.
>
> That is a departure from what is currently implemented in HEAD (and,
> thus, can be used as ample justification for the above behavior) but
> makes more sense. Thoughts?

Thoughts:

1) this IS a change in behavior, right? My first inclination is to shy
away from this. However, ...

2) the current interaction of mempolicies with cpusets is not well
documented--until Paul's cpuset.4 man page hits the streets, anyway.
That doc does say that mempolicy is not allowed to use a node outside
the cpuset. It does NOT say how this is enforced--reject vs masking vs
remap. The set_mempolicy(2) and mbind(2) man pages [in at least 2.70
man pages] says that you get EINVAL if you specify a node outside the
current cpuset constraints. This was relaxed by the recent patch to
"silently restrict" the nodes to mems allowed.

Since we update the man pages anyway, we COULD change it to say that we
remap policy to allowed nodes. However, the application may have chosen
the nodes based on some knowledge of hardware topology, such as IO
attachment, interrupt handling cpus, ... In this case, remapping
doesn't make so much sense to me.

If you need/want a mode that remaps policy to mems allowed on
installation--e.g., to provide the maximum number of interleave
nodes--how about yet another flag, such as '_REMAP, to effect this
behavior?

Just a thought...

>
> MPOL_INTERLEAVE | MPOL_F_STATIC_NODES
> -------------------------------------
> mems nodemask result rebind result
> 1-3 0-2 1-2 4-6 nil
> 1-3 1-2 1-2 0-2 1-2
> 1-3 1-3 1-3 4-7 nil
> 1-3 2-4 2-3 0-2 2
> 1-3 2-6 2-3 4-7 4-6
> 1-3 4-7 EINVAL
> 1-3 0-7 1-3 4-7 4-7

'nil' falls back to local allocation, right?

>
> MPOL_PREFERRED | MPOL_F_STATIC_NODES
> ------------------------------------
> mems nodemask result rebind result
> 1-3 0 EINVAL
> 1-3 2 2 4-7 -1[**]
> 1-3 5 EINVAL
>
> [**] Upon further rebind with a nodemask of 2, the preferred node would
> again be 2.

Here, '-1' means 'local allocation'. [Note for documentation...]

>
> MPOL_BIND | MPOL_F_STATIC_NODES
> -------------------------------
> mems nodemask result rebind result
> 1-3 0-2 1-2 0-2 0-2
> 1-3 1-2 1-2 2-7 2
> 1-3 1-3 1-3 0-1 1
> 1-3 2-4 2-3 3-6 3-4
> 1-3 2-6 2-3 5 5
> 1-3 4-7 EINVAL
> 1-3 0-7 1-3 1-3 1-3
>
> MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
> ---------------------------------------
> mems nodemask result rebind result
> 1-3 0-2 1-3 4-6 4-6
> 1-3 1-2 2-3 0-2 1-2
> 1-3 1-3 1-3 4-7 5-7
> 1-3 2-4 1-3 0-2 0-2
> 1-3 2-6 1-3 4-7 4-7
> 1-3 4-7 1-3 0-1,5 0-1,5
> 1-3 0-7 1-3 4-7 4-7
>
> MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
> --------------------------------------
> mems nodemask result rebind result[***]
> 1-3 0 1 0 1
> 1-3 2 3 4-7 3
> 1-3 5 3 0-7 3
>
> [***] All of these results are wrong and will be corrected in the next
> posting of the patchset. They change the preferred node in some
> cases to be a node that is expressly excluded from being accessed
> by the cpuset mems change.
>
> MPOL_BIND | MPOL_F_RELATIVE_NODES
> ---------------------------------
> mems nodemask result rebind result
> 1-3 0-2 1-3 0-2 0-2
> 1-3 1-2 2-3 2-7 3-4
> 1-3 1-3 1-3 0-1 0-1
> 1-3 2-4 1-3 3-6 3,5-6
> 1-3 2-6 1-3 5 5
> 1-3 4-7 1-3 0-3,6 0-2,6
> 1-3 0-7 1-3 1-3 1-3

2008-02-27 17:09:26

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

Lee wrote:
> 1) this IS a change in behavior, right? My first inclination is to shy
> away from this. However, ...
>
> 2) the current interaction of mempolicies with cpusets is not well
> documented--until Paul's cpuset.4 man page hits the streets, anyway.
> That doc does say that mempolicy is not allowed to use a node outside
> the cpuset. It does NOT say how this is enforced--reject vs masking vs

We should strive to minimize change in existing behaviour, especially
on points where either way could be arguably a reasonable choice in
some situations, and neither way is a clearly inescapable bug in other
situations.

One almost always breaks someone, somewhere with such changes. You
need a compelling reason to do that, not just a "on this hand, on that
hand" design choice.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-27 19:35:53

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 1/6] mempolicy: convert MPOL constants to enum

On Mon, 25 Feb 2008, David Rientjes wrote:

> struct mempolicy {
> atomic_t refcnt;
> - short policy; /* See MPOL_* above */
> + unsigned short policy; /* See MPOL_* above */

The point here is to have a 16-bit value? There are no space savings due to
the alignment requirement of the union. Isn't it possible to use the enum
here? If not, then what is the point of the enum?

2008-02-27 20:01:09

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/6] mempolicy: convert MPOL constants to enum

On Wed, 27 Feb 2008, Christoph Lameter wrote:

> On Mon, 25 Feb 2008, David Rientjes wrote:
>
> > struct mempolicy {
> > atomic_t refcnt;
> > - short policy; /* See MPOL_* above */
> > + unsigned short policy; /* See MPOL_* above */
>
> The point here is to have a 16-bit value? There are no space savings due to
> the alignment requirement of the union. Isn't it possible to use the enum
> here? If not, then what is the point of the enum?
>

It was already a 16-bit value; that is unchanged, so no space savings is
intended.

It is not possible to use the enum here because an additional unsigned
short member to hold the mode flags is added to this struct later. The
increase in size of an int compared to an unsigned short would make the
entire struct bigger; the alignment requirement of the union no longer
prevents that. After this patchset, it is 24 bytes in size while using
unsigned short compared to 32 bytes while using int.
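For illustration only, here is a userspace sketch of that size arithmetic;
the struct names are hypothetical stand-ins, atomic_t is modeled as int, and
nodemask_t as a single unsigned long (an x86_64 config with small
MAX_NUMNODES):

#include <stdio.h>

struct mempolicy_ushort {		/* layout with unsigned short members */
	int refcnt;
	unsigned short policy;		/* mode */
	unsigned short flags;		/* optional mode flags */
	union { void *zonelist; short preferred_node; unsigned long nodes; } v;
	union { unsigned long cpuset_mems_allowed; unsigned long user_nodemask; } w;
};

struct mempolicy_int {			/* same layout with int members */
	int refcnt;
	int policy;
	int flags;
	union { void *zonelist; short preferred_node; unsigned long nodes; } v;
	union { unsigned long cpuset_mems_allowed; unsigned long user_nodemask; } w;
};

int main(void)
{
	printf("unsigned short: %zu bytes\n", sizeof(struct mempolicy_ushort)); /* 24 */
	printf("int:            %zu bytes\n", sizeof(struct mempolicy_int));    /* 32 */
	return 0;
}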

The enum is there to ensure MPOL_MAX is always accurate since it is now
the sole way of testing for a proper mode being passed from the user, and
modes should be sequentially numbered. A later change will add an array
instance of a mempolicy_operations structure for the various modes. It's
just cleaner.

David

2008-02-28 21:08:57

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag

On Wed, 27 Feb 2008, Lee Schermerhorn wrote:

> > Here's some examples of the functional changes between the default
> > actions of the various mempolicy modes and the new behavior with
> > MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.
>
> Nice work. Would you consider adding this [with the corrections you
> note below] to the memory policy doc under the "interaction with
> cpusets" section?
>

I think it's overly verbose for adding to documentation. Other than
describing the effects of MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES, I
don't think much more in the way of examples needs to be provided (they
were here to show that the patchset actually works on creation and
rebind).

> > MPOL_INTERLEAVE
> > ---------------
> > mems nodemask result rebind result
> > 1-3 0-2 1-2[*] 4-6 4-5
> > 1-3 1-2 1-2 0-2 0-1
> > 1-3 1-3 1-3 4-7 4-6
> > 1-3 2-4 2-3 0-2 1-2
> > 1-3 2-6 2-3 4-7 5-6
> > 1-3 4-7 EINVAL
> > 1-3 0-7 1-3 4-7 4-6
> >
> > MPOL_PREFERRED
> > --------------
> > mems nodemask result rebind result
> > 1-3 0 EINVAL
> > 1-3 2 2 4-7 5
> > 1-3 5 EINVAL
> >
> > MPOL_BIND
> > ---------
> > mems nodemask result rebind result
> > 1-3 0-2 1-2 0-2 0-1
> > 1-3 1-2 1-2 2-7 2-3
> > 1-3 1-3 1-3 0-1 0-1
> > 1-3 2-4 2-3 3-6 4-5
> > 1-3 2-6 2-3 5 5
> > 1-3 4-7 EINVAL
> > 1-3 0-7 1-3 1-3 1-3
>
> Just a note here: If you had used the same set of "rebind targets" for
> _BIND as you did for _INTERLEAVE, I would expect the same results,
> because were just remapping bit masks in both cases. Do you agree?
>

Yes, the results are the same for both interleave and bind and that's
actually why I didn't use the same rebind targets.

> > [*] Notice how the resulting nodemask for all of these examples when
> > creating the mempolicy is intersected with mems_allowed. This is
> > the current behavior, with contextualize_policy(), and is identical
> > to the initial result of the MPOL_F_STATIC_NODES case.
> >
> > Perhaps it would make more sense to remap the nodemask when it is
> > created, as well, in the ~MPOL_F_STATIC_NODES case. For example, in
> > this case, the "result" would be 1-3 instead.
> >
> > That is a departure from what is currently implemented in HEAD (and,
> > thus, can be used as ample justification for the above behavior) but
> > makes more sense. Thoughts?
>
> Thoughts:
>
> 1) this IS a change in behavior, right? My first inclination is to shy
> away from this. However, ...
>

Yes, it's the current behavior as a result of contextualize_policy()
before mpol_new() is ever reached.

> 2) the current interaction of mempolicies with cpusets is not well
> documented--until Paul's cpuset.4 man page hits the streets, anyway.
> That doc does say that mempolicy is not allowed to use a node outside
> the cpuset. It does NOT say how this is enforced--reject vs masking vs
> remap. The set_mempolicy(2) and mbind(2) man pages [in at least 2.70
> man pages] says that you get EINVAL if you specify a node outside the
> current cpuset constraints. This was relaxed by the recent patch to
> "silently restrict" the nodes to mems allowed.
>
> Since we update the man pages anyway, we COULD change it to say that we
> remap policy to allowed nodes. However, the application may have chosen
> the nodes based on some knowledge of hardware topology, such as IO
> attachement, interrupt handling cpus, ... In this case, remapping
> doesn't make so much sense to me.
>
> If you need/want a mode that remaps policy to mems allowed on
> installation--e.g., to provide the maximum number of interleave
> nodes--how about yet another flag, such as '_REMAP, to effect this
> behavior?
>

MPOL_F_RELATIVE_NODES actually does map onto and fold the passed nodemask
when the mempolicy is created. That flag specifically refers to relative
nodes within a task's mems_allowed all the time; MPOL_F_STATIC_NODES
refers specifically to physical node ids.

So adding MPOL_F_REMAP doesn't make sense for MPOL_F_RELATIVE_NODES since
its already mapped onto and folded, and doesn't make sense for
MPOL_F_STATIC_NODES since the nodemask is explicitly never remapped.

It would make sense as an optional third flag that would be disjoint from
the other two and would only be useful when the mempolicy is created.

> > MPOL_INTERLEAVE | MPOL_F_STATIC_NODES
> > -------------------------------------
> > mems nodemask result rebind result
> > 1-3 0-2 1-2 4-6 nil
> > 1-3 1-2 1-2 0-2 1-2
> > 1-3 1-3 1-3 4-7 nil
> > 1-3 2-4 2-3 0-2 2
> > 1-3 2-6 2-3 4-7 4-6
> > 1-3 4-7 EINVAL
> > 1-3 0-7 1-3 4-7 4-7
>
> 'nil' falls back to local allocation, right?
>

Yes, it's an interleave over no nodes and is thus a local allocation
(current->il_next is numa_node_id()).

David