2008-02-11 15:31:18

by David Rientjes

[permalink] [raw]
Subject: [patch 1/4] mempolicy: convert MPOL constants to enum

The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
and MPOL_INTERLEAVE, are better declared as part of an enum for type
checking.

The policy member of struct mempolicy is also converted from type short
to type unsigned short. A negative policy does not have any legitimate
meaning, so it is possible to change its type in preparation for adding
optional mode flags later.

The equivalent member of struct shmem_sb_info is also changed from int
to unsigned short.

For compatibility, the policy formal to get_mempolicy() remains as a
pointer to an int:

int get_mempolicy(int *policy, unsigned long *nmask,
unsigned long maxnode, unsigned long addr,
unsigned long flags);

although the only possible values are in the range of type unsigned short.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
Applies on top of V3 of Lee Schermerhorn's mempolicy patch posted to
LKML on February 8.

include/linux/mempolicy.h | 21 +++++++++++----------
include/linux/shmem_fs.h | 2 +-
mm/mempolicy.c | 29 +++++++++++++++++------------
mm/shmem.c | 11 ++++++-----
4 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -9,12 +9,13 @@
*/

/* Policies */
-#define MPOL_DEFAULT 0
-#define MPOL_PREFERRED 1
-#define MPOL_BIND 2
-#define MPOL_INTERLEAVE 3
-
-#define MPOL_MAX MPOL_INTERLEAVE
+enum mempolicy_mode {
+ MPOL_DEFAULT,
+ MPOL_PREFERRED,
+ MPOL_BIND,
+ MPOL_INTERLEAVE,
+ MPOL_MAX, /* always last member of mempolicy_mode */
+};

/* Flags for get_mem_policy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
@@ -62,7 +63,7 @@ struct mm_struct;
*/
struct mempolicy {
atomic_t refcnt;
- short policy; /* See MPOL_* above */
+ unsigned short policy; /* See MPOL_* above */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
@@ -133,8 +134,8 @@ struct shared_policy {
spinlock_t lock;
};

-void mpol_shared_policy_init(struct shared_policy *info, int policy,
- nodemask_t *nodes);
+void mpol_shared_policy_init(struct shared_policy *info,
+ enum mempolicy_mode policy, nodemask_t *nodes);
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
struct mempolicy *new);
@@ -200,7 +201,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
}

static inline void mpol_shared_policy_init(struct shared_policy *info,
- int policy, nodemask_t *nodes)
+ enum mempolicy_mode policy, nodemask_t *nodes)
{
}

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -34,7 +34,7 @@ struct shmem_sb_info {
uid_t uid; /* Mount uid for root directory */
gid_t gid; /* Mount gid for root directory */
mode_t mode; /* Mount mode for root directory */
- int policy; /* Default NUMA memory alloc policy */
+ unsigned short policy; /* Default NUMA memory alloc policy */
nodemask_t policy_nodes; /* nodemask for preferred and bind */
};

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -114,7 +114,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
const nodemask_t *newmask);

/* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, nodemask_t *nodes)
+static int mpol_check_policy(enum mempolicy_mode mode, nodemask_t *nodes)
{
int was_empty, is_empty;

@@ -159,6 +159,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
if (!was_empty && is_empty)
return -EINVAL;
break;
+ default:
+ BUG();
}
return 0;
}
@@ -201,7 +203,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
}

/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
{
struct mempolicy *policy;

@@ -235,6 +237,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
return error_code;
}
break;
+ default:
+ BUG();
}
policy->policy = mode;
policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
@@ -479,7 +483,7 @@ static void mpol_set_task_struct_flag(void)
}

/* Set the process memory policy */
-static long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(enum mempolicy_mode mode, nodemask_t *nodes)
{
struct mempolicy *new;

@@ -781,7 +785,7 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
#endif

static long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask,
+ enum mempolicy_mode mode, nodemask_t *nmask,
unsigned long flags)
{
struct vm_area_struct *vma;
@@ -791,9 +795,8 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);

- if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+ if (flags & ~(unsigned long)(MPOL_MF_STRICT |
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- || mode > MPOL_MAX)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -826,7 +829,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;

- pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
mode, nmask ? nodes_addr(*nmask)[0] : -1);

down_write(&mm->mmap_sem);
@@ -927,6 +930,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
nodemask_t nodes;
int err;

+ if (mode >= MPOL_MAX)
+ return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
@@ -940,7 +945,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
int err;
nodemask_t nodes;

- if (mode < 0 || mode > MPOL_MAX)
+ if (mode < 0 || mode >= MPOL_MAX)
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
@@ -1206,7 +1211,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
*/
unsigned slab_node(struct mempolicy *policy)
{
- int pol = policy ? policy->policy : MPOL_DEFAULT;
+ enum mempolicy_mode pol = policy ? policy->policy : MPOL_DEFAULT;

switch (pol) {
case MPOL_INTERLEAVE:
@@ -1634,8 +1639,8 @@ restart:
return 0;
}

-void mpol_shared_policy_init(struct shared_policy *info, int policy,
- nodemask_t *policy_nodes)
+void mpol_shared_policy_init(struct shared_policy *info,
+ enum mempolicy_mode policy, nodemask_t *policy_nodes)
{
info->root = RB_ROOT;
spin_lock_init(&info->lock);
@@ -1853,7 +1858,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
char *p = buffer;
int l;
nodemask_t nodes;
- int mode = pol ? pol->policy : MPOL_DEFAULT;
+ enum mempolicy_mode mode = pol ? pol->policy : MPOL_DEFAULT;

switch (mode) {
case MPOL_DEFAULT:
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1082,7 +1082,8 @@ redirty:

#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static int shmem_parse_mpol(char *value, unsigned short *policy,
+ nodemask_t *policy_nodes)
{
char *nodelist = strchr(value, ':');
int err = 1;
@@ -1131,7 +1132,7 @@ out:
return err;
}

-static void shmem_show_mpol(struct seq_file *seq, int policy,
+static void shmem_show_mpol(struct seq_file *seq, enum mempolicy_mode policy,
const nodemask_t policy_nodes)
{
char *policy_string;
@@ -1200,14 +1201,14 @@ static struct page *shmem_alloc_page(gfp_t gfp,
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
-static inline int shmem_parse_mpol(char *value, int *policy,
+static inline int shmem_parse_mpol(char *value, unsigned short *policy,
nodemask_t *policy_nodes)
{
return 1;
}

-static inline void shmem_show_mpol(struct seq_file *seq, int policy,
- const nodemask_t policy_nodes)
+static inline void shmem_show_mpol(struct seq_file *seq,
+ enum mempolicy_mode policy, const nodemask_t policy_nodes)
{
}
#endif /* CONFIG_TMPFS */


2008-02-11 15:31:44

by David Rientjes

[permalink] [raw]
Subject: [patch 4/4] mempolicy: update NUMA memory policy documentation

Updates Documentation/vm/numa_memory_policy.txt and
Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
flags.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
Documentation/filesystems/tmpfs.txt | 11 ++++++++
Documentation/vm/numa_memory_policy.txt | 41 +++++++++++++++++++++++++++----
2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -92,6 +92,17 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
a range being two hyphen-separated decimal numbers, the smallest and
largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15

+It is possible to specify a static NodeList by appending '=static' to
+the memory policy mode in the mpol= argument. When '=static' is
+specified, the NodeList is not remapped when the policy is rebound
+(remapping is the default behavior); tasks or VMA's restricted to a
+subset of allowed nodes may only effect the memory policy over those
+nodes. For example:
+
+mpol=bind=static:NodeList will only allocate from each node in
+ the NodeList without remapping the
+ NodeList if the policy is rebound
+
Note that trying to mount a tmpfs with an mpol option will fail if the
running kernel does not support NUMA; and will fail if its nodelist
specifies a node which is not online. If your system relies on that
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -135,9 +135,11 @@ most general to most specific:

Components of Memory Policies

- A Linux memory policy is a tuple consisting of a "mode" and an optional set
- of nodes. The mode determine the behavior of the policy, while the
- optional set of nodes can be viewed as the arguments to the behavior.
+ A Linux memory policy consists of a "mode", optional mode flags, and an
+ optional set of nodes. The mode determines the behavior of the policy,
+ the optional mode flags determine the behavior of the mode, and the
+ optional set of nodes can be viewed as the arguments to the policy
+ behavior.

Internally, memory policies are implemented by a reference counted
structure, struct mempolicy. Details of this structure will be discussed
@@ -145,7 +147,12 @@ Components of Memory Policies

Note: in some functions AND in the struct mempolicy itself, the mode
is called "policy". However, to avoid confusion with the policy tuple,
- this document will continue to use the term "mode".
+ this document will continue to use the term "mode". Since the mode and
+ optional mode flags are stored in the same struct mempolicy member
+ (specifically, pol->policy), you must use mpol_mode(pol->policy) to
+ access only the mode and mpol_flags(pol->policy) to access only the
+ flags. Any function with a formal of type enum mempolicy_mode only
+ refers to the mode.

Linux memory policy supports the following 4 behavioral modes:

@@ -231,6 +238,28 @@ Components of Memory Policies
the temporary interleaved system default policy works in this
mode.

+ Linux memory policy supports the following optional mode flag:
+
+ MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by
+ the user should not be remapped if the task or VMA's set of accessible
+ nodes changes after the memory policy has been defined.
+
+ Without this flag, anytime a mempolicy is rebound because of a
+ change in the set of accessible nodes, the node (Preferred) or
+ nodemask (Bind, Interleave) is remapped to the new set of
+ accessible nodes. This may result in nodes being used that were
+ previously undesired. With this flag, the policy is either
+ effected over the user's specified nodemask or the Default
+ behavior is used.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 1-3 that sets an Interleave policy over the same set. If
+ the cpuset's mems change to 3-5, the Interleave will now occur
+ over nodes 3, 4, and 5. With this flag, however, since only
+ node 3 is accessible from the user's nodemask, the "interleave"
+ only occurs over that node. If no nodes from the user's
+ nodemask are now accessible, the Default behavior is used.
+
MEMORY POLICY APIs

Linux supports 3 system calls for controlling memory policy. These APIS
@@ -251,7 +280,9 @@ Set [Task] Memory Policy:
Set's the calling task's "task/process memory policy" to mode
specified by the 'mode' argument and the set of nodes defined
by 'nmask'. 'nmask' points to a bit mask of node ids containing
- at least 'maxnode' ids.
+ at least 'maxnode' ids. Optional mode flags may be passed by
+ combining the 'mode' argument with the flag using bitwise OR
+ (for example: MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).

See the set_mempolicy(2) man page for more details

2008-02-11 15:32:22

by David Rientjes

[permalink] [raw]
Subject: [patch 2/4] mempolicy: support optional mode flags

With the evolution of mempolicies, it is necessary to support mempolicy
mode flags that specify how the policy shall behave in certain
circumstances. The most immediate need for mode flag support is to
suppress remapping the nodemask of a policy at the time of rebind.

With the small number of possible mempolicy modes (currently four) and
the large size of the struct mempolicy member that stores this mode
(unsigned short), it is possible to store both the mode and optional mode
flags in the same member.

To do this, it is necessary to mask off the optional mode flag bits when
testing the mempolicy mode. A new constant, MPOL_FLAG_SHIFT, indicates
the shift necessary to find the flag bits within struct mempolicy's
policy member. With this, MPOL_MODE_MASK can be defined to mask off the
optional flags, yielding just the mode itself.

A helper function:

unsigned char mpol_mode(unsigned short mode)

has been implemented to return only a policy's mode. Notice that the
return type is unsigned char since MPOL_FLAG_SHIFT is currently defined
at eight. mpol_flags(unsigned short mode) does the inverse.

While this requires frequent use of mpol_mode() within the internal
mempolicy code, it is easy to distinguish between actuals that contain
only modes and those that contain the entire policy member (modes and
flags). If the formal uses enum mempolicy_mode, only the mode itself
may successfully be passed because of type checking.

Note that this does not break userspace code that relies on
get_mempolicy(&policy, ...) and either 'switch (policy)' statements or
'if (policy == MPOL_INTERLEAVE)' statements. Such applications would
need to use optional mode flags when calling set_mempolicy() or mbind()
for these previously implemented statements to stop working. If an
application does start using optional mode flags, it will need to use
mpol_mode() in switch and conditional statements that only test mode.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
fs/hugetlbfs/inode.c | 2 +-
include/linux/mempolicy.h | 39 +++++++++++++++++++++--
mm/mempolicy.c | 77 ++++++++++++++++++++++++++------------------
mm/shmem.c | 15 ++++++--
4 files changed, 93 insertions(+), 40 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
INIT_LIST_HEAD(&inode->i_mapping->private_list);
info = HUGETLBFS_I(inode);
- mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
+ mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -17,7 +17,18 @@ enum mempolicy_mode {
MPOL_MAX, /* always last member of mempolicy_mode */
};

-/* Flags for get_mem_policy */
+/*
+ * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
+ * constants defined in enum mempolicy_mode. The upper bits represent
+ * optional set_mempolicy() MPOL_F_* mode flags.
+ */
+#define MPOL_FLAG_SHIFT (8)
+#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
+
+/* Flags for set_mempolicy */
+#define MPOL_MODE_FLAGS (0) /* legal set_mempolicy() MPOL_* mode flags */
+
+/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
@@ -115,6 +126,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)

#define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)

+static inline unsigned char mpol_mode(unsigned short mode)
+{
+ return mode & MPOL_MODE_MASK;
+}
+
+static inline unsigned short mpol_flags(unsigned short mode)
+{
+ return mode & ~MPOL_MODE_MASK;
+}
+
/*
* Tree of shared policies for a shared memory region.
* Maintain the policies in a pseudo mm that contains vmas. The vmas
@@ -135,7 +156,8 @@ struct shared_policy {
};

void mpol_shared_policy_init(struct shared_policy *info,
- enum mempolicy_mode policy, nodemask_t *nodes);
+ enum mempolicy_mode policy, unsigned short flags,
+ nodemask_t *nodes);
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
struct mempolicy *new);
@@ -176,6 +198,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
}
#define vma_mpol_equal(a,b) 1

+static inline unsigned char mpol_mode(unsigned short mode)
+{
+ return 0;
+}
+
+static inline unsigned short mpol_flags(unsigned short mode)
+{
+ return 0;
+}
+
#define mpol_set_vma_default(vma) do {} while(0)

static inline void mpol_free(struct mempolicy *p)
@@ -201,7 +233,8 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
}

static inline void mpol_shared_policy_init(struct shared_policy *info,
- enum mempolicy_type policy, nodemask_t *nodes)
+ enum mempolicy_mode policy, unsigned short flags,
+ nodemask_t *nodes)
{
}

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -203,18 +203,20 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
}

/* Create a new policy */
-static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(enum mempolicy_mode mode,
+ unsigned short flags, nodemask_t *nodes)
{
struct mempolicy *policy;

- pr_debug("setting mode %d nodes[0] %lx\n",
- mode, nodes ? nodes_addr(*nodes)[0] : -1);
+ pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

if (mode == MPOL_DEFAULT)
return NULL;
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
+ flags &= MPOL_MODE_FLAGS;
atomic_set(&policy->refcnt, 1);
switch (mode) {
case MPOL_INTERLEAVE:
@@ -240,7 +242,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
default:
BUG();
}
- policy->policy = mode;
+ policy->policy = mode | flags;
policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
}
@@ -483,19 +485,20 @@ static void mpol_set_task_struct_flag(void)
}

/* Set the process memory policy */
-static long do_set_mempolicy(enum mempolicy_mode mode, nodemask_t *nodes)
+static long do_set_mempolicy(enum mempolicy_mode mode, unsigned short flags,
+ nodemask_t *nodes)
{
struct mempolicy *new;

if (mpol_check_policy(mode, nodes))
return -EINVAL;
- new = mpol_new(mode, nodes);
+ new = mpol_new(mode, flags, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
mpol_set_task_struct_flag();
- if (new && new->policy == MPOL_INTERLEAVE)
+ if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE)
current->il_next = first_node(new->v.nodes);
return 0;
}
@@ -506,7 +509,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
int i;

nodes_clear(*nodes);
- switch (p->policy) {
+ switch (mpol_mode(p->policy)) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++)
node_set(zone_to_nid(p->v.zonelist->zones[i]),
@@ -588,7 +591,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
- pol->policy == MPOL_INTERLEAVE) {
+ mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
*policy = current->il_next;
} else {
err = -EINVAL;
@@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
#endif

static long do_mbind(unsigned long start, unsigned long len,
- enum mempolicy_mode mode, nodemask_t *nmask,
- unsigned long flags)
+ enum mempolicy_mode mode, unsigned short mode_flags,
+ nodemask_t *nmask, unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
@@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (mpol_check_policy(mode, nmask))
return -EINVAL;

- new = mpol_new(mode, nmask);
+ new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);

@@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;

- pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
- mode, nmask ? nodes_addr(*nmask)[0] : -1);
+ pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+ start, start + len, mode, mode_flags,
+ nmask ? nodes_addr(*nmask)[0] : -1);

down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask,
@@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
{
nodemask_t nodes;
int err;
+ unsigned short mode_flags;

+ mode_flags = mpol_flags(mode);
+ mode = mpol_mode(mode);
if (mode >= MPOL_MAX)
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, &nodes, flags);
+ return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
@@ -944,13 +951,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
{
int err;
nodemask_t nodes;
+ unsigned short flags;

+ flags = mpol_flags(mode);
+ mode = mpol_mode(mode);
if (mode < 0 || mode >= MPOL_MAX)
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, &nodes);
+ return do_set_mempolicy(mode, flags, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1152,7 +1162,7 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
pol = vma->vm_ops->get_policy(vma, addr);
shared_pol = 1; /* if pol non-NULL, add ref below */
} else if (vma->vm_policy &&
- vma->vm_policy->policy != MPOL_DEFAULT)
+ mpol_mode(vma->vm_policy->policy) != MPOL_DEFAULT)
pol = vma->vm_policy;
}
if (!pol)
@@ -1167,7 +1177,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
int nd;

- switch (policy->policy) {
+ switch (mpol_mode(policy->policy)) {
case MPOL_PREFERRED:
nd = policy->v.preferred_node;
if (nd < 0)
@@ -1211,7 +1221,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
*/
unsigned slab_node(struct mempolicy *policy)
{
- enum mempolicy_mode pol = policy ? policy->policy : MPOL_DEFAULT;
+ enum mempolicy_mode pol = policy ? mpol_mode(policy->policy) :
+ MPOL_DEFAULT;

switch (pol) {
case MPOL_INTERLEAVE:
@@ -1297,7 +1308,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
struct zonelist *zl;

*mpol = NULL; /* probably no unref needed */
- if (pol->policy == MPOL_INTERLEAVE) {
+ if (mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
unsigned nid;

nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
@@ -1307,7 +1318,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,

zl = zonelist_policy(GFP_HIGHUSER, pol);
if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
- if (pol->policy != MPOL_BIND)
+ if (mpol_mode(pol->policy) != MPOL_BIND)
__mpol_free(pol); /* finished with pol */
else
*mpol = pol; /* unref needed after allocation */
@@ -1361,7 +1372,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)

cpuset_update_task_memory_state();

- if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
+ if (unlikely(mpol_mode(pol->policy) == MPOL_INTERLEAVE)) {
unsigned nid;

nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
@@ -1409,7 +1420,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
- if (pol->policy == MPOL_INTERLEAVE)
+ if (mpol_mode(pol->policy) == MPOL_INTERLEAVE)
return alloc_page_interleave(gfp, order, interleave_nodes(pol));
return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
@@ -1436,7 +1447,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
}
*new = *old;
atomic_set(&new->refcnt, 1);
- if (new->policy == MPOL_BIND) {
+ if (mpol_mode(new->policy) == MPOL_BIND) {
int sz = ksize(old->v.zonelist);
new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
if (!new->v.zonelist) {
@@ -1454,7 +1465,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return 0;
if (a->policy != b->policy)
return 0;
- switch (a->policy) {
+ switch (mpol_mode(a->policy)) {
case MPOL_DEFAULT:
return 1;
case MPOL_INTERLEAVE:
@@ -1479,7 +1490,7 @@ void __mpol_free(struct mempolicy *p)
{
if (!atomic_dec_and_test(&p->refcnt))
return;
- if (p->policy == MPOL_BIND)
+ if (mpol_mode(p->policy) == MPOL_BIND)
kfree(p->v.zonelist);
p->policy = MPOL_DEFAULT;
kmem_cache_free(policy_cache, p);
@@ -1640,7 +1651,8 @@ restart:
}

void mpol_shared_policy_init(struct shared_policy *info,
- enum mempolicy_mode policy, nodemask_t *policy_nodes)
+ enum mempolicy_mode policy, unsigned short flags,
+ nodemask_t *policy_nodes)
{
info->root = RB_ROOT;
spin_lock_init(&info->lock);
@@ -1649,7 +1661,7 @@ void mpol_shared_policy_init(struct shared_policy *info,
struct mempolicy *newpol;

/* Falls back to MPOL_DEFAULT on any error */
- newpol = mpol_new(policy, policy_nodes);
+ newpol = mpol_new(policy, flags, policy_nodes);
if (!IS_ERR(newpol)) {
/* Create pseudo-vma that contains just the policy */
struct vm_area_struct pvma;
@@ -1745,14 +1757,14 @@ void __init numa_policy_init(void)
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);

- if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
+ if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
- do_set_mempolicy(MPOL_DEFAULT, NULL);
+ do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/* Migrate a policy to a different set of nodes */
@@ -1768,7 +1780,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
if (nodes_equal(*mpolmask, *newmask))
return;

- switch (pol->policy) {
+ switch (mpol_mode(pol->policy)) {
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
@@ -1858,7 +1870,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
char *p = buffer;
int l;
nodemask_t nodes;
- enum mempolicy_mode mode = pol ? pol->policy : MPOL_DEFAULT;
+ enum mempolicy_mode mode = pol ? mpol_mode(pol->policy) :
+ MPOL_DEFAULT;

switch (mode) {
case MPOL_DEFAULT:
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1086,6 +1086,7 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
nodemask_t *policy_nodes)
{
char *nodelist = strchr(value, ':');
+ char *flags = strchr(value, '=');
int err = 1;

if (nodelist) {
@@ -1096,6 +1097,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
goto out;
}
+ if (flags)
+ *flags++ = '\0';
if (!strcmp(value, "default")) {
*policy = MPOL_DEFAULT;
/* Don't allow a nodelist */
@@ -1125,6 +1128,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
*policy_nodes = node_states[N_HIGH_MEMORY];
err = 0;
}
+ if (flags) {
+ }
out:
/* Restore string for error message */
if (nodelist)
@@ -1154,7 +1159,7 @@ static void shmem_show_mpol(struct seq_file *seq, enum mempolicy_mode policy,

seq_printf(seq, ",mpol=%s", policy_string);

- if (policy != MPOL_INTERLEAVE ||
+ if (mpol_mode(policy) != MPOL_INTERLEAVE ||
!nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
char buffer[64];
int len;
@@ -1577,8 +1582,10 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
case S_IFREG:
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
- mpol_shared_policy_init(&info->policy, sbinfo->policy,
- &sbinfo->policy_nodes);
+ mpol_shared_policy_init(&info->policy,
+ mpol_mode(sbinfo->policy),
+ mpol_flags(sbinfo->policy),
+ &sbinfo->policy_nodes);
break;
case S_IFDIR:
inc_nlink(inode);
@@ -1592,7 +1599,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
* Must not load anything in the rbtree,
* mpol_free_shared_policy will not be called.
*/
- mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+ mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
NULL);
break;
}

2008-02-11 15:32:34

by David Rientjes

[permalink] [raw]
Subject: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses
the node remap when the policy is rebound.

Adds another member to struct mempolicy,

nodemask_t user_nodemask

that stores the nodemask that the user passed when he or she created
the mempolicy via set_mempolicy() or mbind(). When using
MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's
passed nodemask is always used when determining the preferred node,
building the MPOL_BIND zonelist, or creating the interleave nodemask.
This happens whenever the policy is rebound, including when a task's
cpuset assignment changes or the cpuset's mems are changed.

This creates an interesting side-effect in that it allows the mempolicy
"intent" to lie dormant and uneffected until it has access to the node(s)
that it desires. For example, if you currently ask for an interleaved
policy over a set of nodes that you do not have access to, the mempolicy
is not created and the task continues to use the equivalent of
MPOL_DEFAULT. With this change, however, it is possible to create the
same mempolicy; it is effected only when access to the nodes is acquired.

It is also possible to mount tmpfs with the static nodemask behavior when
specifying a node or nodemask. To do this, simply add "=static"
immediately following the mempolicy mode at mount time:

mount -o remount mpol=interleave=static:1-3

Also removes mpol_check_policy() and folds some of its logic into
mpol_new() since it is now mostly obsoleted.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
include/linux/mempolicy.h | 4 +-
mm/mempolicy.c | 137 ++++++++++++++++++---------------------------
mm/shmem.c | 2 +
3 files changed, 60 insertions(+), 83 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -26,7 +26,8 @@ enum mempolicy_mode {
#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)

/* Flags for set_mempolicy */
-#define MPOL_MODE_FLAGS (0) /* legal set_mempolicy() MPOL_* mode flags */
+#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)
+#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES) /* legal set_mempolicy() MPOL_* mode flags */

/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
@@ -82,6 +83,7 @@ struct mempolicy {
/* undefined for default */
} v;
nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
+ nodemask_t user_nodemask; /* nodemask passed by user */
};

/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,58 +113,6 @@ struct mempolicy default_policy = {
static void mpol_rebind_policy(struct mempolicy *pol,
const nodemask_t *newmask);

-/* Do sanity checking on a policy */
-static int mpol_check_policy(enum mempolicy_mode mode, nodemask_t *nodes)
-{
- int was_empty, is_empty;
-
- if (!nodes)
- return 0;
-
- /*
- * "Contextualize" the in-coming nodemast for cpusets:
- * Remember whether in-coming nodemask was empty, If not,
- * restrict the nodes to the allowed nodes in the cpuset.
- * This is guaranteed to be a subset of nodes with memory.
- */
- cpuset_update_task_memory_state();
- is_empty = was_empty = nodes_empty(*nodes);
- if (!was_empty) {
- nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
- is_empty = nodes_empty(*nodes); /* after "contextualization" */
- }
-
- switch (mode) {
- case MPOL_DEFAULT:
- /*
- * require caller to specify an empty nodemask
- * before "contextualization"
- */
- if (!was_empty)
- return -EINVAL;
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- /*
- * require at least 1 valid node after "contextualization"
- */
- if (is_empty)
- return -EINVAL;
- break;
- case MPOL_PREFERRED:
- /*
- * Did caller specify invalid nodes?
- * Don't silently accept this as "local allocation".
- */
- if (!was_empty && is_empty)
- return -EINVAL;
- break;
- default:
- BUG();
- }
- return 0;
-}
-
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
@@ -207,6 +155,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
unsigned short flags, nodemask_t *nodes)
{
struct mempolicy *policy;
+ nodemask_t cpuset_context_nmask;

pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -218,21 +167,27 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
return ERR_PTR(-ENOMEM);
flags &= MPOL_MODE_FLAGS;
atomic_set(&policy->refcnt, 1);
+ cpuset_update_task_memory_state();
+ nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
switch (mode) {
case MPOL_INTERLEAVE:
- policy->v.nodes = *nodes;
+ if (nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ policy->v.nodes = cpuset_context_nmask;
if (nodes_weight(policy->v.nodes) == 0) {
kmem_cache_free(policy_cache, policy);
return ERR_PTR(-EINVAL);
}
break;
case MPOL_PREFERRED:
- policy->v.preferred_node = first_node(*nodes);
+ policy->v.preferred_node = first_node(cpuset_context_nmask);
if (policy->v.preferred_node >= MAX_NUMNODES)
policy->v.preferred_node = -1;
break;
case MPOL_BIND:
- policy->v.zonelist = bind_zonelist(nodes);
+ if (nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
if (IS_ERR(policy->v.zonelist)) {
void *error_code = policy->v.zonelist;
kmem_cache_free(policy_cache, policy);
@@ -244,6 +199,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
}
policy->policy = mode | flags;
policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
+ policy->user_nodemask = *nodes;
return policy;
}

@@ -490,15 +446,14 @@ static long do_set_mempolicy(enum mempolicy_mode mode, unsigned short flags,
{
struct mempolicy *new;

- if (mpol_check_policy(mode, nodes))
- return -EINVAL;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
mpol_set_task_struct_flag();
- if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE)
+ if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE &&
+ nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
return 0;
}
@@ -818,9 +773,6 @@ static long do_mbind(unsigned long start, unsigned long len,
if (end == start)
return 0;

- if (mpol_check_policy(mode, nmask))
- return -EINVAL;
-
new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
@@ -1211,7 +1163,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
next = next_node(nid, policy->v.nodes);
if (next >= MAX_NUMNODES)
next = first_node(policy->v.nodes);
- me->il_next = next;
+ if (next < MAX_NUMNODES)
+ me->il_next = next;
return nid;
}

@@ -1250,10 +1203,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
- unsigned target = (unsigned)off % nnodes;
+ unsigned target;
int c;
int nid = -1;

+ if (!nnodes)
+ return numa_node_id();
+ target = (unsigned)off % nnodes;
c = 0;
do {
nid = next_node(nid, pol->v.nodes);
@@ -1773,6 +1729,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
{
nodemask_t *mpolmask;
nodemask_t tmp;
+ int remap;

if (!pol)
return;
@@ -1780,51 +1737,67 @@ static void mpol_rebind_policy(struct mempolicy *pol,
if (nodes_equal(*mpolmask, *newmask))
return;

+ remap = !(mpol_flags(pol->policy) & MPOL_F_STATIC_NODES);
switch (mpol_mode(pol->policy)) {
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
- nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
+ if (!remap)
+ nodes_and(tmp, pol->user_nodemask, *newmask);
+ else
+ nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
pol->v.nodes = tmp;
- *mpolmask = *newmask;
- current->il_next = node_remap(current->il_next,
- *mpolmask, *newmask);
+ if (!node_isset(current->il_next, tmp)) {
+ current->il_next = next_node(current->il_next, tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = first_node(tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = numa_node_id();
+ }
break;
case MPOL_PREFERRED:
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- *mpolmask, *newmask);
- *mpolmask = *newmask;
+ if (!remap && !node_isset(pol->v.preferred_node, *newmask))
+ pol->v.preferred_node = -1;
+ else
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ *mpolmask, *newmask);
break;
case MPOL_BIND: {
- nodemask_t nodes;
- struct zone **z;
- struct zonelist *zonelist;
-
- nodes_clear(nodes);
- for (z = pol->v.zonelist->zones; *z; z++)
- node_set(zone_to_nid(*z), nodes);
- nodes_remap(tmp, nodes, *mpolmask, *newmask);
- nodes = tmp;
+ struct zonelist *zonelist = NULL;
+
+ if (!remap)
+ nodes_and(tmp, pol->user_nodemask, *newmask);
+ else {
+ nodemask_t nodes;
+ struct zone **z;
+
+ nodes_clear(nodes);
+ for (z = pol->v.zonelist->zones; *z; z++)
+ node_set(zone_to_nid(*z), nodes);
+ nodes_remap(tmp, nodes, *mpolmask, *newmask);
+ }

- zonelist = bind_zonelist(&nodes);
+ if (nodes_weight(tmp))
+ zonelist = bind_zonelist(&tmp);

/* If no mem, then zonelist is NULL and we keep old zonelist.
* If that old zonelist has no remaining mems_allowed nodes,
* then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
*/

- if (!IS_ERR(zonelist)) {
+ if (zonelist && !IS_ERR(zonelist)) {
/* Good - got mem - substitute new zonelist */
kfree(pol->v.zonelist);
pol->v.zonelist = zonelist;
}
- *mpolmask = *newmask;
break;
}
default:
BUG();
break;
}
+ if (mpol_mode(pol->policy) != MPOL_DEFAULT)
+ *mpolmask = *newmask;
}

/*
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1129,6 +1129,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
err = 0;
}
if (flags) {
+ if (!strcmp(flags, "static"))
+ *policy |= MPOL_F_STATIC_NODES;
}
out:
/* Restore string for error message */

2008-02-11 16:36:49

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

On Mon, 2008-02-11 at 07:30 -0800, David Rientjes wrote:
> With the evolution of mempolicies, it is necessary to support mempolicy
> mode flags that specify how the policy shall behave in certain
> circumstances. The most immediate need for mode flag support is to
> suppress remapping the nodemask of a policy at the time of rebind.
>
> With the small number of possible mempolicy modes (currently four) and
> the large size of the struct mempolicy member that stores this mode
> (unsigned short), it is possible to store both the mode and optional mode
> flags in the same member.
>
> To do this, it is necessary to mask off the optional mode flag bits when
> testing the mempolicy mode. A new constant, MPOL_FLAG_SHIFT, indicates
> the shift necessary to find the flag bits within struct mempolicy's
> policy member. With this, MPOL_MODE_MASK can be defined to mask off the
> optional flags, yielding just the mode itself.
>
> A helper function:
>
> unsigned char mpol_mode(unsigned short mode)
>
> has been implemented to return only a policy's mode. Notice that the
> return type is unsigned char since MPOL_FLAG_SHIFT is currently defined
> at eight. mpol_flags(unsigned short mode) does the inverse.
>
> While this requires frequent use of mpol_mode() within the internal
> mempolicy code, it is easy to distinguish between actuals that contain
> only modes and those that contain the entire policy member (modes and
> flags). If the formal uses enum mempolicy_mode, only the mode itself
> may successfully be passed because of type checking.
>
> Note that this does not break userspace code that relies on
> get_mempolicy(&policy, ...) and either 'switch (policy)' statements or
> 'if (policy == MPOL_INTERLEAVE)' statements. Such applications would
> need to use optional mode flags when calling set_mempolicy() or mbind()
> for these previously implemented statements to stop working. If an
> application does start using optional mode flags, it will need to use
> mpol_mode() in switch and conditional statements that only test mode.
>

Hi, David:

These patches look good--well, interesting, anyway. I'm "off on
assignment" this week, so I won't get to review in detail, merge and
test them until next...

The helper functions introduced by this patch are similar in nature
[but go further] to one I introduced in the reference counting cleanup
RFC [http://marc.info/?l=linux-mm&m=119697614520515&w=4] I posted a
while back. I've been holding these cleanup patches until Andrew starts
accepting this sort of thing again. I have my series based atop Mel
Gorman's [added to cc] "two zonelist" series, as it depends on removing
the custom zonelist from the mempolicy.

[I'm hoping, as mempolicies evolve, we can refrain from hanging any
allocated structs off them as that does complicate reference counting --
especially in swap read-ahead where we can attempt to allocate several
pages using a single policy lookup. See the patch linked above]

We need to sort out with Andrew, Mel, Paul, ... the order in which these
interdependent changes go in. Given such an order, I'm willing to merge
them all up, test them, and post them [after running checkpatch, of
course].

One other thing: In the recent mempolicy patch to "silently restrict
nodemask", I mentioned the issue with regards to whether and when to
"contextualize" tmpfs/hugetlbfs policies--if/when we fold
mpol_check_policy() into mpol_new(), as you suggested. Once we can
agree on the desired semantics, I had been thinking that an additional
mode flag could be added to policies obtained from the superblock, and
passed via mpol_shared_policy_init() [which calls mpol_new()] could be
used for this purpose. Your change here seems to lay the foundation for
implementing that.

Lee

Cc: Paul Jackson <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Lee Schermerhorn <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> fs/hugetlbfs/inode.c | 2 +-
> include/linux/mempolicy.h | 39 +++++++++++++++++++++--
> mm/mempolicy.c | 77 ++++++++++++++++++++++++++------------------
> mm/shmem.c | 15 ++++++--
> 4 files changed, 93 insertions(+), 40 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
> inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> INIT_LIST_HEAD(&inode->i_mapping->private_list);
> info = HUGETLBFS_I(inode);
> - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
> + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
> switch (mode & S_IFMT) {
> default:
> init_special_inode(inode, mode, dev);
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -17,7 +17,18 @@ enum mempolicy_mode {
> MPOL_MAX, /* always last member of mempolicy_mode */
> };
>
> -/* Flags for get_mem_policy */
> +/*
> + * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
> + * constants defined in enum mempolicy_mode. The upper bits represent
> + * optional set_mempolicy() MPOL_F_* mode flags.
> + */
> +#define MPOL_FLAG_SHIFT (8)
> +#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
> +
> +/* Flags for set_mempolicy */
> +#define MPOL_MODE_FLAGS (0) /* legal set_mempolicy() MPOL_* mode flags */
> +
> +/* Flags for get_mempolicy */
> #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
> #define MPOL_F_ADDR (1<<1) /* look up vma using address */
> #define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
> @@ -115,6 +126,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
>
> #define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
>
> +static inline unsigned char mpol_mode(unsigned short mode)
> +{
> + return mode & MPOL_MODE_MASK;
> +}
> +
> +static inline unsigned short mpol_flags(unsigned short mode)
> +{
> + return mode & ~MPOL_MODE_MASK;
> +}
> +
> /*
> * Tree of shared policies for a shared memory region.
> * Maintain the policies in a pseudo mm that contains vmas. The vmas
> @@ -135,7 +156,8 @@ struct shared_policy {
> };
>
> void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_mode policy, nodemask_t *nodes);
> + enum mempolicy_mode policy, unsigned short flags,
> + nodemask_t *nodes);
> int mpol_set_shared_policy(struct shared_policy *info,
> struct vm_area_struct *vma,
> struct mempolicy *new);
> @@ -176,6 +198,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
> }
> #define vma_mpol_equal(a,b) 1
>
> +static inline unsigned char mpol_mode(unsigned short mode)
> +{
> + return 0;
> +}
> +
> +static inline unsigned short mpol_flags(unsigned short mode)
> +{
> + return 0;
> +}
> +
> #define mpol_set_vma_default(vma) do {} while(0)
>
> static inline void mpol_free(struct mempolicy *p)
> @@ -201,7 +233,8 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
> }
>
> static inline void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_type policy, nodemask_t *nodes)
> + enum mempolicy_type policy, unsigned short flags,
> + nodemask_t *nodes)
> {
> }
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -203,18 +203,20 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
> }
>
> /* Create a new policy */
> -static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> +static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> + unsigned short flags, nodemask_t *nodes)
> {
> struct mempolicy *policy;
>
> - pr_debug("setting mode %d nodes[0] %lx\n",
> - mode, nodes ? nodes_addr(*nodes)[0] : -1);
> + pr_debug("setting mode %d flags %d nodes[0] %lx\n",
> + mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
>
> if (mode == MPOL_DEFAULT)
> return NULL;
> policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
> if (!policy)
> return ERR_PTR(-ENOMEM);
> + flags &= MPOL_MODE_FLAGS;
> atomic_set(&policy->refcnt, 1);
> switch (mode) {
> case MPOL_INTERLEAVE:
> @@ -240,7 +242,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> default:
> BUG();
> }
> - policy->policy = mode;
> + policy->policy = mode | flags;
> policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
> return policy;
> }
> @@ -483,19 +485,20 @@ static void mpol_set_task_struct_flag(void)
> }
>
> /* Set the process memory policy */
> -static long do_set_mempolicy(enum mempolicy_mode mode, nodemask_t *nodes)
> +static long do_set_mempolicy(enum mempolicy_mode mode, unsigned short flags,
> + nodemask_t *nodes)
> {
> struct mempolicy *new;
>
> if (mpol_check_policy(mode, nodes))
> return -EINVAL;
> - new = mpol_new(mode, nodes);
> + new = mpol_new(mode, flags, nodes);
> if (IS_ERR(new))
> return PTR_ERR(new);
> mpol_free(current->mempolicy);
> current->mempolicy = new;
> mpol_set_task_struct_flag();
> - if (new && new->policy == MPOL_INTERLEAVE)
> + if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE)
> current->il_next = first_node(new->v.nodes);
> return 0;
> }
> @@ -506,7 +509,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
> int i;
>
> nodes_clear(*nodes);
> - switch (p->policy) {
> + switch (mpol_mode(p->policy)) {
> case MPOL_BIND:
> for (i = 0; p->v.zonelist->zones[i]; i++)
> node_set(zone_to_nid(p->v.zonelist->zones[i]),
> @@ -588,7 +591,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> goto out;
> *policy = err;
> } else if (pol == current->mempolicy &&
> - pol->policy == MPOL_INTERLEAVE) {
> + mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
> *policy = current->il_next;
> } else {
> err = -EINVAL;
> @@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
> #endif
>
> static long do_mbind(unsigned long start, unsigned long len,
> - enum mempolicy_mode mode, nodemask_t *nmask,
> - unsigned long flags)
> + enum mempolicy_mode mode, unsigned short mode_flags,
> + nodemask_t *nmask, unsigned long flags)
> {
> struct vm_area_struct *vma;
> struct mm_struct *mm = current->mm;
> @@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (mpol_check_policy(mode, nmask))
> return -EINVAL;
>
> - new = mpol_new(mode, nmask);
> + new = mpol_new(mode, mode_flags, nmask);
> if (IS_ERR(new))
> return PTR_ERR(new);
>
> @@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (!new)
> flags |= MPOL_MF_DISCONTIG_OK;
>
> - pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
> - mode, nmask ? nodes_addr(*nmask)[0] : -1);
> + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
> + start, start + len, mode, mode_flags,
> + nmask ? nodes_addr(*nmask)[0] : -1);
>
> down_write(&mm->mmap_sem);
> vma = check_range(mm, start, end, nmask,
> @@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
> {
> nodemask_t nodes;
> int err;
> + unsigned short mode_flags;
>
> + mode_flags = mpol_flags(mode);
> + mode = mpol_mode(mode);
> if (mode >= MPOL_MAX)
> return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> - return do_mbind(start, len, mode, &nodes, flags);
> + return do_mbind(start, len, mode, mode_flags, &nodes, flags);
> }
>
> /* Set the process memory policy */
> @@ -944,13 +951,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
> {
> int err;
> nodemask_t nodes;
> + unsigned short flags;
>
> + flags = mpol_flags(mode);
> + mode = mpol_mode(mode);
> if (mode < 0 || mode >= MPOL_MAX)
> return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> - return do_set_mempolicy(mode, &nodes);
> + return do_set_mempolicy(mode, flags, &nodes);
> }
>
> asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
> @@ -1152,7 +1162,7 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
> pol = vma->vm_ops->get_policy(vma, addr);
> shared_pol = 1; /* if pol non-NULL, add ref below */
> } else if (vma->vm_policy &&
> - vma->vm_policy->policy != MPOL_DEFAULT)
> + mpol_mode(vma->vm_policy->policy) != MPOL_DEFAULT)
> pol = vma->vm_policy;
> }
> if (!pol)
> @@ -1167,7 +1177,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
> {
> int nd;
>
> - switch (policy->policy) {
> + switch (mpol_mode(policy->policy)) {
> case MPOL_PREFERRED:
> nd = policy->v.preferred_node;
> if (nd < 0)
> @@ -1211,7 +1221,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> */
> unsigned slab_node(struct mempolicy *policy)
> {
> - enum mempolicy_mode pol = policy ? policy->policy : MPOL_DEFAULT;
> + enum mempolicy_mode pol = policy ? mpol_mode(policy->policy) :
> + MPOL_DEFAULT;
>
> switch (pol) {
> case MPOL_INTERLEAVE:
> @@ -1297,7 +1308,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
> struct zonelist *zl;
>
> *mpol = NULL; /* probably no unref needed */
> - if (pol->policy == MPOL_INTERLEAVE) {
> + if (mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
> unsigned nid;
>
> nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
> @@ -1307,7 +1318,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
>
> zl = zonelist_policy(GFP_HIGHUSER, pol);
> if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
> - if (pol->policy != MPOL_BIND)
> + if (mpol_mode(pol->policy) != MPOL_BIND)
> __mpol_free(pol); /* finished with pol */
> else
> *mpol = pol; /* unref needed after allocation */
> @@ -1361,7 +1372,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
>
> cpuset_update_task_memory_state();
>
> - if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
> + if (unlikely(mpol_mode(pol->policy) == MPOL_INTERLEAVE)) {
> unsigned nid;
>
> nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
> @@ -1409,7 +1420,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
> cpuset_update_task_memory_state();
> if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
> pol = &default_policy;
> - if (pol->policy == MPOL_INTERLEAVE)
> + if (mpol_mode(pol->policy) == MPOL_INTERLEAVE)
> return alloc_page_interleave(gfp, order, interleave_nodes(pol));
> return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
> }
> @@ -1436,7 +1447,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
> }
> *new = *old;
> atomic_set(&new->refcnt, 1);
> - if (new->policy == MPOL_BIND) {
> + if (mpol_mode(new->policy) == MPOL_BIND) {
> int sz = ksize(old->v.zonelist);
> new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
> if (!new->v.zonelist) {
> @@ -1454,7 +1465,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
> return 0;
> if (a->policy != b->policy)
> return 0;
> - switch (a->policy) {
> + switch (mpol_mode(a->policy)) {
> case MPOL_DEFAULT:
> return 1;
> case MPOL_INTERLEAVE:
> @@ -1479,7 +1490,7 @@ void __mpol_free(struct mempolicy *p)
> {
> if (!atomic_dec_and_test(&p->refcnt))
> return;
> - if (p->policy == MPOL_BIND)
> + if (mpol_mode(p->policy) == MPOL_BIND)
> kfree(p->v.zonelist);
> p->policy = MPOL_DEFAULT;
> kmem_cache_free(policy_cache, p);
> @@ -1640,7 +1651,8 @@ restart:
> }
>
> void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_mode policy, nodemask_t *policy_nodes)
> + enum mempolicy_mode policy, unsigned short flags,
> + nodemask_t *policy_nodes)
> {
> info->root = RB_ROOT;
> spin_lock_init(&info->lock);
> @@ -1649,7 +1661,7 @@ void mpol_shared_policy_init(struct shared_policy *info,
> struct mempolicy *newpol;
>
> /* Falls back to MPOL_DEFAULT on any error */
> - newpol = mpol_new(policy, policy_nodes);
> + newpol = mpol_new(policy, flags, policy_nodes);
> if (!IS_ERR(newpol)) {
> /* Create pseudo-vma that contains just the policy */
> struct vm_area_struct pvma;
> @@ -1745,14 +1757,14 @@ void __init numa_policy_init(void)
> if (unlikely(nodes_empty(interleave_nodes)))
> node_set(prefer, interleave_nodes);
>
> - if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
> + if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
> printk("numa_policy_init: interleaving failed\n");
> }
>
> /* Reset policy of current process to default */
> void numa_default_policy(void)
> {
> - do_set_mempolicy(MPOL_DEFAULT, NULL);
> + do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
> }
>
> /* Migrate a policy to a different set of nodes */
> @@ -1768,7 +1780,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> if (nodes_equal(*mpolmask, *newmask))
> return;
>
> - switch (pol->policy) {
> + switch (mpol_mode(pol->policy)) {
> case MPOL_DEFAULT:
> break;
> case MPOL_INTERLEAVE:
> @@ -1858,7 +1870,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
> char *p = buffer;
> int l;
> nodemask_t nodes;
> - enum mempolicy_mode mode = pol ? pol->policy : MPOL_DEFAULT;
> + enum mempolicy_mode mode = pol ? mpol_mode(pol->policy) :
> + MPOL_DEFAULT;
>
> switch (mode) {
> case MPOL_DEFAULT:
> diff --git a/mm/shmem.c b/mm/shmem.c
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1086,6 +1086,7 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> nodemask_t *policy_nodes)
> {
> char *nodelist = strchr(value, ':');
> + char *flags = strchr(value, '=');
> int err = 1;
>
> if (nodelist) {
> @@ -1096,6 +1097,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
> goto out;
> }
> + if (flags)
> + *flags++ = '\0';
> if (!strcmp(value, "default")) {
> *policy = MPOL_DEFAULT;
> /* Don't allow a nodelist */
> @@ -1125,6 +1128,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> *policy_nodes = node_states[N_HIGH_MEMORY];
> err = 0;
> }
> + if (flags) {
> + }
> out:
> /* Restore string for error message */
> if (nodelist)
> @@ -1154,7 +1159,7 @@ static void shmem_show_mpol(struct seq_file *seq, enum mempolicy_mode policy,
>
> seq_printf(seq, ",mpol=%s", policy_string);
>
> - if (policy != MPOL_INTERLEAVE ||
> + if (mpol_mode(policy) != MPOL_INTERLEAVE ||
> !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
> char buffer[64];
> int len;
> @@ -1577,8 +1582,10 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
> case S_IFREG:
> inode->i_op = &shmem_inode_operations;
> inode->i_fop = &shmem_file_operations;
> - mpol_shared_policy_init(&info->policy, sbinfo->policy,
> - &sbinfo->policy_nodes);
> + mpol_shared_policy_init(&info->policy,
> + mpol_mode(sbinfo->policy),
> + mpol_flags(sbinfo->policy),
> + &sbinfo->policy_nodes);
> break;
> case S_IFDIR:
> inc_nlink(inode);
> @@ -1592,7 +1599,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
> * Must not load anything in the rbtree,
> * mpol_free_shared_policy will not be called.
> */
> - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
> + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
> NULL);
> break;
> }

2008-02-11 18:25:27

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

Hi David,

In general, I like this feature.
and I found no bug in patch [1/4] and [2/4].

> @@ -218,21 +167,27 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> return ERR_PTR(-ENOMEM);
> flags &= MPOL_MODE_FLAGS;
> atomic_set(&policy->refcnt, 1);
> + cpuset_update_task_memory_state();
> + nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
> switch (mode) {
> case MPOL_INTERLEAVE:
> - policy->v.nodes = *nodes;
> + if (nodes_empty(*nodes))
> + return ERR_PTR(-EINVAL);

need kmem_cache_free(policy_cache, policy) before return?

> + policy->v.nodes = cpuset_context_nmask;
> if (nodes_weight(policy->v.nodes) == 0) {
> kmem_cache_free(policy_cache, policy);
> return ERR_PTR(-EINVAL);
> }
> break;
> case MPOL_PREFERRED:
> - policy->v.preferred_node = first_node(*nodes);
> + policy->v.preferred_node = first_node(cpuset_context_nmask);
> if (policy->v.preferred_node >= MAX_NUMNODES)
> policy->v.preferred_node = -1;
> break;
> case MPOL_BIND:
> - policy->v.zonelist = bind_zonelist(nodes);
> + if (nodes_empty(*nodes))
> + return ERR_PTR(-EINVAL);

ditto

Thanks!

2008-02-11 18:45:25

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Monday 11 February 2008 16:30:32 David Rientjes wrote:
> The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
> and MPOL_INTERLEAVE, are better declared as part of an enum for type
> checking.

What type checking? There is none in standard C for enums.

-Andi

2008-02-11 19:27:36

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Mon, 11 Feb 2008, Andi Kleen wrote:

> > The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
> > and MPOL_INTERLEAVE, are better declared as part of an enum for type
> > checking.
>
> What type checking? There is none in standard C for enums.
>

"Type checking" probably isn't the best description for it. As I
mentioned in the changelog for the second patch in this series, a function
with a formal type of 'enum mempolicy_mode' indicates that the optional
mode flags have already been stripped off and the only possible values are
those of 'enum mempolicy_mode'. The implementation will not need to use
mpol_mode() in conditionals or switch statements. I think it's a clean
way of describing what is acting on modes and what is acting on flags.

Functions with a formal type of an 'int' contain both the mode and flags.

David

2008-02-11 19:32:29

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Mon, 11 Feb 2008, David Rientjes wrote:

> The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
> and MPOL_INTERLEAVE, are better declared as part of an enum for type
> checking.
>
> The policy member of struct mempolicy is also converted from type short
> to type unsigned short. A negative policy does not have any legitimate
> meaning, so it is possible to change its type in preparation for adding
> optional mode flags later.

The second paragraphs seems to indicate that such an approach does not
work since we also use MPOL_xx constants to set flags in the memory
policies?

2008-02-11 19:34:58

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

Looks conceptually good (I have not looked at the details).

2008-02-11 19:36:53

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

On Mon, 11 Feb 2008, Lee Schermerhorn wrote:

> These patches look good--well, interesting, anyway. I'm "off on
> assignment" this week, so I won't get to review in detail, merge and
> test them until next...
>

If, by "interesting", you mean that they give users more power in
setting up their mempolicies than they have ever had before, then I
agree.

> This helper functions introduced by this patch are similar in nature
> [but go further] to one I introduced in the reference counting cleanup
> RFC [http://marc.info/?l=linux-mm&m=119697614520515&w=4] I posted a
> while back. I've been holding these cleanup patches until Andrew starts
> accepting this sort of thing again. I have my series based atop Mel
> Gorman's [added to cc] "two zonelist" series, as it depends on removing
> the custom zonelist from the mempolicy.
>

If my helper functions are similar to yours then basing either of our
patchsets on top of the other should not be difficult.

> We need to sort out with Andrew, Mel, Paul, ... the order in which these
> interdependent changes go in. Given such an order, I'm willing to merge
> them all up, test them, and post them [after running checkpatch, of
> course].
>

Thanks for volunteering to test the changes. I don't know how many
patchsets are currently outstanding that touch mempolicies. So far we
have mine and the refcounting cleanup of yours that you mentioned.

I think the best way of dealing with it would be for the author of
whatever patchset is merged second to rebase off the current -mm just like
I based this entire patchset on your V3 contextualize_policy() patch from
a couple days ago.

> One other thing: In the recent mempolicy patch to "silently restrict
> nodemask], I mentioned the issue with regards to whether and when to
> "contextualize" tmpfs/hugetlbfs policies--if/when we fold
> mpol_check_policy() into mpol_new(), as you suggested. Once we can
> agree on the desired semantics, I had been thinking that an additional
> mode flag could be added to policies obtained from the superblock, and
> passed via mpol_shared_policy_init() [which calls mpol_new()] could be
> used for this purpose. Your change here seems to lay the foundation for
> implementing that.
>

My patchset already supports contextualized tmpfs mempolicies with a
template for how to specify them (see patch 4 in this series for the
documentation update). For example, mpol=interleave:1-3 is the equivalent
of MPOL_INTERLEAVE over nodes 1-3 while mpol=interleave=static:1-3 is the
equivalent of MPOL_INTERLEAVE | MPOL_F_STATIC_NODES.

David

2008-02-11 19:45:35

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Mon, 11 Feb 2008, Christoph Lameter wrote:

> > The policy member of struct mempolicy is also converted from type short
> > to type unsigned short. A negative policy does not have any legitimate
> > meaning, so it is possible to change its type in preparation for adding
> > optional mode flags later.
>
> The second paragraphs seems to indicate that such an approach does not
> work since we also use MPOL_xx constants to set flags in the memory
> policies?
>

Not sure I'm understanding your question, sorry.

Mempolicy modes have always been int constants because it doesn't make
sense to combine them. Putting them into 'enum mempolicy_mode' leaves
that unchanged.

Mempolicy flags can be combined (even though my patchset only currently
implements one, it's easy to implement others). So they definitely cannot
be enum constants.

Regardless, storing the policy (mode | flags) in struct mempolicy as a
'short' doesn't help since a negative policy doesn't mean anything. In
preparation for allowing the upper MPOL_FLAG_SHIFT bits to be used to
store the flags of this member, I converted it to 'unsigned short'. This
is because the API with userspace is through 'int', which is implicitly
signed, and we don't want to sign-extend the upper bit if it's ever used
to hold a mempolicy flag.

David

2008-02-11 19:49:17

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Mon, 11 Feb 2008, David Rientjes wrote:

> > The second paragraphs seems to indicate that such an approach does not
> > work since we also use MPOL_xx constants to set flags in the memory
> > policies?
> >
>
> Not sure I'm understanding your question, sorry.
>
> Mempolicy modes have always been int constants because it doesn't make
> sense to combine them. Putting them into 'enum mempolicy_mode' leaves
> that unchanged.

> Mempolicy flags can be combined (even though my patchset only currently
> implements one, it's easy to implement others). So they definitely cannot
> be enum constants.

> Regardless, storing the policy (mode | flags) in struct mempolicy as a
> 'short' doesn't help since a negative policy doesn't mean anything. In
> preparation for allowing the upper MPOL_FLAG_SHIFT bits to be used to
> store the flags of this member, I converted it to 'unsigned short'. This
> is because the API with userspace is through 'int', which is implicitly
> signed, and we don't want to sign-extend the upper bit if it's ever used
> to hold a mempolicy flag.

Then you could follow through with the enum mempolicy thing
throughout. Why not use enum mempolicy in struct mempolicy?

2008-02-11 19:58:12

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 12 Feb 2008, KOSAKI Motohiro wrote:

> > @@ -218,21 +167,27 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> > return ERR_PTR(-ENOMEM);
> > flags &= MPOL_MODE_FLAGS;
> > atomic_set(&policy->refcnt, 1);
> > + cpuset_update_task_memory_state();
> > + nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
> > switch (mode) {
> > case MPOL_INTERLEAVE:
> > - policy->v.nodes = *nodes;
> > + if (nodes_empty(*nodes))
> > + return ERR_PTR(-EINVAL);
>
> need kmem_cache_free(policy_cache, policy) before return?
>

Very good catch!



mempolicy: fix policy memory leak in mpol_new()

If mpol_new() cannot setup a new mempolicy because of an invalid argument
provided by the user, avoid leaking the mempolicy that has been dynamically
allocated.

Reported by KOSAKI Motohiro.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: KOSAKI Motohiro <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
mm/mempolicy.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -171,13 +171,11 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
switch (mode) {
case MPOL_INTERLEAVE:
- if (nodes_empty(*nodes))
- return ERR_PTR(-EINVAL);
- policy->v.nodes = cpuset_context_nmask;
- if (nodes_weight(policy->v.nodes) == 0) {
+ if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask)) {
kmem_cache_free(policy_cache, policy);
return ERR_PTR(-EINVAL);
}
+ policy->v.nodes = cpuset_context_nmask;
break;
case MPOL_PREFERRED:
policy->v.preferred_node = first_node(cpuset_context_nmask);
@@ -185,8 +183,10 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
policy->v.preferred_node = -1;
break;
case MPOL_BIND:
- if (nodes_empty(*nodes))
+ if (nodes_empty(*nodes)) {
+ kmem_cache_free(policy_cache, policy);
return ERR_PTR(-EINVAL);
+ }
policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
if (IS_ERR(policy->v.zonelist)) {
void *error_code = policy->v.zonelist;

2008-02-11 19:59:01

by Randy Dunlap

[permalink] [raw]
Subject: Re: [patch 4/4] mempolicy: update NUMA memory policy documentation

On Mon, 11 Feb 2008 07:30:35 -0800 (PST) David Rientjes wrote:

> Updates Documentation/vm/numa_memory_policy.txt and
> Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
> flags.
>
> Cc: Paul Jackson <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Lee Schermerhorn <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> Documentation/filesystems/tmpfs.txt | 11 ++++++++
> Documentation/vm/numa_memory_policy.txt | 41 +++++++++++++++++++++++++++----
> 2 files changed, 47 insertions(+), 5 deletions(-)
>

> diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
> --- a/Documentation/vm/numa_memory_policy.txt
> +++ b/Documentation/vm/numa_memory_policy.txt
> @@ -135,9 +135,11 @@ most general to most specific:
>
> Components of Memory Policies
>
> - A Linux memory policy is a tuple consisting of a "mode" and an optional set
> - of nodes. The mode determine the behavior of the policy, while the
> - optional set of nodes can be viewed as the arguments to the behavior.
> + A Linux memory policy consists of a "mode", optional mode flags, and an
> + optional set of nodes. The mode determine the behavior of the policy,

determines

> + the optional mode flags determine the behavior of the mode, and the
> + optional set of nodes can be viewed as the arguments to the policy
> + behavior.
>
> Internally, memory policies are implemented by a reference counted
> structure, struct mempolicy. Details of this structure will be discussed
> @@ -145,7 +147,12 @@ Components of Memory Policies
>
> Note: in some functions AND in the struct mempolicy itself, the mode
> is called "policy". However, to avoid confusion with the policy tuple,
> - this document will continue to use the term "mode".
> + this document will continue to use the term "mode". Since the mode and
> + optional mode flags are stored in the same struct mempolicy member
> + (specifically, pol->policy), you must use mpol_mode(pol->policy) to
> + access only the mode and mpol_flags(pol->policy) to access only the
> + flags. Any function with a formal of type enum mempolicy_mode only
> + refers to the mode.
>
> Linux memory policy supports the following 4 behavioral modes:
>
> @@ -231,6 +238,28 @@ Components of Memory Policies
> the temporary interleaved system default policy works in this
> mode.
>
> + Linux memory policy supports the following optional mode flag:
> +
> + MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by
> + the user should not be remapped if the task or VMA's set of accessible
> + nodes changes after the memory policy has been defined.
> +
> + Without this flag, anytime a mempolicy is rebound because of a
> + change in the set of accessible nodes, the node (Preferred) or
> + nodemask (Bind, Interleave) is remapped to the new set of
> + accessible nodes. This may result in nodes being used that were
> + previously undesired. With this flag, the policy is either
> + effected over the user's specified nodemask or the Default
> + behavior is used.
> +
> + For example, consider a task that is attached to a cpuset with
> + mems 1-3 that sets an Interleave policy over the same set. If
> + the cpuset's mems change to 3-5, the Interleave will now occur
> + over nodes 3, 4, and 5. With this flag, however, since only
> + node 3 is accessible from the user's nodemask, the "interleave"
> + only occurs over that node. If no nodes from the user's
> + nodemask are now accessible, the Default behavior is used.
> +
> MEMORY POLICY APIs
>
> Linux supports 3 system calls for controlling memory policy. These APIS
> @@ -251,7 +280,9 @@ Set [Task] Memory Policy:
> Set's the calling task's "task/process memory policy" to mode
> specified by the 'mode' argument and the set of nodes defined
> by 'nmask'. 'nmask' points to a bit mask of node ids containing
> - at least 'maxnode' ids.
> + at least 'maxnode' ids. Optional mode flags may be passed by
> + intersecting the 'mode' argument with the flag (for example:

or combining ? I'm used to intersection being more of an
and-op instead of an or-op, but maybe I'm missing something.

> + MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
>
> See the set_mempolicy(2) man page for more details

---
~Randy

2008-02-11 20:04:47

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Mon, 11 Feb 2008, Christoph Lameter wrote:

> Then you could follow through with the enum mempolicy thing
> throughtout. Why not use enum mempolicy in struct mempolicy?
>

Mempolicy flags, as I implemented them in patch 2 in this series, are not
integer constants that are enumerated starting at 0. They are individual
bits that are shifted a pre-defined length and intersected with the
enumerated mode. This allows both the mode and the flags to be stored in
the same object.

Just because enum mempolicy_mode is the equivalent of passing an int in C
is irrelevant; its semantics are that the value is coming from enum
mempolicy_mode. That includes _only_ the mode itself:

enum mempolicy_mode {
MPOL_DEFAULT,
MPOL_BIND,
MPOL_PREFERRED,
MPOL_INTERLEAVE,
MPOL_MAX,
};

And changing the policy member of struct mempolicy to 'enum
mempolicy_mode' instead of 'unsigned short' would increase its size. Not
that it matters, since in the third patch I add a whole nodemask_t, but
it's simply unnecessary. Right now we have the capacity to store 256
individual mempolicy modes (we currently use four) and eight mempolicy
flags with unsigned short.

David

2008-02-11 20:09:19

by David Rientjes

[permalink] [raw]
Subject: [patch 4/4 v2] mempolicy: update NUMA memory policy documentation

Updates Documentation/vm/numa_memory_policy.txt and
Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
flags.

Cc: Paul Jackson <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Randy Dunlap <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
Includes fixes to problems identified by Randy Dunlap.

Documentation/filesystems/tmpfs.txt | 11 ++++++++
Documentation/vm/numa_memory_policy.txt | 41 +++++++++++++++++++++++++++----
2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -92,6 +92,17 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
a range being two hyphen-separated decimal numbers, the smallest and
largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15

+It is possible to specify a static NodeList by appending '=static' to
+the memory policy mode in the mpol= argument. This will require that
+tasks or VMA's restricted to a subset of allowed nodes are only allowed
+to effect the memory policy over those nodes. No remapping of the
+NodeList when the policy is rebound, which is the default behavior, is
+allowed when '=static' is specified. For example:
+
+mpol=bind=static:NodeList will only allocate from each node in
+ the NodeList without remapping the
+ NodeList if the policy is rebound
+
Note that trying to mount a tmpfs with an mpol option will fail if the
running kernel does not support NUMA; and will fail if its nodelist
specifies a node which is not online. If your system relies on that
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -135,9 +135,11 @@ most general to most specific:

Components of Memory Policies

- A Linux memory policy is a tuple consisting of a "mode" and an optional set
- of nodes. The mode determine the behavior of the policy, while the
- optional set of nodes can be viewed as the arguments to the behavior.
+ A Linux memory policy consists of a "mode", optional mode flags, and an
+ optional set of nodes. The mode determines the behavior of the policy,
+ the optional mode flags determine the behavior of the mode, and the
+ optional set of nodes can be viewed as the arguments to the policy
+ behavior.

Internally, memory policies are implemented by a reference counted
structure, struct mempolicy. Details of this structure will be discussed
@@ -145,7 +147,12 @@ Components of Memory Policies

Note: in some functions AND in the struct mempolicy itself, the mode
is called "policy". However, to avoid confusion with the policy tuple,
- this document will continue to use the term "mode".
+ this document will continue to use the term "mode". Since the mode and
+ optional mode flags are stored in the same struct mempolicy member
+ (specifically, pol->policy), you must use mpol_mode(pol->policy) to
+ access only the mode and mpol_flags(pol->policy) to access only the
+ flags. Any function with a formal of type enum mempolicy_mode only
+ refers to the mode.

Linux memory policy supports the following 4 behavioral modes:

@@ -231,6 +238,28 @@ Components of Memory Policies
the temporary interleaved system default policy works in this
mode.

+ Linux memory policy supports the following optional mode flag:
+
+ MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by
+ the user should not be remapped if the task or VMA's set of accessible
+ nodes changes after the memory policy has been defined.
+
+ Without this flag, anytime a mempolicy is rebound because of a
+ change in the set of accessible nodes, the node (Preferred) or
+ nodemask (Bind, Interleave) is remapped to the new set of
+ accessible nodes. This may result in nodes being used that were
+ previously undesired. With this flag, the policy is either
+ effected over the user's specified nodemask or the Default
+ behavior is used.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 1-3 that sets an Interleave policy over the same set. If
+ the cpuset's mems change to 3-5, the Interleave will now occur
+ over nodes 3, 4, and 5. With this flag, however, since only
+ node 3 is accessible from the user's nodemask, the "interleave"
+ only occurs over that node. If no nodes from the user's
+ nodemask are now accessible, the Default behavior is used.
+
MEMORY POLICY APIs

Linux supports 3 system calls for controlling memory policy. These APIS
@@ -251,7 +280,9 @@ Set [Task] Memory Policy:
Set's the calling task's "task/process memory policy" to mode
specified by the 'mode' argument and the set of nodes defined
by 'nmask'. 'nmask' points to a bit mask of node ids containing
- at least 'maxnode' ids.
+ at least 'maxnode' ids. Optional mode flags may be passed by
+ combining the 'mode' argument with the flag (for example:
+ MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).

See the set_mempolicy(2) man page for more details

2008-02-11 20:17:47

by Randy Dunlap

[permalink] [raw]
Subject: Re: [patch 4/4 v2] mempolicy: update NUMA memory policy documentation

David Rientjes wrote:
> Updates Documentation/vm/numa_memory_policy.txt and
> Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
> flags.
>
> Cc: Paul Jackson <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Lee Schermerhorn <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: Randy Dunlap <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>

Acked-by: Randy Dunlap <[email protected]>

Thanks.

> ---
> Includes fixes to problems identified by Randy Dunlap.
>
> Documentation/filesystems/tmpfs.txt | 11 ++++++++
> Documentation/vm/numa_memory_policy.txt | 41 +++++++++++++++++++++++++++----
> 2 files changed, 47 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
> --- a/Documentation/filesystems/tmpfs.txt
> +++ b/Documentation/filesystems/tmpfs.txt
> @@ -92,6 +92,17 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
> a range being two hyphen-separated decimal numbers, the smallest and
> largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15
>
> +It is possible to specify a static NodeList by appending '=static' to
> +the memory policy mode in the mpol= argument. This will require that
> +tasks or VMA's restricted to a subset of allowed nodes are only allowed
> +to effect the memory policy over those nodes. No remapping of the
> +NodeList when the policy is rebound, which is the default behavior, is
> +allowed when '=static' is specified. For example:
> +
> +mpol=bind=static:NodeList will only allocate from each node in
> + the NodeList without remapping the
> + NodeList if the policy is rebound
> +
> Note that trying to mount a tmpfs with an mpol option will fail if the
> running kernel does not support NUMA; and will fail if its nodelist
> specifies a node which is not online. If your system relies on that
> diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
> --- a/Documentation/vm/numa_memory_policy.txt
> +++ b/Documentation/vm/numa_memory_policy.txt
> @@ -135,9 +135,11 @@ most general to most specific:
>
> Components of Memory Policies
>
> - A Linux memory policy is a tuple consisting of a "mode" and an optional set
> - of nodes. The mode determine the behavior of the policy, while the
> - optional set of nodes can be viewed as the arguments to the behavior.
> + A Linux memory policy consists of a "mode", optional mode flags, and an
> + optional set of nodes. The mode determines the behavior of the policy,
> + the optional mode flags determine the behavior of the mode, and the
> + optional set of nodes can be viewed as the arguments to the policy
> + behavior.
>
> Internally, memory policies are implemented by a reference counted
> structure, struct mempolicy. Details of this structure will be discussed
> @@ -145,7 +147,12 @@ Components of Memory Policies
>
> Note: in some functions AND in the struct mempolicy itself, the mode
> is called "policy". However, to avoid confusion with the policy tuple,
> - this document will continue to use the term "mode".
> + this document will continue to use the term "mode". Since the mode and
> + optional mode flags are stored in the same struct mempolicy member
> + (specifically, pol->policy), you must use mpol_mode(pol->policy) to
> + access only the mode and mpol_flags(pol->policy) to access only the
> + flags. Any function with a formal of type enum mempolicy_mode only
> + refers to the mode.
>
> Linux memory policy supports the following 4 behavioral modes:
>
> @@ -231,6 +238,28 @@ Components of Memory Policies
> the temporary interleaved system default policy works in this
> mode.
>
> + Linux memory policy supports the following optional mode flag:
> +
> + MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by
> + the user should not be remapped if the task or VMA's set of accessible
> + nodes changes after the memory policy has been defined.
> +
> + Without this flag, anytime a mempolicy is rebound because of a
> + change in the set of accessible nodes, the node (Preferred) or
> + nodemask (Bind, Interleave) is remapped to the new set of
> + accessible nodes. This may result in nodes being used that were
> + previously undesired. With this flag, the policy is either
> + effected over the user's specified nodemask or the Default
> + behavior is used.
> +
> + For example, consider a task that is attached to a cpuset with
> + mems 1-3 that sets an Interleave policy over the same set. If
> + the cpuset's mems change to 3-5, the Interleave will now occur
> + over nodes 3, 4, and 5. With this flag, however, since only
> + node 3 is accessible from the user's nodemask, the "interleave"
> + only occurs over that node. If no nodes from the user's
> + nodemask are now accessible, the Default behavior is used.
> +
> MEMORY POLICY APIs
>
> Linux supports 3 system calls for controlling memory policy. These APIS
> @@ -251,7 +280,9 @@ Set [Task] Memory Policy:
> Set's the calling task's "task/process memory policy" to mode
> specified by the 'mode' argument and the set of nodes defined
> by 'nmask'. 'nmask' points to a bit mask of node ids containing
> - at least 'maxnode' ids.
> + at least 'maxnode' ids. Optional mode flags may be passed by
> + combining the 'mode' argument with the flag (for example:
> + MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
>
> See the set_mempolicy(2) man page for more details
>


--
~Randy

2008-02-11 20:45:55

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Mon, 11 Feb 2008, David Rientjes wrote:

> On Mon, 11 Feb 2008, Christoph Lameter wrote:
>
> > Then you could follow through with the enum mempolicy thing
> > throughtout. Why not use enum mempolicy in struct mempolicy?
> >
>
> Mempolicy flags, as I implemented them in patch 2 in this series, are not
> integer constants that are enumerated starting at 0. They are individual
> bits that are shifted a pre-defined length and intersected with the
> enumerated mode. This allows both the mode and the flags to be stored in
> the same object.

Ok. Now are you are making the same point that I did before. Lets leave
enum out if we need to do these tricks with the upper bits.

2008-02-11 20:55:55

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

Lee wrote:
> We need to sort out with Andrew, Mel, Paul, ... the order in which these
> interdependent changes go in. Given such an order, I'm willing to merge
> them all up, test them, and post them [after running checkpatch, of
> course].

These patches look like good stuff at first glance. Thanks, David.

If things go as I hope, I expect to spend a couple of days this week
reviving my earlier patch RFC that a couple of you on this cc list saw,
concerning how nodes are numbered in mempolicy nodemasks. Certainly
the work being done in these various recent patches will affect my
patch ... it's in the same code.

David or Lee -- could you recommend to me which of these various patches
of late I should apply first in my workarea, in what order, before I
add my patch? After fixing up conflicts between my earlier patch and
these good patches of yours, I'll try to do some testing of the code
paths that my patch most likely affected, to ensure that I don't break
any existing code. I'll also probably have some more detailed review
comments on these patches, as I work through them and resolve conflicts
between them and my patch.

For lurkers who are wondering just what my earlier RFC patch did,
I'm avoiding explaining that here. It will take a bit of careful
explanation, or else things get rather confused. My goal is to propose
my nodemask numbering patch by the end of this week, as an RFC,
publicly and clearly presented, and built and tested (somewhat) on top
of these other patches. My nodemask numbering patch is more of a new
feature than a code cleanup, so probably should end up going in after
these other patches.

Then if Lee can add his testing, and after Lee and/or David post
these patches, they can either push mine too, or I can follow up with
a final PATCH that I recommend for *-mm, based on feedback from my
RFC of this week, that applies on top of this good work.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-11 21:54:55

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

On Mon, 11 Feb 2008, Paul Jackson wrote:

> If things go as I hope, I expect to spend a couple of days this week
> reviving my earlier patch RFC that a couple of you on this cc list saw,
> concerning how nodes are numbered in mempolicy nodemasks. Certainly
> the work being done in these various recent patches will affect my
> patch ... it's in the same code.
>

With my patchset I don't believe there is any ambiguity in how nodes are
numbered anymore.

The node remap is suppressed on policy rebind if MPOL_F_STATIC_NODES is
passed by the user either on mbind() or set_mempolicy()[*]. So in that
case, the user is always referring to absolute physical node numbers;
without the flag, the user is referring to relative node numbers since it
can be remapped when the mempolicy context changes.

[*] Or with tmpfs by appending '=static' to the policy mode at mount
time.

> David or Lee -- could you recommend to me which of these various patches
> of late I should apply first in my workarea, in what order, before I
> add my patch?

That's a good question; when I prepared this patchset I simply based it
off Linus' latest git + V3 of Lee's contextualize_policy() patch that was
posted to LKML on February 8:

http://marc.info/?l=linux-kernel&m=120250000001182

Lee has talked about another patchset that he hasn't posted (at least not
lately) based on Mel's two zonelist work. I'm not sure of the status of
that right now and since Lee is idle this week I would recommend applying
V3 of his patch plus my five (there was a fifth patch posted in response
to KOSAKI Motohiro's findings of a mpol_new() memory leak:

http://marc.info/?l=linux-kernel&m=120276002725325

> After fixing up conflicts between my earlier patch and
> these good patches of yours, I'll try to do some testing of the code
> paths that my patch most likely affected, to ensure that I don't break
> any existing code. I'll also probably have some more detailed review
> comments on these patches, as I work through them and resolve conflicts
> between them and my patch.
>

Sounds good.

David

2008-02-11 21:57:34

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

David wrote:
> With my patchset I don't believe there is any ambiguity in how nodes are
> numbered anymore.

I'll have to study your patchset more closely tomorrow then.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-12 15:31:41

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

On Mon, 2008-02-11 at 11:34 -0800, David Rientjes wrote:
> On Mon, 11 Feb 2008, Lee Schermerhorn wrote:
>
> > These patches look good--well, interesting, anyway. I'm "off on
> > assignment" this week, so I won't get to review in detail, merge and
> > test them until next...
> >
>
> If, by "interesting", you mean that they give the most power to the user
> in setting up their mempolicies than they have ever had before, then I
> agree.

Hi David: to clarify: I added the "interesting" because I didn't want
the "look good" to be interpreted as an informed judgement. I hadn't
time to review in detail. I do have some comments, which I'll send in
response to the original patch messages [or any later posting thereof,
should that occur before I have time].

>
> > This helper functions introduced by this patch are similar in nature
> > [but go further] to one I introduced in the reference counting cleanup
> > RFC [http://marc.info/?l=linux-mm&m=119697614520515&w=4] I posted a
> > while back. I've been holding these cleanup patches until Andrew starts
> > accepting this sort of thing again. I have my series based atop Mel
> > Gorman's [added to cc] "two zonelist" series, as it depends on removing
> > the custom zonelist from the mempolicy.
> >
>
> If my helper functions are similar to yours then basing either of our
> patchsets on top of the other should not be difficult.
>
> > We need to sort out with Andrew, Mel, Paul, ... the order in which these
> > interdependent changes go in. Given such an order, I'm willing to merge
> > them all up, test them, and post them [after running checkpatch, of
> > course].
> >
>
> Thanks for volunteering to test the changes. I don't know how many
> patchsets are currently outstanding that touch mempolicies. So far we
> have mine and the refcounting cleanup of yours that you mentioned.
>
> I think the best way of dealing with it would be for the author of
> whatever patchset is merged second to rebase off the current -mm just like
> I based this entire patchset on your V3 contextualize_policy() patch from
> a couple days ago.
>
> > One other thing: In the recent mempolicy patch to "silently restrict
> > nodemask", I mentioned the issue with regards to whether and when to
> > "contextualize" tmpfs/hugetlbfs policies--if/when we fold
> > mpol_check_policy() into mpol_new(), as you suggested. Once we can
> > agree on the desired semantics, I had been thinking that an additional
> > mode flag could be added to policies obtained from the superblock, and
> > passed via mpol_shared_policy_init() [which calls mpol_new()] could be
> > used for this purpose. Your change here seems to lay the foundation for
> > implementing that.
> >
>
> My patchset already supports contextualized tmpfs mempolicies with a
> template for how to specify them (see patch 4 in this series for the
> documentation update). For example, mpol=interleave:1-3 is the equivalent
> of MPOL_INTERLEAVE over nodes 1-3 while mpol=interleave=static:1-3 is the
> equivalent of MPOL_INTERLEAVE | MPOL_F_STATIC_NODES.

Hmmm, so 'static' means "don't contextualize"--i.e., don't mask off
disallowed or memoryless nodes? That means we'll need to skip them in
the interleave node calculation in the allocation path. Perhaps your
patch already addresses this, but I haven't had time to look.

Later,
Lee

2008-02-12 19:18:41

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

On Tue, 12 Feb 2008, Lee Schermerhorn wrote:

> Hmmm, so 'static' means "don't contextualize"--i.e., don't mask off
> disallowed or memoryless nodes? That means we'll need to skip them in
> the interleave node calculation in the allocation path. Perhaps your
> patch already addresses this, but I haven't had time to look.
>

All MPOL_F_STATIC_NODES does (by mbind(), set_mempolicy(), or adding
'=static' to the mempolicy mode on tmpfs mount) is suppress the node remap
in mpol_rebind_policy().

In mpol_new(), the intent of the user is stored in a new nodemask_t member
of struct mempolicy and in the static case the passed nodemask is
intersected with that member. The policy is then effected over the
intersection.

If that nodemask includes no accessible nodes, then the mempolicy is not
effected but rather lies dormant until access to those nodes is attained.
If and when that happens, the mempolicy will then be effected again
without any additional set_mempolicy() or mbind() from the user.

David

2008-02-13 00:10:43

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Mon, 2008-02-11 at 07:30 -0800, David Rientjes wrote:
> The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
> and MPOL_INTERLEAVE, are better declared as part of an enum for type
> checking.
>
> The policy member of struct mempolicy is also converted from type short
> to type unsigned short. A negative policy does not have any legitimate
> meaning, so it is possible to change its type in preparation for adding
> optional mode flags later.
>
> The equivalent member of struct shmem_sb_info is also changed from int
> to unsigned short.
>
> For compatibility, the policy formal to get_mempolicy() remains as a
> pointer to an int:
>
> int get_mempolicy(int *policy, unsigned long *nmask,
> unsigned long maxnode, unsigned long addr,
> unsigned long flags);
>
> although the only possible values are in the range of type unsigned short.
>
> Cc: Paul Jackson <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Lee Schermerhorn <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> Applies on top of V3 of Lee Schermerhorn's mempolicy patch posted to
> LKML on February 8.
>
> include/linux/mempolicy.h | 21 +++++++++++----------
> include/linux/shmem_fs.h | 2 +-
> mm/mempolicy.c | 29 +++++++++++++++++------------
> mm/shmem.c | 11 ++++++-----
> 4 files changed, 35 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -9,12 +9,13 @@
> */
>
> /* Policies */
> -#define MPOL_DEFAULT 0
> -#define MPOL_PREFERRED 1
> -#define MPOL_BIND 2
> -#define MPOL_INTERLEAVE 3
> -
> -#define MPOL_MAX MPOL_INTERLEAVE
> +enum mempolicy_mode {
> + MPOL_DEFAULT,
> + MPOL_PREFERRED,
> + MPOL_BIND,
> + MPOL_INTERLEAVE,
> + MPOL_MAX, /* always last member of mempolicy_mode */
> +};
>
> /* Flags for get_mem_policy */
> #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
> @@ -62,7 +63,7 @@ struct mm_struct;
> */
> struct mempolicy {
> atomic_t refcnt;
> - short policy; /* See MPOL_* above */
> + unsigned short policy; /* See MPOL_* above */
> union {
> struct zonelist *zonelist; /* bind */
> short preferred_node; /* preferred */
> @@ -133,8 +134,8 @@ struct shared_policy {
> spinlock_t lock;
> };
>
> -void mpol_shared_policy_init(struct shared_policy *info, int policy,
> - nodemask_t *nodes);
> +void mpol_shared_policy_init(struct shared_policy *info,
> + enum mempolicy_mode policy, nodemask_t *nodes);

Perhaps just my own myopia, but I don't see the benefit of this change;
especially when taken with the subsequent patch [2/4]. The only place
we pass around a "naked" policy member [outside of a mempolicy struct]
is between the sys call interface [and shmem_sb_info] and the
mpol_check_policy()/mpol_new() functions. In the subsequent patch,
you'll add the optional flags to this member and this type change either
requires a separate argument [as you've done] or requires undoing this
change [as I'd like to see].

Why not leave it as an int. Will simplify this and subsequent patch.
See comments there.

Other than that, I'm fine with this patch.

> int mpol_set_shared_policy(struct shared_policy *info,
> struct vm_area_struct *vma,
> struct mempolicy *new);
> @@ -200,7 +201,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
> }
>
> static inline void mpol_shared_policy_init(struct shared_policy *info,
> - int policy, nodemask_t *nodes)
> + enum mempolicy_type policy, nodemask_t *nodes)
> {
> }
>
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -34,7 +34,7 @@ struct shmem_sb_info {
> uid_t uid; /* Mount uid for root directory */
> gid_t gid; /* Mount gid for root directory */
> mode_t mode; /* Mount mode for root directory */
> - int policy; /* Default NUMA memory alloc policy */
> + unsigned short policy; /* Default NUMA memory alloc policy */
> nodemask_t policy_nodes; /* nodemask for preferred and bind */
> };
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -114,7 +114,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> const nodemask_t *newmask);
>
> /* Do sanity checking on a policy */
> -static int mpol_check_policy(int mode, nodemask_t *nodes)
> +static int mpol_check_policy(enum mempolicy_mode mode, nodemask_t *nodes)
> {
> int was_empty, is_empty;
>
> @@ -159,6 +159,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
> if (!was_empty && is_empty)
> return -EINVAL;
> break;
> + default:
> + BUG();
> }
> return 0;
> }
> @@ -201,7 +203,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
> }
>
> /* Create a new policy */
> -static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
> +static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> {
> struct mempolicy *policy;
>
> @@ -235,6 +237,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
> return error_code;
> }
> break;
> + default:
> + BUG();
> }
> policy->policy = mode;
> policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
> @@ -479,7 +483,7 @@ static void mpol_set_task_struct_flag(void)
> }
>
> /* Set the process memory policy */
> -static long do_set_mempolicy(int mode, nodemask_t *nodes)
> +static long do_set_mempolicy(enum mempolicy_mode mode, nodemask_t *nodes)
> {
> struct mempolicy *new;
>
> @@ -781,7 +785,7 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
> #endif
>
> static long do_mbind(unsigned long start, unsigned long len,
> - unsigned long mode, nodemask_t *nmask,
> + enum mempolicy_mode mode, nodemask_t *nmask,
> unsigned long flags)
> {
> struct vm_area_struct *vma;
> @@ -791,9 +795,8 @@ static long do_mbind(unsigned long start, unsigned long len,
> int err;
> LIST_HEAD(pagelist);
>
> - if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
> + if (flags & ~(unsigned long)(MPOL_MF_STRICT |
> MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
> - || mode > MPOL_MAX)
> return -EINVAL;
> if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
> return -EPERM;
> @@ -826,7 +829,7 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (!new)
> flags |= MPOL_MF_DISCONTIG_OK;
>
> - pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
> + pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
> mode, nmask ? nodes_addr(*nmask)[0] : -1);
>
> down_write(&mm->mmap_sem);
> @@ -927,6 +930,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
> nodemask_t nodes;
> int err;
>
> + if (mode >= MPOL_MAX)
> + return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> @@ -940,7 +945,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
> int err;
> nodemask_t nodes;
>
> - if (mode < 0 || mode > MPOL_MAX)
> + if (mode < 0 || mode >= MPOL_MAX)
> return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> @@ -1206,7 +1211,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> */
> unsigned slab_node(struct mempolicy *policy)
> {
> - int pol = policy ? policy->policy : MPOL_DEFAULT;
> + enum mempolicy_mode pol = policy ? policy->policy : MPOL_DEFAULT;
>
> switch (pol) {
> case MPOL_INTERLEAVE:
> @@ -1634,8 +1639,8 @@ restart:
> return 0;
> }
>
> -void mpol_shared_policy_init(struct shared_policy *info, int policy,
> - nodemask_t *policy_nodes)
> +void mpol_shared_policy_init(struct shared_policy *info,
> + enum mempolicy_mode policy, nodemask_t *policy_nodes)
> {
> info->root = RB_ROOT;
> spin_lock_init(&info->lock);
> @@ -1853,7 +1858,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
> char *p = buffer;
> int l;
> nodemask_t nodes;
> - int mode = pol ? pol->policy : MPOL_DEFAULT;
> + enum mempolicy_mode mode = pol ? pol->policy : MPOL_DEFAULT;
>
> switch (mode) {
> case MPOL_DEFAULT:
> diff --git a/mm/shmem.c b/mm/shmem.c
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1082,7 +1082,8 @@ redirty:
>
> #ifdef CONFIG_NUMA
> #ifdef CONFIG_TMPFS
> -static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
> +static int shmem_parse_mpol(char *value, unsigned short *policy,
> + nodemask_t *policy_nodes)
> {
> char *nodelist = strchr(value, ':');
> int err = 1;
> @@ -1131,7 +1132,7 @@ out:
> return err;
> }
>
> -static void shmem_show_mpol(struct seq_file *seq, int policy,
> +static void shmem_show_mpol(struct seq_file *seq, enum mempolicy_mode policy,
> const nodemask_t policy_nodes)
> {
> char *policy_string;
> @@ -1200,14 +1201,14 @@ static struct page *shmem_alloc_page(gfp_t gfp,
> }
> #else /* !CONFIG_NUMA */
> #ifdef CONFIG_TMPFS
> -static inline int shmem_parse_mpol(char *value, int *policy,
> +static inline int shmem_parse_mpol(char *value, unsigned short *policy,
> nodemask_t *policy_nodes)
> {
> return 1;
> }
>
> -static inline void shmem_show_mpol(struct seq_file *seq, int policy,
> - const nodemask_t policy_nodes)
> +static inline void shmem_show_mpol(struct seq_file *seq,
> + enum mempolicy_mode policy, const nodemask_t policy_nodes)
> {
> }
> #endif /* CONFIG_TMPFS */

2008-02-13 00:14:36

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

On Mon, 2008-02-11 at 07:30 -0800, David Rientjes wrote:
> With the evolution of mempolicies, it is necessary to support mempolicy
> mode flags that specify how the policy shall behave in certain
> circumstances. The most immediate need for mode flag support is to
> suppress remapping the nodemask of a policy at the time of rebind.
>
> With the small number of possible mempolicy modes (currently four) and
> the large size of the struct mempolicy member that stores this mode
> (unsigned short), it is possible to store both the mode and optional mode
> flags in the same member.
>
> To do this, it is necessary to mask off the optional mode flag bits when
> testing the mempolicy mode. A new constant, MPOL_FLAG_SHIFT, indicates
> the shift necessary to find the flag bits within struct mempolicy's
> policy member. With this, MPOL_MODE_MASK can be defined to mask off the
> optional flags, yielding just the mode itself.
>
> A helper function:
>
> unsigned char mpol_mode(unsigned short mode)
>
> has been implemented to return only a policy's mode. Notice that the
> return type is unsigned char since MPOL_FLAG_SHIFT is currently defined
> at eight. mpol_flags(unsigned short mode) does the inverse.
>
> While this requires frequent use of mpol_mode() within the internal
> mempolicy code, it is easy to distinguish between actuals that contain
> only modes and those that contain the entire policy member (modes and
> flags). If the formal uses enum mempolicy_mode, only the mode itself
> may successfully be passed because of type checking.

What is the benefit of pulling the flags and mode apart at the user
interface, passing them as separate args to mpol_new(), do_* and
mpol_shared_policy_init() and then stitching them back together in
mpol_new()? Modes passed in via [sys_]{set_mempolicy|mbind}() and
those stored in the shmem_sb_info can already have the flags there,
so this seems like extra work.

I think this patch and the previous one [1/4] would be simplified if the
formal parameters were left as an int [maybe, unsigned] and the flags
were masked off when necessary in mpol_new() and elsewhere.

[more comments below]

>
> Note that this does not break userspace code that relies on
> get_mempolicy(&policy, ...) and either 'switch (policy)' statements or
> 'if (policy == MPOL_INTERLEAVE)' statements. Such applications would
> need to use optional mode flags when calling set_mempolicy() or mbind()
> for these previously implemented statements to stop working. If an
> application does start using optional mode flags, it will need to use
> mpol_mode() in switch and conditional statements that only test mode.

> Cc: Paul Jackson <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Lee Schermerhorn <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> fs/hugetlbfs/inode.c | 2 +-
> include/linux/mempolicy.h | 39 +++++++++++++++++++++--
> mm/mempolicy.c | 77 ++++++++++++++++++++++++++------------------
> mm/shmem.c | 15 ++++++--
> 4 files changed, 93 insertions(+), 40 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
> inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> INIT_LIST_HEAD(&inode->i_mapping->private_list);
> info = HUGETLBFS_I(inode);
> - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
> + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
> switch (mode & S_IFMT) {
> default:
> init_special_inode(inode, mode, dev);
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -17,7 +17,18 @@ enum mempolicy_mode {
> MPOL_MAX, /* always last member of mempolicy_mode */
> };
>
> -/* Flags for get_mem_policy */
> +/*
> + * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
> + * constants defined in enum mempolicy_mode. The upper bits represent
> + * optional set_mempolicy() MPOL_F_* mode flags.
> + */
> +#define MPOL_FLAG_SHIFT (8)
> +#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
> +
> +/* Flags for set_mempolicy */
> +#define MPOL_MODE_FLAGS (0) /* legal set_mempolicy() MPOL_* mode flags */
> +
> +/* Flags for get_mempolicy */
> #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
> #define MPOL_F_ADDR (1<<1) /* look up vma using address */
> #define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
> @@ -115,6 +126,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
>
> #define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
>
> +static inline unsigned char mpol_mode(unsigned short mode)
> +{
> + return mode & MPOL_MODE_MASK;
> +}
> +
> +static inline unsigned short mpol_flags(unsigned short mode)
> +{
> + return mode & ~MPOL_MODE_MASK;
> +}
> +
> /*
> * Tree of shared policies for a shared memory region.
> * Maintain the policies in a pseudo mm that contains vmas. The vmas
> @@ -135,7 +156,8 @@ struct shared_policy {
> };
>
> void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_mode policy, nodemask_t *nodes);
> + enum mempolicy_mode policy, unsigned short flags,
> + nodemask_t *nodes);
> int mpol_set_shared_policy(struct shared_policy *info,
> struct vm_area_struct *vma,
> struct mempolicy *new);
> @@ -176,6 +198,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
> }
> #define vma_mpol_equal(a,b) 1
>
> +static inline unsigned char mpol_mode(unsigned short mode)
> +{
> + return 0;
> +}
> +
> +static inline unsigned short mpol_flags(unsigned short mode)
> +{
> + return 0;
> +}
> +
> #define mpol_set_vma_default(vma) do {} while(0)
>
> static inline void mpol_free(struct mempolicy *p)
> @@ -201,7 +233,8 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
> }
>
> static inline void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_type policy, nodemask_t *nodes)
> + enum mempolicy_type policy, unsigned short flags,
> + nodemask_t *nodes)
> {
> }
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -203,18 +203,20 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
> }
>
> /* Create a new policy */
> -static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> +static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> + unsigned short flags, nodemask_t *nodes)
> {
> struct mempolicy *policy;
>
> - pr_debug("setting mode %d nodes[0] %lx\n",
> - mode, nodes ? nodes_addr(*nodes)[0] : -1);
> + pr_debug("setting mode %d flags %d nodes[0] %lx\n",
> + mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
>
> if (mode == MPOL_DEFAULT)
> return NULL;
> policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
> if (!policy)
> return ERR_PTR(-ENOMEM);
> + flags &= MPOL_MODE_FLAGS;
> atomic_set(&policy->refcnt, 1);
> switch (mode) {
> case MPOL_INTERLEAVE:
> @@ -240,7 +242,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> default:
> BUG();
> }
> - policy->policy = mode;
> + policy->policy = mode | flags;
> policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
> return policy;
> }
> @@ -483,19 +485,20 @@ static void mpol_set_task_struct_flag(void)
> }
>
> /* Set the process memory policy */
> -static long do_set_mempolicy(enum mempolicy_mode mode, nodemask_t *nodes)
> +static long do_set_mempolicy(enum mempolicy_mode mode, unsigned short flags,
> + nodemask_t *nodes)
> {
> struct mempolicy *new;
>
> if (mpol_check_policy(mode, nodes))
> return -EINVAL;
> - new = mpol_new(mode, nodes);
> + new = mpol_new(mode, flags, nodes);
> if (IS_ERR(new))
> return PTR_ERR(new);
> mpol_free(current->mempolicy);
> current->mempolicy = new;
> mpol_set_task_struct_flag();
> - if (new && new->policy == MPOL_INTERLEAVE)
> + if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE)
> current->il_next = first_node(new->v.nodes);
> return 0;
> }
> @@ -506,7 +509,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
> int i;
>
> nodes_clear(*nodes);
> - switch (p->policy) {
> + switch (mpol_mode(p->policy)) {
> case MPOL_BIND:
> for (i = 0; p->v.zonelist->zones[i]; i++)
> node_set(zone_to_nid(p->v.zonelist->zones[i]),
> @@ -588,7 +591,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> goto out;
> *policy = err;
> } else if (pol == current->mempolicy &&
> - pol->policy == MPOL_INTERLEAVE) {
> + mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
> *policy = current->il_next;
> } else {
> err = -EINVAL;
> @@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
> #endif
>
> static long do_mbind(unsigned long start, unsigned long len,
> - enum mempolicy_mode mode, nodemask_t *nmask,
> - unsigned long flags)
> + enum mempolicy_mode mode, unsigned short mode_flags,
> + nodemask_t *nmask, unsigned long flags)
> {
> struct vm_area_struct *vma;
> struct mm_struct *mm = current->mm;
> @@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (mpol_check_policy(mode, nmask))
> return -EINVAL;
>
> - new = mpol_new(mode, nmask);
> + new = mpol_new(mode, mode_flags, nmask);
> if (IS_ERR(new))
> return PTR_ERR(new);
>
> @@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (!new)
> flags |= MPOL_MF_DISCONTIG_OK;
>
> - pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
> - mode, nmask ? nodes_addr(*nmask)[0] : -1);
> + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
> + start, start + len, mode, mode_flags,
> + nmask ? nodes_addr(*nmask)[0] : -1);
>
> down_write(&mm->mmap_sem);
> vma = check_range(mm, start, end, nmask,
> @@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
> {
> nodemask_t nodes;
> int err;
> + unsigned short mode_flags;
>
> + mode_flags = mpol_flags(mode);
>
>
I suggest restricting the flags to the defined flags here. This gives
us the ability to define new flags w/o breaking [changing behavior of]
applications that accidentally pass undefined flags. An error return
forces the user to fix the application [assuming they check error
returns].

> + mode = mpol_mode(mode);
> if (mode >= MPOL_MAX)
> return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> - return do_mbind(start, len, mode, &nodes, flags);
> + return do_mbind(start, len, mode, mode_flags, &nodes, flags);
> }
>
> /* Set the process memory policy */
> @@ -944,13 +951,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
> {
> int err;
> nodemask_t nodes;
> + unsigned short flags;
>
> + flags = mpol_flags(mode);

Suggest similar arg checking here.

> + mode = mpol_mode(mode);
> if (mode < 0 || mode >= MPOL_MAX)

Now that we're extracting an unsigned char from the mode via
mpol_mode(), I think we can drop the 'mode < 0 ||'. This would also be
the case if we didn't pull the flags and mode apart and just used:

if (mpol_mode(mode) >= MPOL_MAX)
...

> return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> - return do_set_mempolicy(mode, &nodes);
> + return do_set_mempolicy(mode, flags, &nodes);
> }
>
> asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
> @@ -1152,7 +1162,7 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
> pol = vma->vm_ops->get_policy(vma, addr);
> shared_pol = 1; /* if pol non-NULL, add ref below */
> } else if (vma->vm_policy &&
> - vma->vm_policy->policy != MPOL_DEFAULT)
> + mpol_mode(vma->vm_policy->policy) != MPOL_DEFAULT)
> pol = vma->vm_policy;
> }
> if (!pol)
> @@ -1167,7 +1177,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
> {
> int nd;
>
> - switch (policy->policy) {
> + switch (mpol_mode(policy->policy)) {
> case MPOL_PREFERRED:
> nd = policy->v.preferred_node;
> if (nd < 0)
> @@ -1211,7 +1221,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> */
> unsigned slab_node(struct mempolicy *policy)
> {
> - enum mempolicy_mode pol = policy ? policy->policy : MPOL_DEFAULT;
> + enum mempolicy_mode pol = policy ? mpol_mode(policy->policy) :
> + MPOL_DEFAULT;
>
> switch (pol) {
> case MPOL_INTERLEAVE:
> @@ -1297,7 +1308,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
> struct zonelist *zl;
>
> *mpol = NULL; /* probably no unref needed */
> - if (pol->policy == MPOL_INTERLEAVE) {
> + if (mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
> unsigned nid;
>
> nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
> @@ -1307,7 +1318,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
>
> zl = zonelist_policy(GFP_HIGHUSER, pol);
> if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
> - if (pol->policy != MPOL_BIND)
> + if (mpol_mode(pol->policy) != MPOL_BIND)
> __mpol_free(pol); /* finished with pol */
> else
> *mpol = pol; /* unref needed after allocation */
> @@ -1361,7 +1372,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
>
> cpuset_update_task_memory_state();
>
> - if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
> + if (unlikely(mpol_mode(pol->policy) == MPOL_INTERLEAVE)) {
> unsigned nid;
>
> nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
> @@ -1409,7 +1420,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
> cpuset_update_task_memory_state();
> if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
> pol = &default_policy;
> - if (pol->policy == MPOL_INTERLEAVE)
> + if (mpol_mode(pol->policy) == MPOL_INTERLEAVE)
> return alloc_page_interleave(gfp, order, interleave_nodes(pol));
> return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
> }
> @@ -1436,7 +1447,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
> }
> *new = *old;
> atomic_set(&new->refcnt, 1);
> - if (new->policy == MPOL_BIND) {
> + if (mpol_mode(new->policy) == MPOL_BIND) {
> int sz = ksize(old->v.zonelist);
> new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
> if (!new->v.zonelist) {
> @@ -1454,7 +1465,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
> return 0;
> if (a->policy != b->policy)
> return 0;
> - switch (a->policy) {
> + switch (mpol_mode(a->policy)) {
> case MPOL_DEFAULT:
> return 1;
> case MPOL_INTERLEAVE:
> @@ -1479,7 +1490,7 @@ void __mpol_free(struct mempolicy *p)
> {
> if (!atomic_dec_and_test(&p->refcnt))
> return;
> - if (p->policy == MPOL_BIND)
> + if (mpol_mode(p->policy) == MPOL_BIND)
> kfree(p->v.zonelist);
> p->policy = MPOL_DEFAULT;
> kmem_cache_free(policy_cache, p);
> @@ -1640,7 +1651,8 @@ restart:
> }
>
> void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_mode policy, nodemask_t *policy_nodes)
> + enum mempolicy_mode policy, unsigned short flags,
> + nodemask_t *policy_nodes)
> {
> info->root = RB_ROOT;
> spin_lock_init(&info->lock);
> @@ -1649,7 +1661,7 @@ void mpol_shared_policy_init(struct shared_policy *info,
> struct mempolicy *newpol;
>
> /* Falls back to MPOL_DEFAULT on any error */
> - newpol = mpol_new(policy, policy_nodes);
> + newpol = mpol_new(policy, flags, policy_nodes);
> if (!IS_ERR(newpol)) {
> /* Create pseudo-vma that contains just the policy */
> struct vm_area_struct pvma;
> @@ -1745,14 +1757,14 @@ void __init numa_policy_init(void)
> if (unlikely(nodes_empty(interleave_nodes)))
> node_set(prefer, interleave_nodes);
>
> - if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
> + if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
> printk("numa_policy_init: interleaving failed\n");
> }
>
> /* Reset policy of current process to default */
> void numa_default_policy(void)
> {
> - do_set_mempolicy(MPOL_DEFAULT, NULL);
> + do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
> }
>
> /* Migrate a policy to a different set of nodes */
> @@ -1768,7 +1780,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> if (nodes_equal(*mpolmask, *newmask))
> return;
>
> - switch (pol->policy) {
> + switch (mpol_mode(pol->policy)) {
> case MPOL_DEFAULT:
> break;
> case MPOL_INTERLEAVE:
> @@ -1858,7 +1870,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
> char *p = buffer;
> int l;
> nodemask_t nodes;
> - enum mempolicy_mode mode = pol ? pol->policy : MPOL_DEFAULT;
> + enum mempolicy_mode mode = pol ? mpol_mode(pol->policy) :
> + MPOL_DEFAULT;
>
> switch (mode) {
> case MPOL_DEFAULT:
> diff --git a/mm/shmem.c b/mm/shmem.c
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1086,6 +1086,7 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> nodemask_t *policy_nodes)
> {
> char *nodelist = strchr(value, ':');
> + char *flags = strchr(value, '=');
> int err = 1;
>
> if (nodelist) {
> @@ -1096,6 +1097,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
> goto out;
> }
> + if (flags)
> + *flags++ = '\0';
> if (!strcmp(value, "default")) {
> *policy = MPOL_DEFAULT;
> /* Don't allow a nodelist */
> @@ -1125,6 +1128,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> *policy_nodes = node_states[N_HIGH_MEMORY];
> err = 0;
> }
> + if (flags) {
> + }
> out:
> /* Restore string for error message */
> if (nodelist)
> @@ -1154,7 +1159,7 @@ static void shmem_show_mpol(struct seq_file *seq, enum mempolicy_mode policy,
>
> seq_printf(seq, ",mpol=%s", policy_string);
>
> - if (policy != MPOL_INTERLEAVE ||
> + if (mpol_mode(policy) != MPOL_INTERLEAVE ||
> !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
> char buffer[64];
> int len;
> @@ -1577,8 +1582,10 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
> case S_IFREG:
> inode->i_op = &shmem_inode_operations;
> inode->i_fop = &shmem_file_operations;
> - mpol_shared_policy_init(&info->policy, sbinfo->policy,
> - &sbinfo->policy_nodes);
> + mpol_shared_policy_init(&info->policy,
> + mpol_mode(sbinfo->policy),
> + mpol_flags(sbinfo->policy),
> + &sbinfo->policy_nodes);
> break;
> case S_IFDIR:
> inc_nlink(inode);
> @@ -1592,7 +1599,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
> * Must not load anything in the rbtree,
> * mpol_free_shared_policy will not be called.
> */
> - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
> + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
> NULL);
> break;
> }

2008-02-13 00:22:24

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Mon, 2008-02-11 at 07:30 -0800, David Rientjes wrote:
> Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses
> the node remap when the policy is rebound.
>
> Adds another member to struct mempolicy,
>
> nodemask_t user_nodemask
>
> that stores the nodemask that the user passed when he or she created
> the mempolicy via set_mempolicy() or mbind(). When using
> MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's
> passed nodemask is always used when determining the preferred node,
> building the MPOL_BIND zonelist, or creating the interleave nodemask.
> This happens whenever the policy is rebound, including when a task's
> cpuset assignment changes or the cpuset's mems are changed.

When you say that the user's nodemask is "always used" you mean "after
cpuset contextualization", right? I.e., after masking with mems_allowed
[which also restricts to nodes with memory]. That is what the code
still does.

>
> This creates an interesting side-effect in that it allows the mempolicy
> "intent" to lie dormant and uneffected until it has access to the node(s)
> that it desires. For example, if you currently ask for an interleaved
> policy over a set of nodes that you do not have access to, the mempolicy
> is not created and the task continues to use the equivalent of
> MPOL_DEFAULT.

You get an error [EINVAL], right? The target policy [task or vma]
remains unchanged. That may or may not be MPOL_DEFAULT, depending on
how the task was started [via numactl()] or the success of prior
set_mempolicy()/mbind() calls.

> With this change, however, it is possible to create the
> same mempolicy; it is effected only when access to the nodes is acquired.
>
> It is also possible to mount tmpfs with the static nodemask behavior when
> specifying a node or nodemask. To do this, simply add "=static"
> immediately following the mempolicy mode at mount time:
>
> mount -o remount mpol=interleave=static:1-3
>
> Also removes mpol_check_policy() and folds some of its logic into
> mpol_new() since it is now mostly obsoleted.

Couple of comments here:

1) we've discussed the issue of returning EINVAL for non-empty nodemasks
with MPOL_DEFAULT. By removing this restriction, we run the risk of
breaking applications if we should ever want to define a semantic to
non-empty node mask for MPOL_DEFAULT. I think this is probably
unlikely, but consider the new mode flags part of the mode/policy
parameter: by not rejecting undefined flags, we may break applications
by later defining additional flags. I'd argue for fairly strict
argument checking.

2) Before this patch, policy_nodemask from the shmem_sb_info was NOT
masked with allowed nodes because we passed this mask directly to
mpol_new() without "contextualization". We DO guarantee that this
policy nodemask contains only nodes with memory [see
shmem_parse_mpol()]. Now that you've moved the contextualization to
mpol_new(), we are masking this policy with the cpuset mems allowed.
This is a change in behavior. Do we want this? I.e., are we preserving
the "intent" of the system administrator in setting the tmpfs policy?
Maybe they wanted to share interleaved shmem areas between cpusets with
disjoint mems allowed. Nothing prevents this...

If we just want to preserve existing behavior, we can define an
additional mode flag that we set in the sbinfo policy in
shmem_parse_mpol() and test in mpol_new(). If we want to be able to
specify existing or new behavior, we can use the same flag, but set it
or not based on an additional qualifier specified via the mount option.

[more below]

>
> Cc: Paul Jackson <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Lee Schermerhorn <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> include/linux/mempolicy.h | 4 +-
> mm/mempolicy.c | 137 ++++++++++++++++++---------------------------
> mm/shmem.c | 2 +
> 3 files changed, 60 insertions(+), 83 deletions(-)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -26,7 +26,8 @@ enum mempolicy_mode {
> #define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
>
> /* Flags for set_mempolicy */
> -#define MPOL_MODE_FLAGS (0) /* legal set_mempolicy() MPOL_* mode flags */
> +#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)
> +#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES) /* legal set_mempolicy() MPOL_* mode flags */
>
> /* Flags for get_mempolicy */
> #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
> @@ -82,6 +83,7 @@ struct mempolicy {
> /* undefined for default */
> } v;
> nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
> + nodemask_t user_nodemask; /* nodemask passed by user */
> };
>
> /*
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -113,58 +113,6 @@ struct mempolicy default_policy = {
> static void mpol_rebind_policy(struct mempolicy *pol,
> const nodemask_t *newmask);
>
> -/* Do sanity checking on a policy */
> -static int mpol_check_policy(enum mempolicy_mode mode, nodemask_t *nodes)
> -{
> - int was_empty, is_empty;
> -
> - if (!nodes)
> - return 0;
> -
> - /*
> - * "Contextualize" the in-coming nodemast for cpusets:
> - * Remember whether in-coming nodemask was empty, If not,
> - * restrict the nodes to the allowed nodes in the cpuset.
> - * This is guaranteed to be a subset of nodes with memory.
> - */
> - cpuset_update_task_memory_state();
> - is_empty = was_empty = nodes_empty(*nodes);
> - if (!was_empty) {
> - nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
> - is_empty = nodes_empty(*nodes); /* after "contextualization" */
> - }
> -
> - switch (mode) {
> - case MPOL_DEFAULT:
> - /*
> - * require caller to specify an empty nodemask
> - * before "contextualization"
> - */
> - if (!was_empty)
> - return -EINVAL;

I'd like to see an equivalent check added to mpol_new().

> - break;
> - case MPOL_BIND:
> - case MPOL_INTERLEAVE:
> - /*
> - * require at least 1 valid node after "contextualization"
> - */
> - if (is_empty)
> - return -EINVAL;
> - break;
> - case MPOL_PREFERRED:
> - /*
> - * Did caller specify invalid nodes?
> - * Don't silently accept this as "local allocation".
> - */
> - if (!was_empty && is_empty)
> - return -EINVAL;

Similar here: without an equivalent check in mpol_new(), one can
specify a dis-allowed node, and end up with explicit local allocation. I
think that an error would be preferable in this case.

> - break;
> - default:
> - BUG();
> - }
> - return 0;
> -}
> -
> /* Generate a custom zonelist for the BIND policy. */
> static struct zonelist *bind_zonelist(nodemask_t *nodes)
> {
> @@ -207,6 +155,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> unsigned short flags, nodemask_t *nodes)
> {
> struct mempolicy *policy;
> + nodemask_t cpuset_context_nmask;
>
> pr_debug("setting mode %d flags %d nodes[0] %lx\n",
> mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
> @@ -218,21 +167,27 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> return ERR_PTR(-ENOMEM);
> flags &= MPOL_MODE_FLAGS;
> atomic_set(&policy->refcnt, 1);
> + cpuset_update_task_memory_state();
> + nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
> switch (mode) {
> case MPOL_INTERLEAVE:
> - policy->v.nodes = *nodes;
> + if (nodes_empty(*nodes))
> + return ERR_PTR(-EINVAL);
> + policy->v.nodes = cpuset_context_nmask;
> if (nodes_weight(policy->v.nodes) == 0) {
> kmem_cache_free(policy_cache, policy);
> return ERR_PTR(-EINVAL);
> }
> break;
> case MPOL_PREFERRED:
> - policy->v.preferred_node = first_node(*nodes);
> + policy->v.preferred_node = first_node(cpuset_context_nmask);
> if (policy->v.preferred_node >= MAX_NUMNODES)
> policy->v.preferred_node = -1;
> break;
> case MPOL_BIND:
> - policy->v.zonelist = bind_zonelist(nodes);
> + if (nodes_empty(*nodes))

Kosaki-san already pointed out the need to free the policy struct
here.

> + return ERR_PTR(-EINVAL);
> + policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
> if (IS_ERR(policy->v.zonelist)) {
> void *error_code = policy->v.zonelist;
> kmem_cache_free(policy_cache, policy);
> @@ -244,6 +199,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> }
> policy->policy = mode | flags;
> policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
> + policy->user_nodemask = *nodes;
> return policy;
> }
>
> @@ -490,15 +446,14 @@ static long do_set_mempolicy(enum mempolicy_mode mode, unsigned short flags,
> {
> struct mempolicy *new;
>
> - if (mpol_check_policy(mode, nodes))
> - return -EINVAL;
> new = mpol_new(mode, flags, nodes);
> if (IS_ERR(new))
> return PTR_ERR(new);
> mpol_free(current->mempolicy);
> current->mempolicy = new;
> mpol_set_task_struct_flag();
> - if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE)
> + if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE &&
> + nodes_weight(new->v.nodes))
> current->il_next = first_node(new->v.nodes);
> return 0;
> }
> @@ -818,9 +773,6 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (end == start)
> return 0;
>
> - if (mpol_check_policy(mode, nmask))
> - return -EINVAL;
> -
> new = mpol_new(mode, mode_flags, nmask);
> if (IS_ERR(new))
> return PTR_ERR(new);
> @@ -1211,7 +1163,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> next = next_node(nid, policy->v.nodes);
> if (next >= MAX_NUMNODES)
> next = first_node(policy->v.nodes);
> - me->il_next = next;
> + if (next < MAX_NUMNODES)
> + me->il_next = next;
> return nid;
> }
>
> @@ -1250,10 +1203,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
> struct vm_area_struct *vma, unsigned long off)
> {
> unsigned nnodes = nodes_weight(pol->v.nodes);
> - unsigned target = (unsigned)off % nnodes;
> + unsigned target;
> int c;
> int nid = -1;
>
> + if (!nnodes)
> + return numa_node_id();
> + target = (unsigned)off % nnodes;
> c = 0;
> do {
> nid = next_node(nid, pol->v.nodes);
> @@ -1773,6 +1729,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> {
> nodemask_t *mpolmask;
> nodemask_t tmp;
> + int remap;
>
> if (!pol)
> return;
> @@ -1780,51 +1737,67 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> if (nodes_equal(*mpolmask, *newmask))
> return;
>
> + remap = !(mpol_flags(pol->policy) & MPOL_F_STATIC_NODES);
> switch (mpol_mode(pol->policy)) {
> case MPOL_DEFAULT:
> break;
> case MPOL_INTERLEAVE:
> - nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
> + if (!remap)
> + nodes_and(tmp, pol->user_nodemask, *newmask);
> + else
> + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
> pol->v.nodes = tmp;
> - *mpolmask = *newmask;
> - current->il_next = node_remap(current->il_next,
> - *mpolmask, *newmask);
> + if (!node_isset(current->il_next, tmp)) {
> + current->il_next = next_node(current->il_next, tmp);
> + if (current->il_next >= MAX_NUMNODES)
> + current->il_next = first_node(tmp);
> + if (current->il_next >= MAX_NUMNODES)
> + current->il_next = numa_node_id();
> + }
> break;
> case MPOL_PREFERRED:
> - pol->v.preferred_node = node_remap(pol->v.preferred_node,
> - *mpolmask, *newmask);
> - *mpolmask = *newmask;
> + if (!remap && !node_isset(pol->v.preferred_node, *newmask))
> + pol->v.preferred_node = -1;
> + else
> + pol->v.preferred_node = node_remap(pol->v.preferred_node,
> + *mpolmask, *newmask);

Note: "local allocation" [MPOL_PREFERRED with preferred_node == -1]
does not need and, IMO, should not be remapped. Current code doesn't
check for this, but I think that it will do the right thing because
node_remap() will not remap an invalid node id. However, I think it's
cleaner to explicitly check in the code. I have a pending
MPOL_PREFERRED cleanup patch that address this.

That being said, I don't understand what you're trying to do with the
"then" clause, or rather, why. Looks like you're explicitly dropping
back to local allocation? Would you/could you add a comment explaining
the "intent"?

> break;
> case MPOL_BIND: {
> - nodemask_t nodes;
> - struct zone **z;
> - struct zonelist *zonelist;
> -
> - nodes_clear(nodes);
> - for (z = pol->v.zonelist->zones; *z; z++)
> - node_set(zone_to_nid(*z), nodes);
> - nodes_remap(tmp, nodes, *mpolmask, *newmask);
> - nodes = tmp;
> + struct zonelist *zonelist = NULL;
> +
> + if (!remap)
> + nodes_and(tmp, pol->user_nodemask, *newmask);
> + else {
> + nodemask_t nodes;
> + struct zone **z;
> +
> + nodes_clear(nodes);
> + for (z = pol->v.zonelist->zones; *z; z++)
> + node_set(zone_to_nid(*z), nodes);
> + nodes_remap(tmp, nodes, *mpolmask, *newmask);
> + }
>
> - zonelist = bind_zonelist(&nodes);
> + if (nodes_weight(tmp))
> + zonelist = bind_zonelist(&tmp);

Not sure I understand what's going on here [nor with the comment below
about "falling thru'" to MPOL_DEFAULT means]. However, this area is
vastly simplified by Mel Gorman's "two zonelist" patch series. He
replaces the zonelist with a nodemask, so _BIND can share the
_INTERLEAVE case code.

Suggest you consider basing atop those.

>
> /* If no mem, then zonelist is NULL and we keep old zonelist.
> * If that old zonelist has no remaining mems_allowed nodes,
> * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
> */
>
> - if (!IS_ERR(zonelist)) {
> + if (zonelist && !IS_ERR(zonelist)) {
> /* Good - got mem - substitute new zonelist */
> kfree(pol->v.zonelist);
> pol->v.zonelist = zonelist;
> }
> - *mpolmask = *newmask;
> break;
> }
> default:
> BUG();
> break;
> }
> + if (mpol_mode(pol->policy) != MPOL_DEFAULT)
> + *mpolmask = *newmask;
> }
>
> /*
> diff --git a/mm/shmem.c b/mm/shmem.c
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1129,6 +1129,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> err = 0;
> }
> if (flags) {
> + if (!strcmp(flags, "static"))
> + *policy |= MPOL_F_STATIC_NODES;
> }
> out:
> /* Restore string for error message */

2008-02-13 00:26:10

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Mon, 2008-02-11 at 11:56 -0800, David Rientjes wrote:
> On Tue, 12 Feb 2008, KOSAKI Motohiro wrote:
>
> > > @@ -218,21 +167,27 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> > > return ERR_PTR(-ENOMEM);
> > > flags &= MPOL_MODE_FLAGS;
> > > atomic_set(&policy->refcnt, 1);
> > > + cpuset_update_task_memory_state();
> > > + nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
> > > switch (mode) {
> > > case MPOL_INTERLEAVE:
> > > - policy->v.nodes = *nodes;
> > > + if (nodes_empty(*nodes))
> > > + return ERR_PTR(-EINVAL);
> >
> > need kmem_cache_free(policy_cache, policy) before return?
> >
>
> Very good catch!
>
>
>
> mempolicy: fix policy memory leak in mpol_new()
>
> If mpol_new() cannot setup a new mempolicy because of an invalid argument
> provided by the user, avoid leaking the mempolicy that has been dynamically
> allocated.
>
> Reported by KOSAKI Motohiro.
>
> Cc: Paul Jackson <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Lee Schermerhorn <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: KOSAKI Motohiro <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> mm/mempolicy.c | 10 +++++-----
> 1 files changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -171,13 +171,11 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
> switch (mode) {
> case MPOL_INTERLEAVE:
> - if (nodes_empty(*nodes))
> - return ERR_PTR(-EINVAL);
> - policy->v.nodes = cpuset_context_nmask;
> - if (nodes_weight(policy->v.nodes) == 0) {
> + if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask)) {
> kmem_cache_free(policy_cache, policy);
> return ERR_PTR(-EINVAL);
> }
> + policy->v.nodes = cpuset_context_nmask;
> break;
> case MPOL_PREFERRED:
> policy->v.preferred_node = first_node(cpuset_context_nmask);
> @@ -185,8 +183,10 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> policy->v.preferred_node = -1;
> break;
> case MPOL_BIND:
> - if (nodes_empty(*nodes))
> + if (nodes_empty(*nodes)) {
> + kmem_cache_free(policy_cache, policy);
> return ERR_PTR(-EINVAL);
> + }
> policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
> if (IS_ERR(policy->v.zonelist)) {
> void *error_code = policy->v.zonelist;

With this patch, we now have 3 error paths from mpol_new that need to
free the mempolicy struct. How about consolidating them with something
like this [uncompiled/untested]:

PATCH mempolicy - consolidate mpol_new() error paths

Use common error path in mpol_new() for errors that we discover
after allocating the new mempolicy structure. Free the mempolicy
in the common error path.

Signed-off-by: Lee Schermerhorn <[email protected]>

mm/mempolicy.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)

Index: Linux/mm/mempolicy.c
===================================================================
--- Linux.orig/mm/mempolicy.c 2008-02-12 15:18:12.000000000 -0700
+++ Linux/mm/mempolicy.c 2008-02-12 15:22:07.000000000 -0700
@@ -156,6 +156,7 @@ static struct mempolicy *mpol_new(enum m
{
struct mempolicy *policy;
nodemask_t cpuset_context_nmask;
+ void *error_code = ERR_PTR(-EINVAL);

pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -172,8 +173,7 @@ static struct mempolicy *mpol_new(enum m
switch (mode) {
case MPOL_INTERLEAVE:
if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask)) {
- kmem_cache_free(policy_cache, policy);
- return ERR_PTR(-EINVAL);
+ goto free_mpol;
}
policy->v.nodes = cpuset_context_nmask;
break;
@@ -184,14 +184,12 @@ static struct mempolicy *mpol_new(enum m
break;
case MPOL_BIND:
if (nodes_empty(*nodes)) {
- kmem_cache_free(policy_cache, policy);
- return ERR_PTR(-EINVAL);
+ goto free_mpol;
}
policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
if (IS_ERR(policy->v.zonelist)) {
- void *error_code = policy->v.zonelist;
- kmem_cache_free(policy_cache, policy);
- return error_code;
+ error_code = policy->v.zonelist;
+ goto free_mpol;
}
break;
default:
@@ -201,6 +199,10 @@ static struct mempolicy *mpol_new(enum m
policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
policy->user_nodemask = *nodes;
return policy;
+
+free_mpol:
+ kmem_cache_free(policy_cache, policy);
+ return error_code;
}

static void gather_stats(struct page *, void *, int pte_dirty);

2008-02-13 00:26:34

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 2/4] mempolicy: support optional mode flags

On Tue, 12 Feb 2008, Lee Schermerhorn wrote:

> What is the benefit of pulling the flags and mode apart at the user
> interface, passing them as separate args to mpol_new(), do_* and
> mpol_shared_policy_init() and then stitching them back together in
> mpol_new()? Modes passed in via [sys_]{set_mempolicy|mbind}() and
> those stored in the the shmem_sb_info can already have the flags there,
> so this seems like extra work.
>
> I think this patch and the previous one [1/4] would be simplified if the
> formal parameters were left as an int [mabye, unsigned] and the flags
> were masked of when necessary in mpol_new() and elsewhere.
>

Christoph made a similar suggestion.

> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -203,18 +203,20 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
> > }
> >
> > /* Create a new policy */
> > -static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> > +static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> > + unsigned short flags, nodemask_t *nodes)
> > {
> > struct mempolicy *policy;
> >
> > - pr_debug("setting mode %d nodes[0] %lx\n",
> > - mode, nodes ? nodes_addr(*nodes)[0] : -1);
> > + pr_debug("setting mode %d flags %d nodes[0] %lx\n",
> > + mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
> >
> > if (mode == MPOL_DEFAULT)
> > return NULL;
> > policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
> > if (!policy)
> > return ERR_PTR(-ENOMEM);
> > + flags &= MPOL_MODE_FLAGS;
> > atomic_set(&policy->refcnt, 1);
> > switch (mode) {
> > case MPOL_INTERLEAVE:
> > @@ -240,7 +242,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> > default:
> > BUG();
> > }
> > - policy->policy = mode;
> > + policy->policy = mode | flags;
> > policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
> > return policy;
> > }
> > @@ -483,19 +485,20 @@ static void mpol_set_task_struct_flag(void)
> > }
> >
> > /* Set the process memory policy */
> > -static long do_set_mempolicy(enum mempolicy_mode mode, nodemask_t *nodes)
> > +static long do_set_mempolicy(enum mempolicy_mode mode, unsigned short flags,
> > + nodemask_t *nodes)
> > {
> > struct mempolicy *new;
> >
> > if (mpol_check_policy(mode, nodes))
> > return -EINVAL;
> > - new = mpol_new(mode, nodes);
> > + new = mpol_new(mode, flags, nodes);
> > if (IS_ERR(new))
> > return PTR_ERR(new);
> > mpol_free(current->mempolicy);
> > current->mempolicy = new;
> > mpol_set_task_struct_flag();
> > - if (new && new->policy == MPOL_INTERLEAVE)
> > + if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE)
> > current->il_next = first_node(new->v.nodes);
> > return 0;
> > }
> > @@ -506,7 +509,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
> > int i;
> >
> > nodes_clear(*nodes);
> > - switch (p->policy) {
> > + switch (mpol_mode(p->policy)) {
> > case MPOL_BIND:
> > for (i = 0; p->v.zonelist->zones[i]; i++)
> > node_set(zone_to_nid(p->v.zonelist->zones[i]),
> > @@ -588,7 +591,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> > goto out;
> > *policy = err;
> > } else if (pol == current->mempolicy &&
> > - pol->policy == MPOL_INTERLEAVE) {
> > + mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
> > *policy = current->il_next;
> > } else {
> > err = -EINVAL;
> > @@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
> > #endif
> >
> > static long do_mbind(unsigned long start, unsigned long len,
> > - enum mempolicy_mode mode, nodemask_t *nmask,
> > - unsigned long flags)
> > + enum mempolicy_mode mode, unsigned short mode_flags,
> > + nodemask_t *nmask, unsigned long flags)
> > {
> > struct vm_area_struct *vma;
> > struct mm_struct *mm = current->mm;
> > @@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
> > if (mpol_check_policy(mode, nmask))
> > return -EINVAL;
> >
> > - new = mpol_new(mode, nmask);
> > + new = mpol_new(mode, mode_flags, nmask);
> > if (IS_ERR(new))
> > return PTR_ERR(new);
> >
> > @@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
> > if (!new)
> > flags |= MPOL_MF_DISCONTIG_OK;
> >
> > - pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
> > - mode, nmask ? nodes_addr(*nmask)[0] : -1);
> > + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
> > + start, start + len, mode, mode_flags,
> > + nmask ? nodes_addr(*nmask)[0] : -1);
> >
> > down_write(&mm->mmap_sem);
> > vma = check_range(mm, start, end, nmask,
> > @@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
> > {
> > nodemask_t nodes;
> > int err;
> > + unsigned short mode_flags;
> >
> > + mode_flags = mpol_flags(mode);
> >
> >
> I suggest restricting the flags to the defined flags here. This gives
> us the ability to defined new flags w/o breaking [changing behavior of]
> applications that accidently pass undefined flags. An error return
> forced the user to fix the application [assuming they check error
> returns].
>

That looks appropriate. It will also help in the future if certain flags
are only defined for set_mempolicy() or only defined for mbind() to either
mask them out in their respective functions or return -EINVAL.

Thanks for the comments.

David

2008-02-13 00:32:17

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

Lee wrote:
> Why not leave it as an int. Will simplify this and subsequent patch.

I tend to agree with Lee on this one. If I recall correctly, Christoph
said something similar, regarding the change of the 'policy' field
of struct mempolicy from a short to an enum.

I'm inclined toward the original types for the 'policy' field.

Also, rather than trying to pack the new flag, MPOL_F_STATIC_NODES,
into the existing 'policy' field, I'd suggest instead adding a new
field to 'struct mempolicy' for this flag. Since 'policy' is only a
short, and since the next field in that struct, is a union that
includes a pointer that is aligned on most arch's to at least a 4 byte
boundary, therefore there is a hole of at least two bytes, following
the short policy field, in which another short or some flag bits can be
placed, with no increase in the size of struct mempolicy.

Specifically, I'd suggest adding the one line for 'mode_f_static_nodes'
as below, and leaving the code involving the encoding of the policy
field alone.

struct mempolicy {
atomic_t refcnt;
short policy; /* See MPOL_* above */
int mode_f_static_nodes:1; /* <== Added line <== */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave */
/* undefined for default */
} v;
nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
};

Single bit fields (The ":1" above) provide the simplest way to add
boolean flags to structs. Let the compiler do the work of packing
and unpacking the field.

Then, rather than having to code double-negative explicit masking
operations such as:

remap = !(mpol_flags(pol->policy) & MPOL_F_STATIC_NODES);
if (!remap)
blah blah ...

one can simply code:

if (pol->mode_f_static_nodes)
blah blah ...

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 00:53:41

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Tue, 12 Feb 2008, Paul Jackson wrote:

> I tend to agree with Lee on this one. If I recall correctly, Christoph
> said something similar, regarding the change of the 'policy' field
> of struct mempolicy from a short to an enum.
>

I've already made the change in my patchset as a result of Christoph's
comment.

> I'm inclined toward the original types for the 'policy' field.
>
> Also, rather than trying to pack the new flag, MPOL_F_STATIC_NODES,
> into the existing 'policy' field, I'd suggest instead adding a new
> field to 'struct mempolicy' for this flag. Since 'policy' is only a
> short, and since the next field in that struct, is a union that
> includes a pointer that is aligned on most arch's to at least a 4 byte
> boundary, therefore there is a hole of at least two bytes, following
> the short policy field, in which another short or some flag bits can be
> placed, with no increase in the size of struct mempolicy.
>

I disagree, that space can be reserved for future use that will perhaps be
much needed at that time.

The type 'short' is obviously overkill to hold the value of the policy
mode since we only have four currently defined modes. We'll never exceed
the capacity of type 'unsigned char' if mempolicies are going to remain
useful.

If the flag bits are going to be stored in the same member as the mode, it
is necessary to make the change, as I have, to be unsigned to avoid
sign-extension when this is promoted to 'int' that is required as part of
the API.

> Specifically, I'd suggest adding the one line for 'mode_f_static_nodes'
> as below, and leaving the code involving the encoding of the policy
> field alone.
>
> struct mempolicy {
> atomic_t refcnt;
> short policy; /* See MPOL_* above */
> int mode_f_static_nodes:1; /* <== Added line <== */
> union {
> struct zonelist *zonelist; /* bind */
> short preferred_node; /* preferred */
> nodemask_t nodes; /* interleave */
> /* undefined for default */
> } v;
> nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
> };
>
> Single bit fields (The ":1" above) provide the simplest way to add
> boolean flags to structs. Let the compiler do the work of packing
> and unpacking the field.
>
> Then, rather than having to code double-negative explicit masking
> operations such as:
>
> remap = !(mpol_flags(pol->policy) & MPOL_F_STATIC_NODES);
> if (!remap)
> blah blah ...
>
> one can simply code:
>
> if (pol->mode_f_static_nodes)
> blah blah ...
>

The problem with your approach is that we need to pass the optional mode
flags back to the user through get_mempolicy() and the user needs to pass
them to us via set_mempolicy() or mbind(). So we'll still need the
#define in linux/mempolicy.h:

#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)

We could deal with this as follows:

if (mode & MPOL_F_STATIC_NODES)
pol->mode_f_static_nodes = 1;

but that doesn't make much sense.

Once Christoph's comment is taken into consideration and we start passing
around 'int policy' instead of 'enum mempolicy_mode mode' again, I don't
think anybody will struggle with doing:

if (mode & MPOL_F_STATIC_NODES)

to check for the presence of a flag.

David

2008-02-13 00:58:31

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 12 Feb 2008, Lee Schermerhorn wrote:

> PATCH mempolicy - consolidate mpol_new() error paths
>
> Use common error path in mpol_new() for errors that we discover
> after allocation the new mempolicy structure. Free the mempolicy
> in the common error path.
>
> Signed-off-by: Lee Schermerhorn <[email protected]>
>
> mm/mempolicy.c | 16 +++++++++-------
> 1 file changed, 9 insertions(+), 7 deletions(-)
>
> Index: Linux/mm/mempolicy.c
> ===================================================================
> --- Linux.orig/mm/mempolicy.c 2008-02-12 15:18:12.000000000 -0700
> +++ Linux/mm/mempolicy.c 2008-02-12 15:22:07.000000000 -0700
> @@ -156,6 +156,7 @@ static struct mempolicy *mpol_new(enum m
> {
> struct mempolicy *policy;
> nodemask_t cpuset_context_nmask;
> + void *error_code = ERR_PTR(-EINVAL);
>
> pr_debug("setting mode %d flags %d nodes[0] %lx\n",
> mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
> @@ -172,8 +173,7 @@ static struct mempolicy *mpol_new(enum m
> switch (mode) {
> case MPOL_INTERLEAVE:
> if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask)) {
> - kmem_cache_free(policy_cache, policy);
> - return ERR_PTR(-EINVAL);
> + goto free_mpol;
> }
> policy->v.nodes = cpuset_context_nmask;
> break;

You can also get rid of the parentheses to make checkpatch.pl happy, too.

> @@ -184,14 +184,12 @@ static struct mempolicy *mpol_new(enum m
> break;
> case MPOL_BIND:
> if (nodes_empty(*nodes)) {
> - kmem_cache_free(policy_cache, policy);
> - return ERR_PTR(-EINVAL);
> + goto free_mpol;
> }
> policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
> if (IS_ERR(policy->v.zonelist)) {
> - void *error_code = policy->v.zonelist;
> - kmem_cache_free(policy_cache, policy);
> - return error_code;
> + error_code = policy->v.zonelist;
> + goto free_mpol;
> }
> break;
> default:
> @@ -201,6 +199,10 @@ static struct mempolicy *mpol_new(enum m
> policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
> policy->user_nodemask = *nodes;
> return policy;
> +
> +free_mpol:
> + kmem_cache_free(policy_cache, policy);
> + return error_code;
> }
>
> static void gather_stats(struct page *, void *, int pte_dirty);
>

I'll fold this into my patchset when I repost it, thanks.

David

2008-02-13 01:04:38

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Tue, 12 Feb 2008, Paul Jackson wrote:

> I'm inclined toward the original types for the 'policy' field.

Good. And remove the enum.

> Specifically, I'd suggest adding the one line for 'mode_f_static_nodes'
> as below, and leaving the code involving the encoding of the policy
> field alone.
>
> struct mempolicy {
> atomic_t refcnt;
> short policy; /* See MPOL_* above */
> int mode_f_static_nodes:1; /* <== Added line <== */

It would be better to add some sort of flags field?

> Single bit fields (The ":1" above) provide the simplest way to add
> boolean flags to structs. Let the compiler do the work of packing
> and unpacking the field.

We usually do that with unsigned XXX and constants. You may want to check
multiple flags at once or do other fancy things.

2008-02-13 01:28:53

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

Christoph wrote:
> We usually do that with unsigned XXX and constants. You may want to check
> multiple flags at once or do other fancy things.

Both are common in the kernel. I see 241 ":1" bit fields in include/linux/*.h.

One can do Boolean expressions with either form, bitfields or defines.

For example:

struct {
int foo:1;
int goo:1;
} x;

if (x.foo && ! x.goo)
blah blah ...;

Doing (x.flags & FLAG_FOO) is simple enough, but it is still
not as simple as (x.foo). Where possible, I encourage keeping
extraneous detail out of mainline code.

Sometimes, such as in task struct flags, one has to do such Boolean
combinations in performance critical code paths, and then one must do
what one must do, with the defined constants.

Sometimes, such as with the GFP_* flags, one has to name various
combinations, and then one needs again to use defined constants.

I see no evidence that either of those situations applies here.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 01:32:57

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

Christoph wrote:
> Good. And remove the enum.
>
> It would be better to add some sort of flags field?

On the other hand, despite my brilliant (hah!) endorsement
of bit field flags in my reply a few minutes ago, I'd settle
for (1) removing the enum, and (2) using a flags field and
more defines for the new stuff. I will grant that Christoph
is correct that that form is more common in the kernel, and
there is something to be said for doing things in the most
common manner.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 02:01:32

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Tue, 12 Feb 2008, Paul Jackson wrote:

> Christoph wrote:
> > Good. And remove the enum.
> >
> > It would be better to add some sort of flags field?
>
> On the other hand, despite my brilliant (hah!) endorsement
> of bit field flags in my reply a few minutes ago, I'd settle
> for (1) removing the enum, and (2) using a flags field and
> more defines for the new stuff. I will grant that Christoph
> is correct that that form is more common in the kernel, and
> there is something to be said for doing things in the most
> common manner.
>

The enum has already been removed, as I've said a few times.

The 'flags' field would be wrong because you're ignoring that these flags
are both passed by the user to the kernel and by the kernel to the user as
part of the 'int *' parameter in either set_mempolicy() or mbind().

So they must have predefined shift in linux/mempolicy.h to separate that
'int *' parameter into the policy mode and the optional mode flags.

If you introduced a separate flags member to struct mempolicy, you'd be
wasting the lower MPOL_FLAG_SHIFT bits for no apparent purpose so your
flag bits still work:

#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)

There's no way around that without shifting before storing and each time
you read pol->flags.

So since those bits would have to be unused and can perfectly accommodate
the mode bits (with the current definition of MPOL_FLAG_SHIFT at 8, we
can accommodate up to 256 distinct modes), there is no reason why they
cannot be combined. That is why I implemented it that way.

David

2008-02-13 02:22:20

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

David wrote:
> The enum has already been removed, as I've said a few times.

Yup. Sorry ... beating dead horses is too much fun.

> The 'flags' field would be wrong because you're ignoring that these flags
> are both passed by the user to the kernel and by the kernel to the user as
> part of the 'int *' parameter in either set_mempolicy() or mbind().

I disagree, though I risk being in a minority on this matter.

Yes, you're entirely correct that these new flags have to be passed to and
from user space via an existing integer parameter. There is no plausible
way other than packing the new flags into that existing parameter to preserve
the kernel-user API.

However, once inside the kernel, how we store this flag in struct mempolicy,
and how we pass it about between kernel routines, is our choice.

We can leave it packed, and unpack and repack it each time we consider the
flag and mode bits, or we can store and pass it as separate flags.

I urge us to consider handling it as separate flags within the kernel
because it most clearly and explicitly represents what is going on logically.
There are two different kinds of flags here, the original mempolicy modes,
and these meta modes (MPOL_F_STATIC_NODES, being the first example) which
affect the nodemask interpretation.

Cramming both these into a single int is necessary across the kernel-user API,
but it's an obfuscation that is not needed, therefore better avoided, within
the kernel code.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 02:42:45

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Tue, 12 Feb 2008, Paul Jackson wrote:

> However, once inside the kernel, how we store this flag in struct mempolicy,
> and how we pass it about between kernel routines, is our choice.
>
> We can leave it packed, and unpack and repack it each time we consider the
> flag and mode bits, or we can store and pass it as separate flags.
>
> I urge us to consider handling it as separate flags within the kernel
> because it most clearly and explicitly represents what is going on logically.
> There are two different kinds of flags here, the original mempolicy modes,
> and these meta modes (MPOL_F_STATIC_NODES, being the first example) which
> affect the nodemask interpretation.
>

Again, if you did it this way, the lower MPOL_FLAG_SHIFT bits of the new
'flags' member would always be zero if you are still going to use the
MPOL_F_* defines from linux/mempolicy.h to do your bit testing.

I do not subscribe to the theory that just because we have a couple extra
bytes of space somewhere in struct mempolicy that we have to use it
immediately.

> Cramming both these into a single int is necessary across the kernel-user API,
> but it's an obfuscation that is not needed, therefore better avoided, within
> the kernel code.
>

It makes the kernel code simpler, in a way.

Now we only have to pass a single actual among functions that include both
the mode and optional flags (there are a lot of them and they span not
only the VM but also filesystem code). The catch is that we have to use a
mpol_mode() wrapper for mode conditionals or switch statements.

But testing the flags is just as easy as

if (mode & MPOL_F_STATIC_NODES) {
...
}

That test would remain unchanged (except for s/mode/flags/) if flags were
stored in a separate member.

So by storing them both in an 'unsigned short' member of struct mempolicy:

- we don't use any additional memory (and we can use those two extra
bytes you identified earlier later), and

- we only have to pass a single actual to many different functions that
require both the mode and optional mode flags.

2008-02-13 02:59:56

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

> I do not subscribe to the theory that just because we have a couple extra
> bytes of space somewhere in struct mempolicy that we have to use it
> immediately.

Good grief ... I'm not lobbying for separate flag fields because the
space is there. I was just dealing with one possible objection, by
noting that it wouldn't cost us in terms of struct mempolicy size.

> It makes the kernel code simpler, in a way.
>
> Now we only have to pass a single actual among functions that include both
> the mode and optional flags (there are a lot of them and they span not
> only the VM but also filesystem code).

This gets closer to the key issue.

We both agree we want "simpler", but disagree on what that means.

We don't measure complexity -solely- by counting the size of parameter lists.
If we did that, we'd be packing all manner of sub-integer fields into single
'int' parameters.

I tend to measure complexity a level up from the bits and bytes,
and more in terms of how I think about things. If I think of a routine
as taking two values, such as in this case an mempolicy mode (such as
MPOL_BIND or MPOL_INTERLEAVE) and this new flag (MPOL_F_STATIC_NODES),
which have a different sort of affect.

==> If each time I look at some 'flags' field, I have to think of it
as a couple of things glued together that I will have to pick apart to
use, that's more mental work than seeing those two things explicit and
separate, through most of the mempolicy.c code <==

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 03:17:35

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

On Tue, 12 Feb 2008, Paul Jackson wrote:

> ==> If each time I look at some 'flags' field, I have to think of it
> as a couple of things glued together that I will have to pick apart to
> use, that's more mental work than seeing those two things explicit and
> separate, through most of the mempolicy.c code <==
>

That doesn't logically follow because the aggregate of the mode and the
optional flags _are_ the policy itself. If you want to know whether a
policy is interleave, for example, and don't care whether it is referring
to static (absolute) node ids, then you need to mask that off.

The reality of the kernel code is that these policies are not only
restricted to kernel/mempolicy.c. They are also shared with filesystem
code that store them in a single member of a struct as well. The
interface between those two are functions that would now need to be
modified to include additional parameters to pass the flags along.

Additionally, these flags need to be "glued together" with the mode in
userspace to pass to the syscalls anyway, so they're facing the exact
same challenge. So once this gets a little traction, I think it will
quickly become the norm for how you think about the 'policy' member of the
struct.

David

2008-02-13 03:23:17

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 1/4] mempolicy: convert MPOL constants to enum

David wrote:
> That doesn't logically follow because the aggregate of the mode and the
> optional flags _are_ the policy itself.

I give up ... I still don't agree, but that's ok.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 03:52:55

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

Lee wrote:
> 1) we've discussed the issue of returning EINVAL for non-empty nodemasks
> with MPOL_DEFAULT. By removing this restriction, we run the risk of
> breaking applications if we should ever want to define a semantic to
> non-empty node mask for MPOL_DEFAULT.

The bigger risk, in my view, is breaking some piece of existing user code.
Properly written user code wouldn't break, but that doesn't mean much.
Changes, even minor corner case changes, often break something, so should
not be done without cause. Whether or not code cleanup in mempolicy.c is
sufficient cause here is not clear to me.

Future room for growth doesn't mean so much for me here; if we close one
future alternative, we always have others, such as more mode flag bits.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 04:04:58

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 12 Feb 2008, Paul Jackson wrote:

> > 1) we've discussed the issue of returning EINVAL for non-empty nodemasks
> > with MPOL_DEFAULT. By removing this restriction, we run the risk of
> > breaking applications if we should ever want to define a semantic to
> > non-empty node mask for MPOL_DEFAULT.
>
> The bigger risk, in my view, is breaking some piece of existing user code.
> Properly written user code wouldn't break, but that doesn't mean much.
> Changes, even minor corner case changes, often break something, so should
> not be done without cause. Whether or not code cleanup in mempolicy.c is
> sufficient cause here is not clear to me.
>
> Future room for growth doesn't mean so much for me here; if we close one
> future alternative, we always have others, such as more mode flag bits.
>

I've redone my patchset based on the feedback that I've received, and in
my latest revisions I folded the entire equivalent of mpol_check_policy()
into mpol_new().

Lee, you feel strongly that non-empty nodemasks passed with MPOL_DEFAULT
should be considered invalid and rejected by the kernel, as the current
implementation does. I've brought up a counter-argument based on the
set_mempolicy() man page and the Linux documentation that don't specify
anything about what the nodemask shall contain if it's not a NULL pointer.

My position was to give the user the benefit of the doubt. Because Linux
has been vague in specifying what the nodemask shall contain, that (to me)
means that it can contain anything. It's undefined, in a standards sense.
The only thing that we should look for is whether the user passed
MPOL_DEFAULT as the first parameter and then we should effect that policy
because it's clearly the intention.

I don't think there's a super strong case for either behavior, and that's
why I just folded the mpol_check_policy() logic straight into mpol_new().

David

2008-02-13 04:14:12

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

David wrote:
> I've redone my patchset based on the feedback that I've received

Will you be sending that along soon? I was just getting into my
review of this patchset, and I suppose it would be better to
review the latest and greatest.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 04:19:46

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 12 Feb 2008, Lee Schermerhorn wrote:

> > Adds another member to struct mempolicy,
> >
> > nodemask_t user_nodemask
> >
> > that stores the nodemask that the user passed when he or she created
> > the mempolicy via set_mempolicy() or mbind(). When using
> > MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's
> > passed nodemask is always used when determining the preferred node,
> > building the MPOL_BIND zonelist, or creating the interleave nodemask.
> > This happens whenever the policy is rebound, including when a task's
> > cpuset assignment changes or the cpuset's mems are changed.
>
> When you say that the user's nodemask is "always used" you mean "after
> cpuset contextualization", right? I.e., after masking with mems_allowed
> [which also restricts to nodes with memory]. That is what the code
> still does.
>

Yeah, I'm not introducing a loophole so that tasks can access memory to
which they don't have access. I've clarified that for the next version.

> > It is also possible to mount tmpfs with the static nodemask behavior when
> > specifying a node or nodemask. To do this, simply add "=static"
> > immediately following the mempolicy mode at mount time:
> >
> > mount -o remount mpol=interleave=static:1-3
> >
> > Also removes mpol_check_policy() and folds some of its logic into
> > mpol_new() since it is now mostly obsoleted.
>
> Couple of comments here:
>
> 1) we've discussed the issue of returning EINVAL for non-empty nodemasks
> with MPOL_DEFAULT. By removing this restriction, we run the risk of
> breaking applications if we should ever want to define a semantic to
> non-empty node mask for MPOL_DEFAULT. I think this is probably
> unlikely, but consider the new mode flags part of the mode/policy
> parameter: by not rejecting undefined flags, we may break applications
> by later defining additional flags. I'd argue for fairly strict
> argument checking.
>

As I've mentioned in a parallel thread, I've folded the entire logic of
mpol_check_policy() as it stands this minute in Linus' tree into
mpol_new().

> 2) Before this patch, policy_nodemask from the shmem_sb_info was NOT
> masked with allowed nodes because we passed this mask directly to
> mpol_new() without "contextualization". We DO guarantee that this
> policy nodemask contains only nodes with memory [see
> shmem_parse_mpol()]. Now that you've moved the contextualization to
> mpol_new(), we are masking this policy with the cpuset mems allowed.
> This is a change in behavior. Do we want this? I.e., are we preserving
> the "intent" of the system administrator in setting the tmpfs policy?
> Maybe they wanted to shared interleaved shmem areas between cpusets with
> disjoint mems allowed. Nothing prevents this...
>

We're still saving the intent in the new policy->user_nodemask field so
any future rebinds will still allow inaccessible nodes to be affected by the
mempolicy if the permissions are changed later.

> If we just want to preserve existing behavior, we can define an
> additional mode flag that we set in the sbinfo policy in
> shmem_parse_mpol() and test in mpol_new(). If we want to be able to
> specify existing or new behavior, we can use the same flag, but set it
> or not based on an additional qualifier specified via the mount option.
>

Shmem areas between cpusets with disjoint mems_allowed seem like an error
in userspace to me and I would prefer that mpol_new() reject it outright.

> > @@ -218,21 +167,27 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> > return ERR_PTR(-ENOMEM);
> > flags &= MPOL_MODE_FLAGS;
> > atomic_set(&policy->refcnt, 1);
> > + cpuset_update_task_memory_state();
> > + nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
> > switch (mode) {
> > case MPOL_INTERLEAVE:
> > - policy->v.nodes = *nodes;
> > + if (nodes_empty(*nodes))
> > + return ERR_PTR(-EINVAL);
> > + policy->v.nodes = cpuset_context_nmask;
> > if (nodes_weight(policy->v.nodes) == 0) {
> > kmem_cache_free(policy_cache, policy);
> > return ERR_PTR(-EINVAL);
> > }
> > break;
> > case MPOL_PREFERRED:
> > - policy->v.preferred_node = first_node(*nodes);
> > + policy->v.preferred_node = first_node(cpuset_context_nmask);
> > if (policy->v.preferred_node >= MAX_NUMNODES)
> > policy->v.preferred_node = -1;
> > break;
> > case MPOL_BIND:
> > - policy->v.zonelist = bind_zonelist(nodes);
> > + if (nodes_empty(*nodes))
>
> Kosaki-san already pointed out the need to free the policy struct
> here.
>

I folded your fix to have only one

kmem_cache_free(policy_cache, policy);
return ERR_PTR(error_code);

point in mpol_new().

> > @@ -1780,51 +1737,67 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> > if (nodes_equal(*mpolmask, *newmask))
> > return;
> >
> > + remap = !(mpol_flags(pol->policy) & MPOL_F_STATIC_NODES);
> > switch (mpol_mode(pol->policy)) {
> > case MPOL_DEFAULT:
> > break;
> > case MPOL_INTERLEAVE:
> > - nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
> > + if (!remap)
> > + nodes_and(tmp, pol->user_nodemask, *newmask);
> > + else
> > + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
> > pol->v.nodes = tmp;
> > - *mpolmask = *newmask;
> > - current->il_next = node_remap(current->il_next,
> > - *mpolmask, *newmask);
> > + if (!node_isset(current->il_next, tmp)) {
> > + current->il_next = next_node(current->il_next, tmp);
> > + if (current->il_next >= MAX_NUMNODES)
> > + current->il_next = first_node(tmp);
> > + if (current->il_next >= MAX_NUMNODES)
> > + current->il_next = numa_node_id();
> > + }
> > break;
> > case MPOL_PREFERRED:
> > - pol->v.preferred_node = node_remap(pol->v.preferred_node,
> > - *mpolmask, *newmask);
> > - *mpolmask = *newmask;
> > + if (!remap && !node_isset(pol->v.preferred_node, *newmask))
> > + pol->v.preferred_node = -1;
> > + else
> > + pol->v.preferred_node = node_remap(pol->v.preferred_node,
> > + *mpolmask, *newmask);
>
> Note: "local allocation" [MPOL_PREFERRED with preferred_node == -1]
> does not need and, IMO, should not be remapped. Current code doesn't
> check for this, but I think that it will do the right thing because
> node_remap() will not remap an invalid node id. However, I think it's
> cleaner to explicitly check in the code. I have a pending
> MPOL_PREFERRED cleanup patch that address this.
>
> That being said, I don't understand what you're trying to do with the
> "then" clause, or rather, why. Looks like you're explicitly dropping
> back to local allocation? Would you/could you add a comment explaining
> the "intent"?
>

Since we're allowed to remap the node to a different node than the user
specified with either syscall, the current behavior is that "one node is
as good as another." In other words, we're trying to accommodate the mode
first by setting pol->v.preferred_node to some accessible node and only
setting that to the user-supplied node if it is available.

If the node isn't available and the user specifically asked that it is not
remapped (with MPOL_F_STATIC_NODES), then I felt local allocation was best
compared to remapping to a node that may be unrelated to the VMA or task.
This preserves a sense of the current behavior that "one node is as good
as another," but the user specifically asked for no remap.

> > break;
> > case MPOL_BIND: {
> > - nodemask_t nodes;
> > - struct zone **z;
> > - struct zonelist *zonelist;
> > -
> > - nodes_clear(nodes);
> > - for (z = pol->v.zonelist->zones; *z; z++)
> > - node_set(zone_to_nid(*z), nodes);
> > - nodes_remap(tmp, nodes, *mpolmask, *newmask);
> > - nodes = tmp;
> > + struct zonelist *zonelist = NULL;
> > +
> > + if (!remap)
> > + nodes_and(tmp, pol->user_nodemask, *newmask);
> > + else {
> > + nodemask_t nodes;
> > + struct zone **z;
> > +
> > + nodes_clear(nodes);
> > + for (z = pol->v.zonelist->zones; *z; z++)
> > + node_set(zone_to_nid(*z), nodes);
> > + nodes_remap(tmp, nodes, *mpolmask, *newmask);
> > + }
> >
> > - zonelist = bind_zonelist(&nodes);
> > + if (nodes_weight(tmp))
> > + zonelist = bind_zonelist(&tmp);
>
> Not sure I understand what's going on here [nor with the comment below
> about "falling thru'" to MPOL_DEFAULT means]. However, this area is
> vastly simplified by Mel Gorman's "two zonelist" patch series. He
> replaces the zonelist with a nodemask, so _BIND can share the
> _INTERLEAVE case code.
>
> Suggest you consider basing atop those.
>

We'll have to see what the merge order turns out to be, because the
patchset you're referring to that modifies this in mm/mempolicy.c is not
currently in -mm.

David

2008-02-13 04:25:44

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 12 Feb 2008, Paul Jackson wrote:

> > I've redone my patchset based on the feedback that I've received
>
> Will you be sending that along soon? I was just getting into my
> review of this patchset, and I suppose it would be better to
> review the latest and greatest.
>

Lee had some questions and comments that I've recently addressed so I
wanted to give him a chance to respond before sending it out again in case
more changes need to be made.

I'll email the current set to you now.

David

2008-02-13 05:07:17

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 12 Feb 2008, David Rientjes wrote:

> Since we're allowed to remap the node to a different node than the user
> specified with either syscall, the current behavior is that "one node is
> as good as another." In other words, we're trying to accommodate the mode
> first by setting pol->v.preferred_node to some accessible node and only
> setting that to the user-supplied node if it is available.
>
> If the node isn't available and the user specifically asked that it is not
> remapped (with MPOL_F_STATIC_NODES), then I felt local allocation was best
> compared to remapping to a node that may be unrelated to the VMA or task.
> This preserves a sense of the current behavior that "one node is as good
> as another," but the user specifically asked for no remap.
>

Upon further inspection, perhaps it's better to fallback to local
allocation when the preferred node is not available on rebind and then, if
it becomes available later, prefer it again.

In mpol_rebind_policy():

case MPOL_PREFERRED:
if (!remap) {
int nid = first_node(pol->user_nodemask);

if (node_isset(nid, *newmask))
pol->v.preferred_node = nid;
else {
/*
* Fallback to local allocation since that
* is the behavior in mpol_new(). The
* node may eventually become available.
*/
pol->v.preferred_node = -1;
}
} else
pol->v.preferred_node = node_remap(pol->v.preferred_node,
*mpolmask, *newmask);
break;

What do you think, Lee?

David

2008-02-13 08:04:00

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

Ok ... I read this patchset a little closer, and see a couple
of items worth noting.

The infamous unpublished (except to a few) patch I drafted on Christmas
(Dec 25, 2007) basically added two new modes for how mempolicy
nodemasks were to be resolved:
1) a static, no remap, mode, such as in this present patchset, and
2) a cpuset relative mode that correctly handled the case of cpusets
growing larger (increased numbers of nodes.)

I'd like to persuade you to add case (2) as well. But I suspect,
given that case (2) was never as compelling to you as it was to me
in our December discussions, that I'll have little luck doing that.

If you choose not to add case (2) to your patchset, then I'll wait
until the dust settles on your patchset, and follow up with a second
patch, adding case (2) and presenting the justification for it then.

Notes and questions on your current patchset:

1) It looks like you -add- a second nodemask to struct mempolicy:
nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */

I believe you can overlay these two nodemasks using a union, and avoid making
struct mempolicy bigger.

2) Those 'mpol_mode()' wrappers on all those mempolicy->policy
evaluations look dangerous to me. It looks like a code bug
waiting to happen. Unless I miss my guess, if you forget one of
those wrappers (or someone making a future change misses one), no
one will notice until sometime at runtime, someone makes use of the
new fangled MPOL_F_STATIC_NODES mode in some new situation, and we
end up executing code that we didn't expect to execute just then.

I urge you to reconsider, and keep it so that the 'policy' field of struct
mempolicy naturally evaluates to just the policy. There should be just one
pair of places, on entry to, and exit from, the kernel, where the code is
aware of the need to pack the mode and the policy into a single word.

3) Does your patchset allow the user to specify a nodemask that includes nodes
outside of the current cpuset with a MPOL_F_STATIC_NODES mode? At first
glance, I didn't see that it did, but I should have looked closer. This
strikes me as an essential aspect of this mode.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 09:37:13

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 13 Feb 2008, Paul Jackson wrote:

> The infamous unpublished (except to a few) patch I drafted on Christmas
> (Dec 25, 2007) basically added two new modes for how mempolicy
> nodemasks were to be resolved:
> 1) a static, no remap, mode, such as in this present patchset, and
> 2) a cpuset relative mode that correctly handled the case of cpusets
> growing larger (increased numbers of nodes.)
>
> I'd like to persuade you to add case (2) as well. But I suspect,
> given that case (2) was never as compelling to you as it was to me
> in our December discussions, that I'll have little luck doing that.
>

MPOL_F_STATIC_NODES already handles the second case because you can
specify nodes that aren't currently accessible because of your cpuset in
the hopes that eventually they will become accessible. It's possible to
pass a nodemask with all bits set so that when the cpuset's set of nodes
expand, the mempolicy is effected over the intersection of the two on
rebind.

Other than that, perhaps if you can elaborate more on what you imagined
supporting as far as cpusets growing larger (or supporting node hotplug,
which is the same type of problem), we can discuss that further before I
resend my patches.

> Notes and questions on your current patchset:
>
> 1) It looks like you -add- a second nodemask to struct mempolicy:
> nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
> nodemask_t user_nodemask; /* nodemask passed by user */
>
> I believe you can overlay these two nodemasks using a union, and avoid making
> struct mempolicy bigger.
>

Ahh, since policy->cpuset_mems_allowed is only meaningful in the
non-MPOL_F_STATIC_NODES case, that probably will work. For the purposes
of this patchset, we can certainly do that. I'm wondering whether future
expansions will require them to be separated again, however.

> 2) Those 'mpol_mode()' wrappers on all those mempolicy->policy
> evaluations look dangerous to me. It looks like a code bug
> waiting to happen. Unless I miss my guess, if you forget one of
> those wrappers (or someone making a future change misses one), no
> one will notice until sometime at runtime, someone makes use of the
> new fangled MPOL_F_STATIC_NODES mode in some new situation, and we
> end up executing code that we didn't expect to execute just then.
>
> I urge you to reconsider, and keep it so that the 'policy' field of struct
> mempolicy naturally evaluates to just the policy. There should be just one
> pair of places, on entry to, and exit from, the kernel, where the code is
> aware of the need to pack the mode and the policy into a single word.
>

Ok.

> 3) Does your patchset allow the user to specify a nodemask that includes nodes
> outside of the current cpuset with a MPOL_F_STATIC_NODES mode? At first
> glance, I didn't see that it did, but I should have looked closer. This
> strikes me as an essential aspect of this mode.
>

It does, and that's why I was a little curious about your second case at
the beginning of this email.

The user's nodemask is always stored unaltered in policy->user_nodemask.
In mpol_new(), we intersect the current accessible nodes with that
nodemask to determine if there's even a mempolicy to effect. If not,
mpol_new() returns ERR_PTR(-EINVAL) and we fall back to the existing
policy (if any) for the VMA or task. Otherwise, we use the intersection
of those two nodemasks.

In mpol_rebind_policy() with MPOL_F_STATIC_NODES, we always intersect
policy->user_nodemask with the set of accessible nodes (nodemask_t
*newmask). So if we can now access nodes that we previously couldn't,
they are now part of the interleave nodemask. A similar situation occurs
when building the zonelist for the MPOL_BIND case and you can see the
change I made to MPOL_PREFERRED in the incremental patch I sent you.

The only way that newly-accessible nodes will not become a part of the
mempolicy nodemask is when the user's nodemask and the set of accessible
nodes is disjoint when the policy was created.

It is arguable whether we want to support that behavior as well (and we
definitely could do it, it's not out of the scope of mempolicies). Lee
had specific requirements of rejecting nodemasks that had no nodes in the
intersection based on the current implementation, but we could continue
discussing the possibility of setting up mempolicies that are unaffected
when they are created and only become policy when they are rebound later.

David

2008-02-13 15:15:33

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 2008-02-12 at 21:06 -0800, David Rientjes wrote:
> On Tue, 12 Feb 2008, David Rientjes wrote:
>
> > Since we're allowed to remap the node to a different node than the user
> > specified with either syscall, the current behavior is that "one node is
> > as good as another." In other words, we're trying to accommodate the mode
> > first by setting pol->v.preferred_node to some accessible node and only
> > setting that to the user-supplied node if it is available.
> >
> > If the node isn't available and the user specifically asked that it is not
> > remapped (with MPOL_F_STATIC_NODES), then I felt local allocation was best
> > compared to remapping to a node that may be unrelated to the VMA or task.
> > This preserves a sense of the current behavior that "one node is as good
> > as another," but the user specifically asked for no remap.
> >
>
> Upon further inspection, perhaps it's better to fallback to local
> allocation when the preferred node is not available on rebind and then, if
> it becomes available later, prefer it again.
>
> In mpol_rebind_policy():
>
> case MPOL_PREFERRED:
> if (!remap) {
> int nid = first_node(pol->user_nodemask);
>
> if (node_isset(nid, *newmask))
> pol->v.preferred_node = nid;
> else {
> /*
> * Fallback to local allocation since that
> * is the behavior in mpol_new(). The
> * node may eventually become available.
> */
> pol->v.preferred_node = -1;
> }
> } else
> pol->v.preferred_node = node_remap(pol->v.preferred_node,
> *mpolmask, *newmask);
> break;
>
> What do you think, Lee?

I agree. I was still puzzled by your patch in this area as it appeared
to me that you weren't remapping back to your original preferred node,
which seemed to be the whole point of "static". I was going to ask
about this today, but you caught it first!

Lee

>
> David

2008-02-13 16:01:47

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 2008-02-13 at 01:36 -0800, David Rientjes wrote:
> On Wed, 13 Feb 2008, Paul Jackson wrote:
>
> > The infamous unpublished (except to a few) patch I drafted on Christmas
> > (Dec 25, 2007) basically added two new modes for how mempolicy
> > nodemasks were to be resolved:
> > 1) a static, no remap, mode, such as in this present patchset, and
> > 2) a cpuset relative mode that correctly handled the case of cpusets
> > growing larger (increased numbers of nodes.)
> >
> > I'd like to persuade you to add case (2) as well. But I suspect,
> > given that case (2) was never as compelling to you as it was to me
> > in our December discussions, that I'll have little luck doing that.
> >
>
> MPOL_F_STATIC_NODES already handles the second case because you can
> specify nodes that aren't currently accessible because of your cpuset in
> the hopes that eventually they will become accessible. It's possible to
> pass a nodemask with all bits set so that when the cpuset's set of nodes
> expand, the mempolicy is effected over the intersection of the two on
> rebind.
>
> Other than that, perhaps if you can elaborate more on what you imagined
> supporting as far as cpusets growing larger (or supporting node hotplug,
> which is the same type of problem), we can discuss that further before I
> resend my patches.
>
> > Notes and questions on your current patchset:
> >
> > 1) It looks like you -add- a second nodemask to struct mempolicy:
> > nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
> > nodemask_t user_nodemask; /* nodemask passed by user */
> >
> > I believe you can overlay these two nodemasks using a union, and avoid making
> > struct mempolicy bigger.
> >
>
> Ahh, since policy->cpuset_mems_allowed is only meaningful in the
> non-MPOL_F_STATIC_NODES case, that probably will work. For the purposes
> of this patchset, we can certainly do that. I'm wondering whether future
> expansions will require them to be separated again, however.
>
> > 2) Those 'mpol_mode()' wrappers on all those mempolicy->policy
> > evaluations look dangerous to me. It looks like a code bug
> > waiting to happen. Unless I miss my guess, if you forget one of
> > those wrappers (or someone making a future change misses one), no
> > one will notice until sometime at runtime, someone makes use of the
> > new fangled MPOL_F_STATIC_NODES mode in some new situation, and we
> > end up executing code that we didn't expect to execute just then.
> >
> > I urge you to reconsider, and keep it so that the 'policy' field of struct
> > mempolicy naturally evaluates to just the policy. There should be just one
> > pair of places, on entry to, and exit from, the kernel, where the code is
> > aware of the need to pack the mode and the policy into a single word.
> >
>
> Ok.

I was hoping David wouldn't cave on this point. ;-) I do agree with
David that if we ever end up with nr_modes + nr_mode_flags > 16, the API
will become too complex for myself, at least, to comprehend. I don't
feel too strongly either way, and I'll explain more below, but first:

I do agree with Paul that there is a danger of missing one of these in
converting existing code. In fact, I missed one in my ref counting
rework patch that I will ultimately rebase atop David's final version
[I'm already dependent on Mel Gorman's patch series]. To catch this, I
decided to rename the "policy" member to "mode". This also aligns the
struct better with the numa_memory_policy doc [I think of the "policy"
as the tuple: mode + optional nodemask]. For future code, I think we
could add comments to the struct definition warning that one should use
the wrappers to access the mode or mode flags.

However, let's step back and look at the usage of the mempolicy struct.
In earlier mail, David mentioned that it is passed around outside of
mempolicy.c. While this is true, the only place that I see where code
outside of mempolicy.c deals with the policy/mode and nodemask
separately--as opposed to using an opaque mempolicy struct--is in the
shmem mount option parsing. Everywhere else, as far as I can see, just
uses the struct mempolicy. Even slab/slub call into slab_node() in
mempolicy.c to extract the target node for the policy.

So, if David does accept Paul's plea to separate the mode and the mode
flags in the structure, I would suggest that we do this in mpol_new().
That is, in {sys|do}_{set_mempolicy|mbind}(), maintain the API format
with the mode flags ORed into the mode. mpol_new() can pull them apart
into different struct mempolicy members. The rest of mempolicy.c can
use them directly from the struct without the helpers. get_mempolicy()
will need to pack the mode and flags back together--perhaps with an
inverse helper function.

As for the shmem mount option parsing: For now, I'd suggest that we
keep the mode flags OR'd into the "policy" member of the shmem_sb_info
struct -- i.e., the "API format"--to preserve the mpol_new() interface.
At some point [no urgency, I think], I'd like to replace the
shmem_sb_info policy and nodemask members with a struct mempolicy or
pointer thereto, and instead of using mpol_new() to create the mempolicy
for new tmpfs inodes, use mpol_copy() [which should probably be renamed
to mpol_dup(), 'cause that's what it does]. I'd also like to move
shmem_parse_mpol() to mempolicy.c as a more general memory policy
parser. shmem_show_mpol() can probably be reworked to use mpol_to_str()
used by show_numa_map(). Again, no urgency. I'd rather see David's
and Mel's and my pending mempolicy patches settle down and get in first.
[Too many interdependent out of tree mempolicy patches, already :-(]

>
> > 3) Does your patchset allow the user to specify a nodemask that includes nodes
> > outside of the current cpuset with a MPOL_F_STATIC_NODES mode? At first
> > glance, I didn't see that it did, but I should have looked closer. This
> > strikes me as an essential aspect of this mode.
> >
>
> It does, and that's why I was a little curious about your second case at
> the beginning of this email.
>
> The user's nodemask is always stored unaltered in policy->user_nodemask.
> In mpol_new(), we intersect the current accessible nodes with that
> nodemask to determine if there's even a mempolicy to effect. If not,
> mpol_new() returns ERR_PTR(-EINVAL) and we fall back to the existing
> policy (if any) for the VMA or task. Otherwise, we use the intersection
> of those two nodemasks.
>
> In mpol_rebind_policy() with MPOL_F_STATIC_NODES, we always intersect
> policy->user_nodemask with the set of accessible nodes (nodemask_t
> *newmask). So if we can now access nodes that we previously couldn't,
> they are now part of the interleave nodemask. A similar situation occurs
> when building the zonelist for the MPOL_BIND case and you can see the
> change I made to MPOL_PREFERRED in the incremental patch I sent you.
>
> The only way that newly-accessible nodes will not become a part of the
> mempolicy nodemask is when the user's nodemask and the set of accessible
> nodes is disjoint when the policy was created.
>
> It is arguable whether we want to support that behavior as well (and we
> definitely could do it, it's not out of the scope of mempolicies). Lee
> had specific requirements of rejecting nodemasks that had no nodes in the
> intersection based on the current implementation, but we could continue
> discussing the possibility of setting up mempolicies that are unaffected
> when they are created and only become policy when they are rebound later.

Note: I agree that the 'STATIC' flag overrides the argument checking
you refer to above. That's new behavior that the application has
indicated that it wants to use. However, methinks that MPOL_DEFAULT|
MPOL_F_STATIC_NODES doesn't make much sense.

Regards,
Lee

2008-02-13 16:14:53

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Tue, 2008-02-12 at 20:18 -0800, David Rientjes wrote:
> On Tue, 12 Feb 2008, Lee Schermerhorn wrote:
>
> > > Adds another member to struct mempolicy,
> > >
> > > nodemask_t user_nodemask
> > >
> > > that stores the nodemask that the user passed when he or she created
> > > the mempolicy via set_mempolicy() or mbind(). When using
> > > MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's
> > > passed nodemask is always used when determining the preferred node,
> > > building the MPOL_BIND zonelist, or creating the interleave nodemask.
> > > This happens whenever the policy is rebound, including when a task's
> > > cpuset assignment changes or the cpuset's mems are changed.
> >
> > When you say that the user's nodemask is "always used" you mean "after
> > cpuset contextualization", right? I.e., after masking with mems_allowed
> > [which also restricts to nodes with memory]. That is what the code
> > still does.
> >
>
> Yeah, I'm not introducing a loophole so that tasks can access memory to
> which they don't have access. I've clarified that for the next version.

I thought so. The description just seemed ambiguous to me. Thanks for
the clarification.
>
> > > It is also possible to mount tmpfs with the static nodemask behavior when
> > > specifying a node or nodemask. To do this, simply add "=static"
> > > immediately following the mempolicy mode at mount time:
> > >
> > > mount -o remount mpol=interleave=static:1-3
> > >
> > > Also removes mpol_check_policy() and folds some of its logic into
> > > mpol_new() since it is now mostly obsoleted.
> >
> > Couple of comments here:
> >
> > 1) we've discussed the issue of returning EINVAL for non-empty nodemasks
> > with MPOL_DEFAULT. By removing this restriction, we run the risk of
> > breaking applications if we should ever want to define a semantic to
> > non-empty node mask for MPOL_DEFAULT. I think this is probably
> > unlikely, but consider the new mode flags part of the mode/policy
> > parameter: by not rejecting undefined flags, we may break applications
> > by later defining additional flags. I'd argue for fairly strict
> > argument checking.
> >
>
> As I've mentioned in a parallel thread, I've folded the entire logic of
> mpol_check_policy() as it stands this minute in Linus' tree into
> mpol_new().

OK.
>
> > 2) Before this patch, policy_nodemask from the shmem_sb_info was NOT
> > masked with allowed nodes because we passed this mask directly to
> > mpol_new() without "contextualization". We DO guarantee that this
> > policy nodemask contains only nodes with memory [see
> > shmem_parse_mpol()]. Now that you've moved the contextualization to
> > mpol_new(), we are masking this policy with the cpuset mems allowed.
> > This is a change in behavior. Do we want this? I.e., are we preserving
> > the "intent" of the system administrator in setting the tmpfs policy?
> > Maybe they wanted to shared interleaved shmem areas between cpusets with
> > disjoint mems allowed. Nothing prevents this...
> >
>
> We're still saving the intent in the new policy->user_nodemask field so
> any future rebinds will still allow inaccessible nodes to be affected by the
> mempolicy if the permissions are changed later.
>
> > If we just want to preserve existing behavior, we can define an
> > additional mode flag that we set in the sbinfo policy in
> > shmem_parse_mpol() and test in mpol_new(). If we want to be able to
> > specify existing or new behavior, we can use the same flag, but set it
> > or not based on an additional qualifier specified via the mount option.
> >
>
> Shmem areas between cpusets with disjoint mems_allowed seems like an error
> in userspace to me and I would prefer that mpol_new() reject it outright.

I don't think so, and I'd like to explain why. Perhaps this is too big
a topic to cover in the context of this patch response. I think I'll
start another thread. Suffice it to say here that cpusets and
mempolicies don't constrain, in anyway, the ability of tasks in a cpuset
to attach to shmem segments created by tasks in other, perhaps disjoint,
cpusets. The cpusets and mempolicies DO constrain page allocations,
however. This can result in OOM faults for tasks in cpusets whose
mems_allowed contains no overlap with a shared policy installed by the
creating task. This can be avoided if the task that creates the shmem
segment faults in all of the pages and SHM_LOCKs them down. Whether
this is sufficient for all applications is subject of the other
discussion. In any case, I think we need to point this out in mempolicy
and cpuset man pages.

>
> > > @@ -218,21 +167,27 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> > > return ERR_PTR(-ENOMEM);
> > > flags &= MPOL_MODE_FLAGS;
> > > atomic_set(&policy->refcnt, 1);
> > > + cpuset_update_task_memory_state();
> > > + nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
> > > switch (mode) {
> > > case MPOL_INTERLEAVE:
> > > - policy->v.nodes = *nodes;
> > > + if (nodes_empty(*nodes))
> > > + return ERR_PTR(-EINVAL);
> > > + policy->v.nodes = cpuset_context_nmask;
> > > if (nodes_weight(policy->v.nodes) == 0) {
> > > kmem_cache_free(policy_cache, policy);
> > > return ERR_PTR(-EINVAL);
> > > }
> > > break;
> > > case MPOL_PREFERRED:
> > > - policy->v.preferred_node = first_node(*nodes);
> > > + policy->v.preferred_node = first_node(cpuset_context_nmask);
> > > if (policy->v.preferred_node >= MAX_NUMNODES)
> > > policy->v.preferred_node = -1;
> > > break;
> > > case MPOL_BIND:
> > > - policy->v.zonelist = bind_zonelist(nodes);
> > > + if (nodes_empty(*nodes))
> >
> > Kosaki-san already pointed out the need to free the policy struct
> > here.
> >
>
> I folded your fix to have only one
>
> kmem_cache_free(policy_cache, policy);
> return ERR_PTR(error_code);
>
> point in mpol_new().
>
> > > @@ -1780,51 +1737,67 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> > > if (nodes_equal(*mpolmask, *newmask))
> > > return;
> > >
> > > + remap = !(mpol_flags(pol->policy) & MPOL_F_STATIC_NODES);
> > > switch (mpol_mode(pol->policy)) {
> > > case MPOL_DEFAULT:
> > > break;
> > > case MPOL_INTERLEAVE:
> > > - nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
> > > + if (!remap)
> > > + nodes_and(tmp, pol->user_nodemask, *newmask);
> > > + else
> > > + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
> > > pol->v.nodes = tmp;
> > > - *mpolmask = *newmask;
> > > - current->il_next = node_remap(current->il_next,
> > > - *mpolmask, *newmask);
> > > + if (!node_isset(current->il_next, tmp)) {
> > > + current->il_next = next_node(current->il_next, tmp);
> > > + if (current->il_next >= MAX_NUMNODES)
> > > + current->il_next = first_node(tmp);
> > > + if (current->il_next >= MAX_NUMNODES)
> > > + current->il_next = numa_node_id();
> > > + }
> > > break;
> > > case MPOL_PREFERRED:
> > > - pol->v.preferred_node = node_remap(pol->v.preferred_node,
> > > - *mpolmask, *newmask);
> > > - *mpolmask = *newmask;
> > > + if (!remap && !node_isset(pol->v.preferred_node, *newmask))
> > > + pol->v.preferred_node = -1;
> > > + else
> > > + pol->v.preferred_node = node_remap(pol->v.preferred_node,
> > > + *mpolmask, *newmask);
> >
> > Note: "local allocation" [MPOL_PREFERRED with preferred_node == -1]
> > does not need and, IMO, should not be remapped. Current code doesn't
> > check for this, but I think that it will do the right thing because
> > node_remap() will not remap an invalid node id. However, I think it's
> > cleaner to explicitly check in the code. I have a pending
> > MPOL_PREFERRED cleanup patch that address this.
> >
> > That being said, I don't understand what you're trying to do with the
> > "then" clause, or rather, why. Looks like you're explicitly dropping
> > back to local allocation? Would you/could you add a comment explaining
> > the "intent"?
> >
>
> Since we're allowed to remap the node to a different node than the user
> specified with either syscall, the current behavior is that "one node is
> as good as another." In other words, we're trying to accommodate the mode
> first by setting pol->v.preferred_node to some accessible node and only
> setting that to the user-supplied node if it is available.
>
> If the node isn't available and the user specifically asked that it is not
> remapped (with MPOL_F_STATIC_NODES), then I felt local allocation was best
> compared to remapping to a node that may be unrelated to the VMA or task.
> This preserves a sense of the current behavior that "one node is as good
> as another," but the user specifically asked for no remap.

Yeah. I went back and read the update to the mempolicy doc where you
make it clear that this is the semantic of 'STATIC_NODES. You also
mentioned it in the patch description, but I didn't "get it". Sorry.

Still a comment in the code would, I think, help future spelunkers.

>
> > > break;
> > > case MPOL_BIND: {
> > > - nodemask_t nodes;
> > > - struct zone **z;
> > > - struct zonelist *zonelist;
> > > -
> > > - nodes_clear(nodes);
> > > - for (z = pol->v.zonelist->zones; *z; z++)
> > > - node_set(zone_to_nid(*z), nodes);
> > > - nodes_remap(tmp, nodes, *mpolmask, *newmask);
> > > - nodes = tmp;
> > > + struct zonelist *zonelist = NULL;
> > > +
> > > + if (!remap)
> > > + nodes_and(tmp, pol->user_nodemask, *newmask);
> > > + else {
> > > + nodemask_t nodes;
> > > + struct zone **z;
> > > +
> > > + nodes_clear(nodes);
> > > + for (z = pol->v.zonelist->zones; *z; z++)
> > > + node_set(zone_to_nid(*z), nodes);
> > > + nodes_remap(tmp, nodes, *mpolmask, *newmask);
> > > + }
> > >
> > > - zonelist = bind_zonelist(&nodes);
> > > + if (nodes_weight(tmp))
> > > + zonelist = bind_zonelist(&tmp);
> >
> > Not sure I understand what's going on here [nor with the comment below
> > about "falling thru'" to MPOL_DEFAULT means]. However, this area is
> > vastly simplified by Mel Gorman's "two zonelist" patch series. He
> > replaces the zonelist with a nodemask, so _BIND can share the
> > _INTERLEAVE case code.
> >
> > Suggest you consider basing atop those.
> >
>
> We'll have to see what the merge order turns out to be, because the
> patchset you're referring to that modifies this in mm/mempolicy.c is not
> currently in -mm.

Right. I was hoping I could entice you to join the chorus in support
of Mel's patches because of the way they simplify the mempolicy
work. :-)

Lee
>
> David

2008-02-13 17:05:57

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

David, responding to pj, write:
> > 2) a cpuset relative mode that correctly handled the case of cpusets
> > growing larger (increased numbers of nodes.)
> >
> > I'd like to persuade you to add case (2) as well. But I suspect,
> > given that case (2) was never as compelling to you as it was to me
> > in our December discussions, that I'll have little luck doing that.
> >
>
> MPOL_F_STATIC_NODES already handles the second case because you can
> specify nodes that aren't currently accessible because of your cpuset in
> the hopes that eventually they will become accessible.

No -- MPOL_F_STATIC_NODES does not handle my second case. Notice the
phrase 'cpuset relative.'

In my second case, nodes are numbered relative to the cpuset. If you
say "node 2" then you mean whatever is the third (since numbering is
zero based) node in your current cpuset.

This cpuset relative mode that I'm proposing is an entirely different
mode of numbering nodes than anything we've seen so far (except for our
side discussions with just yourself, Lee and Christoph, last December.)

In this mode, "node 2" doesn't mean what the system calls "node 2"; it
means the third node in whatever is ones current cpuset placement (if
your cpuset even has that many nodes), and mempolicies using this mode
are automatically remapped by the kernel, anytime the cpuset placement
changes.

This second, cpuset relative, mode is required:

1) to provide a race-free means for an application to place its memory
when the application considers all physical nodes essentially
equivalent, and just wants to lay things out relative to whatever
cpuset it happens to be running in, and

2) to provide a practical means, without the need for constantly
reprobing ones current cpuset placement, for an application to
specify cpuset-relative placement to be applied whenever the
application is placed in a larger cpuset, even if the application
is presently in a smaller cpuset.

Without it, the application has to first query its current physical
cpuset placement, then calculate its desired relative placement into
terms of the current physical nodes it finds itself on, and then issue
various mbind or set_mempolicy calls using those current physical node
numbers, praying that it wasn't migrated to other physical nodes at the
same time, which would have invalidated its current placement
assumptions in its relative to physical node number calculations.

And without it, if an application is currently placed in a smaller cpuset
than it would prefer, it cannot specify how it would want its mempolicies
should it ever subsequently be moved to a larger cpuset. This leaves such
an application with little option other than to constantly reprobe its
cpuset placement, in order to know if and when it needs to reissue its
mbind and set_mempolicy calls because it gained access to a larger cpuset.

I agree, David, that this present MPOL_F_STATIC_NODES patch handles the
case of a growing cpuset (or hotplug added nodes) for the static mapped
case (node "2" means physical system node "2", always.) But this
present patch, by design, does not address the case of a growing cpuset
for the case where an application actually wants its mempolicies remapped.

My original code remapping mempolicies when cpuset placement changes,
that has been in the kernel for a couple of years now, was -supposed-
to handle this relative case. But it is flawed, as it fails to meet
the requirements I list above. We can't just change the way that mode
works now, for compatibility reasons. So we need to add a new cpuset
relative mode, just as you're adding a STATIC mode, that addresses the
above requirements for a cpuset relative, remapped, numbering of
mempolicy nodes.


> Other than that, perhaps if you can elaborate more on what you imagined
> supporting as far as cpusets growing larger (or supporting node hotplug,
> which is the same type of problem), we can discuss that further before I
> resend my patches.

Ok - I tried to elaborate, above. You (David), Lee and Christoph will
perhaps recognize this elaboration, as it essentially repeats things
I said in our earlier discussion in December and early January.

Yes - hotplug presents the same problems as cpusets growing larger.

If this makes sense to you, David, and you'd like to include this
second, cpuset relative mode, in your patchset, that would be excellent.

Given that I have not been very good at explaining this second mode
you might choose not to do that; in that case I'll have to follow up,
after your second patch shapes up, with a patch of my own, adding this
cpuset relative mode.


> > I believe you can overlay these two nodemasks using a union, and
> > avoid making struct mempolicy bigger.
>
> Ahh, since policy->cpuset_mems_allowed is only meaningful in the
> non-MPOL_F_STATIC_NODES case, that probably will work. For the purposes
> of this patchset, we can certainly do that. I'm wondering whether future
> expansions will require them to be separated again, however.

I'd suggest we let future expansions deal with their own needs. We
don't usually pad internal (not externally visible) data structures
in anticipation that someone else might need the space in the future.

At least earlier, Andi Kleen, when he was the primary author and sole
active maintainer of this mempolicy code, was always keen to avoid
expanding the size of 'struct mempolicy' by another nodemask.

I have not done the calculations myself to know how vital it is to
keep the size of struct mempolicy to a minimum. It certainly seems
worth a bit of effort, however, if adding this union of these two
nodemasks doesn't complicate the code too horribly much.

> > I urge you to reconsider, and keep it so that the 'policy' field of struct
> > mempolicy naturally evaluates to just the policy. There should be just one
> > pair of places, on entry to, and exit from, the kernel, where the code is
> > aware of the need to pack the mode and the policy into a single word.
> >
>
> Ok.

Cool. Thanks. (I'm glad you caved ... ;). Looking forward in my inbox, I see
that Lee has some suggestions on where to handle the conversion between the
packed mode and the separate fields. I'm too lazy to think about that more,
and will likely acquiesce to whatever you and Lee agree to.


> > 3) Does your patchset allow the user to specify a nodemask that includes nodes
> > outside of the current cpuset with a MPOL_F_STATIC_NODES mode? At first
> > glance, I didn't see that it did, but I should have looked closer. This
> > strikes me as an essential aspect of this mode.
> >
>
> It does, and that's why I was a little curious about your second case at
> the beginning of this email.
>
> The user's nodemask is always stored unaltered in policy->user_nodemask.

Ah - good. I missed that. Just to be sure I'm reading the code right,
I take it that it is the following line, at the end of the mpol_new()
routine, that stores the unaltered user nodemask ... right?

policy->user_nodemask = *nodes;


> The only way that newly-accessible nodes will not become a part of the
> mempolicy nodemask is when the user's nodemask and the set of accessible
> nodes is disjoint when the policy was created.
>
> It is arguable whether we want to support that behavior as well (and we
> definitely could do it, it's not out of the scope of mempolicies). Lee
> had specific requirements of rejecting nodemasks that had no nodes in the
> intersection based on the current implementation, but we could continue
> discussing the possibility of setting up mempolicies that are unaffected
> when they are created and only become policy when they are rebound later.

I would be inclined toward having the classic compatibility mode (no
such mode as MPOL_F_STATIC_NODES specified) continue to do as it always
has done, which apparently includes failing EINVAL in some cases due to
an empty nodemask intersection with the current cpuset.

But I would also expect that this new MPOL_F_STATIC_NODES would allow
specifying any nodes, whether or not any of them were in the tasks
current cpuset.

Looking ahead, I think Lee is saying pretty much the same thing.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 18:49:32

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 13 Feb 2008, Lee Schermerhorn wrote:

> > > 2) Those 'mpol_mode()' wrappers on all those mempolicy->policy
> > > evaluations look dangerous to me. It looks like a code bug
> > > waiting to happen. Unless I miss my guess, if you forget one of
> > > those wrappers (or someone making a future change misses one), no
> > > one will notice until sometime at runtime, someone makes use of the
> > > new fangled MPOL_F_STATIC_NODES mode in some new situation, and we
> > > end up executing code that we didn't expect to execute just then.
> > >
> > > I urge you to reconsider, and keep it so that the 'policy' field of struct
> > > mempolicy naturally evaluates to just the policy. There should be just one
> > > pair of places, on entry to, and exit from, the kernel, where the code is
> > > aware of the need to pack the mode and the policy into a single word.
> > >
> >
> > Ok.
>
> I was hoping David wouldn't cave on this point. ;-) I do agree with
> David that if we ever end up with nr_modes + nr_mode_flags > 16, the API
> will become too complex for myself, at least, to comprehend. I don't
> feel too strongly either way, and I'll explain more below, but first:
>

Hmm, I don't remember "caving" on this point; Paul asked me to reconsider
and I said that I would. I think it's also helpful to have more
discussion on the matter by including other people's opinions.

> I do agree with Paul that there is a danger of missing one of these in
> converting existing code. In fact, I missed one in my ref counting
> rework patch that I will ultimately rebase atop David's final version
> [I'm already dependent on Mel Gorman's patch series]. To catch this, I
> decided to rename the "policy" member to "mode". This also aligns the
> struct better with the numa_memory_policy doc [I think of the "policy"
> as the tuple: mode + optional nodemask]. For future code, I think we
> could add comments to the struct definition warning that one should use
> the wrappers to access the mode or mode flags.
>

I considered this when I implemented it the way I did, and that was part
of the reason I was in favor of enum mempolicy_mode so much: functions
that had a formal of type 'enum mempolicy_mode' were only the mode and
formals of type 'int' or 'unsigned short' were the combination. I even
stated that difference in Documentation/vm/numa_memory_policy.txt in the
first patchset.

> However, let's step back and look at the usage of the mempolicy struct.
> In earlier mail, David mentioned that it is passed around outside of
> mempolicy.c. While this is true, the only place that I see where code
> outside of mempolicy.c deals with the policy/mode and nodemask
> separately--as opposed to using an opaque mempolicy struct--is in the
> shmem mount option parsing. Everywhere else, as far as I can see, just
> uses the struct mempolicy. Even slab/slub call into slab_node() in
> mempolicy.c to extract the target node for the policy.
>

Yes, struct shmem_sb_info stores the 'policy' member as well so it would
need to include a new 'flags' member as well. And the interface between
the two, mpol_shared_policy_init() would need another formal for the
flags.

> So, if David does accept Paul's plea to separate the mode and the mode
> flags in the structure, I would suggest that we do this in mpol_new().
> That is, in {sys|do}_{set_mempolicy|mbind}(), maintain the API format
> with the mode flags ORed into the mode. mpol_new() can pull them apart
> into different struct mempolicy members. The rest of mempolicy.c can
> use them directly from the struct without the helpers. get_mempolicy()
> will need to pack the mode and flags back together--perhaps with an
> inverse helper function.
>

Yeah, it gets tricky. I'm not too sure about only pulling the mode and
flags apart at mpol_new() time because perhaps, in the future, there will
be flag and mode combinations that are only applicable for set_mempolicy()
and not for mbind(), for example. I can imagine that someday we may add a
flag that applies only to a task mempolicy and not to a VMA mempolicy.

> As for the shmem mount option parsing: For now, I'd suggest that we
> keep the mode flags OR'd into the "policy" member of the shmem_sb_info
> struct -- i.e., the "API format"--to preserve the mpol_new() interface.

I don't like this because its not consistent. It increases the confusion
of what contains a mode and what contains the combination. I think the
safest thing to do, if we separate the mode and flags out in struct
mempolicy is to do it everywhere. All helper functions will need
additional formals similar to what I did in the first patchset (that was
actually unpopular) and struct shmem_sb_info need to be modified to
include the additional member.

In other words, I'm all in favor of storing the mode and flags however we
want, but I think it should be done consistently throughout the tree.
While mpol_mode() wrappers may not be favorable all over the place (and I
didn't miss any), it may be easier than the alternative.

David

2008-02-13 18:58:34

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

David wrote:
> Yeah, it gets tricky.

If you can stand to read my code, look at that big patch I sent you
and Lee, dated "Sun, 23 Dec 2007 22:24:02 -0600", under the Subject
of "Re: [RFC] cpuset relative memory policies - second choice",
where I did this, keeping the 'policy' field unchanged in struct
mempolicy and adding additional fields for the (three, in my case)
new modes.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 19:03:33

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 13 Feb 2008, Paul Jackson wrote:

> No -- MPOL_F_STATIC_NODES does not handle my second case. Notice the
> phrase 'cpuset relative.'
>
> In my second case, nodes are numbered relative to the cpuset. If you
> say "node 2" then you mean whatever is the third (since numbering is
> zero based) node in your current cpuset.
>

Ok, so this truly is a new feature that isn't addressed either by the
current implementation or my patchset. Fair enough.

You're specifically trying to avoid having the application know about its
cpuset placement with regard to mems at the time it sets up its mempolicy,
right? Otherwise it could already setup this relative nodemask by
selecting node 2, from your example above, in its mems_allowed and it
would always be remapped appropriately.

> In this mode, "node 2" doesn't mean what the system calls "node 2"; it
> means the third node in whatever is ones current cpuset placement (if
> your cpuset even has that many nodes), and mempolicies using this mode
> are automatically remapped by the kernel, anytime the cpuset placement
> changes.
>
> This second, cpuset relative, mode is required:
>
> 1) to provide a race-free means for an application to place its memory
> when the application considers all physical nodes essentially
> equivalent, and just wants to lay things out relative to whatever
> cpuset it happens to be running in, and
>
> 2) to provide a practical means, without the need for constantly
> reprobing ones current cpuset placement, for an application to
> specify cpuset-relative placement to be applied whenever the
> application is placed in a larger cpuset, even if the application
> is presently in a smaller cpuset.
>

So, for example, if the task is bound by mems 1-3, and it asks for
MPOL_INTERLEAVE over 2-4, then initially the mempolicy is only effected
over node 3 and if it's later expanded to mems 1-8, then the mempolicy is
effected over nodes 3-5, right?

And if the mems change to 3-8, the mempolicy is remapped to 5-7 even
though 3-5 (which it already was interleaving over) is still accessible?

> I agree, David, that this present MPOL_F_STATIC_NODES patch handles the
> case of a growing cpuset (or hotplug added nodes) for the static mapped
> case (node "2" means physical system node "2", always.) But this
> present patch, by design, does not address the case of a growing cpuset
> for the case where an application actually wants its mempolicies remapped.
>

Does MPOL_INTERLEAVE | MPOL_F_STATIC_NODES | MPOL_F_PAULS_NEW_FLAG make
any logical sense? If it does, I think we're going to be writing some
very complex remap code in our future.

> > Ahh, since policy->cpuset_mems_allowed is only meaningful in the
> > non-MPOL_F_STATIC_NODES case, that probably will work. For the purposes
> > of this patchset, we can certainly do that. I'm wondering whether future
> > expansions will require them to be separated again, however.
>
> I'd suggest we let future expansions deal with their own needs. We
> don't usually pad internal (not externally visible) data structures
> in anticipation that someone else might need the space in the future.
>
> At least earlier, Andi Kleen, when he was the primary author and sole
> active maintainer of this mempolicy code, was always keen to avoid
> expanding the size of 'struct mempolicy' by another nodemask.
>
> I have not done the calculations myself to know how vital it is to
> keep the size of struct mempolicy to a minimum. It certainly seems
> worth a bit of effort, however, if adding this union of these two
> nodemasks doesn't complicate the code too horribly much.
>

I think it will work very nicely and the benefit is immediately obvious
for systems that have large nodemasks.

> > > I urge you to reconsider, and keep it so that the 'policy' field of struct
> > > mempolicy naturally evaluates to just the policy. There should be just one
> > > pair of places, on entry to, and exit from, the kernel, where the code is
> > > aware of the need to pack the mode and the policy into a single word.
> > >
> >
> > Ok.
>
> Cool. Thanks. (I'm glad you caved ... ;). Looking forward in my inbox, I see
> that Lee has some suggestions on where to handle the conversion between the
> packed mode and the separate fields. I'm too lazy to think about that more,
> and will likely acquiesce to whatever you and Lee agree to.
>

Well, I didn't cave on anything, I said that we can reconsider it in the
hopes that other people would add their feedback. I think continuing to
discuss this matter with yourself and Lee (and whomever else is
interested) will lead us to the correct solution. Since this is an
internal implementation detail, I think it's important to hear other
people's opinions since we're the ones who will be hacking the code in the
future so it's really our opinions that matter.

> > The user's nodemask is always stored unaltered in policy->user_nodemask.
>
> Ah - good. I missed that. Just to be sure I'm reading the code right,
> I take it that it is the following line, at the end of the mpol_new()
> routine, that stores the unaltered user nodemask ... right?
>
> policy->user_nodemask = *nodes;
>

Yes.

> > The only way that newly-accessible nodes will not become a part of the
> > mempolicy nodemask is when the user's nodemask and the set of accessible
> > nodes is disjoint when the policy was created.
> >
> > It is arguable whether we want to support that behavior as well (and we
> > definitely could do it, it's not out of the scope of mempolicies). Lee
> > had specific requirements of rejecting nodemasks that had no nodes in the
> > intersection based on the current implementation, but we could continue
> discussing the possibility of setting up mempolicies that are unaffected
> when they are created and only become policy when they are rebound later.
>
> I would be inclined toward having the classic compatibility mode (no
> such mode as MPOL_F_STATIC_NODES specified) continue to do as it always
> has done, which apparently includes failing EINVAL in some cases due to
> an empty nodemask intersection with the current cpuset.
>

And it does in my latest series, which I sent to you last night.

David

2008-02-13 19:05:28

by Lee Schermerhorn

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 2008-02-13 at 10:48 -0800, David Rientjes wrote:
> On Wed, 13 Feb 2008, Lee Schermerhorn wrote:
>
> > > > 2) Those 'mpol_mode()' wrappers on all those mempolicy->policy
> > > > evaluations look dangerous to me. It looks like a code bug
> > > > waiting to happen. Unless I miss my guess, if you forget one of
> > > > those wrappers (or someone making a future change misses one), no
> > > > one will notice until sometime at runtime, someone makes use of the
> > > > new fangled MPOL_F_STATIC_NODES mode in some new situation, and we
> > > > end up executing code that we didn't expect to execute just then.
> > > >
> > > > I urge you to reconsider, and keep it so that the 'policy' field of struct
> > > > mempolicy naturally evaluates to just the policy. There should be just one
> > > > pair of places, on entry to, and exit from, the kernel, where the code is
> > > > aware of the need to pack the mode and the policy into a single word.
> > > >
> > >
> > > Ok.
> >
> > I was hoping David wouldn't cave on this point. ;-) I do agree with
> > David that if we ever end up with nr_modes + nr_mode_flags > 16, the API
> > will become too complex for myself, at least, to comprehend. I don't
> > feel too strongly either way, and I'll explain more below, but first:
> >
>
> Hmm, I don't remember "caving" on this point; Paul asked me to reconsider
> and I said that I would. I think it's also helpful to have more
> discussion on the matter by including other people's opinions.

>
> > I do agree with Paul that there is a danger of missing one of these in
> > converting existing code. In fact, I missed one in my ref counting
> > rework patch that I will ultimately rebase atop David's final version
> > [I'm already dependent on Mel Gorman's patch series]. To catch this, I
> > decided to rename the "policy" member to "mode". This also aligns the
> > struct better with the numa_memory_policy doc [I think of the "policy"
> > as the tuple: mode + optional nodemask]. For future code, I think we
> > could add comments to the struct definition warning that one should use
> > the wrappers to access the mode or mode flags.
> >
>
> I considered this when I implemented it the way I did, and that was part
> of the reason I was in favor of enum mempolicy_mode so much: functions
> that had a formal of type 'enum mempolicy_mode' were only the mode and
> formals of type 'int' or 'unsigned short' were the combination. I even
> stated that difference in Documentation/vm/numa_memory_policy.txt in the
> first patchset.
>
> > However, let's step back and look at the usage of the mempolicy struct.
> > In earlier mail, David mentioned that it is passed around outside of
> > mempolicy.c. While this is true, the only place that I see where code
> > outside of mempolicy.c deals with the policy/mode and nodemask
> > separately--as opposed to using an opaque mempolicy struct--is in the
> > shmem mount option parsing. Everywhere else, as far as I can see, just
> > uses the struct mempolicy. Even slab/slub call into slab_node() in
> > mempolicy.c to extract the target node for the policy.
> >
>
> Yes, struct shmem_sb_info stores the 'policy' member as well so it would
> need to include a new 'flags' member as well. And the interface between
> the two, mpol_shared_policy_init() would need another formal for the
> flags.
>
> > So, if David does accept Paul's plea to separate the mode and the mode
> > flags in the structure, I would suggest that we do this in mpol_new().
> > That is, in {sys|do}_{set_mempolicy|mbind}(), maintain the API format
> > with the mode flags ORed into the mode. mpol_new() can pull them apart
> > into different struct mempolicy members. The rest of mempolicy.c can
> > use them directly from the struct without the helpers. get_mempolicy()
> > will need to pack the mode and flags back together--perhaps with an
> > inverse helper function.
> >
>
> Yeah, it gets tricky. I'm not too sure about only pulling the mode and
> flags apart at mpol_new() time because perhaps, in the future, there will
> be flag and mode combinations that are only applicable for set_mempolicy()
> and not for mbind(), for example. I can imagine that someday we may add a
> flag that applies only to a task mempolicy and not to a VMA mempolicy.

True. Altho' at such a time, I'd probably argue for testing for and
rejecting invalid mode/flag combinations in the respective
functions :-).


>
> > As for the shmem mount option parsing: For now, I'd suggest that we
> > keep the mode flags OR'd into the "policy" member of the shmem_sb_info
> > struct -- i.e., the "API format"--to preserve the mpol_new() interface.
>
> I don't like this because its not consistent. It increases the confusion
> of what contains a mode and what contains the combination. I think the
> safest thing to do, if we separate the mode and flags out in struct
> mempolicy is to do it everywhere. All helper functions will need
> additional formals similar to what I did in the first patchset (that was
> actually unpopular) and struct shmem_sb_info need to be modified to
> include the additional member.
>
> In other words, I'm all in favor of storing the mode and flags however we
> want, but I think it should be done consistently throughout the tree.
> While mpol_mode() wrappers may not be favorable all over the place (and I
> didn't miss any), it may be easier than the alternative.

OK. I'm "caving"... :-) Different views of consistency. But,
eventually, I hope we can replace the separate mode[, flags] and
nodemask in the 'sb_info with a mempolicy and keep the details of modes,
flags, etc. internal to mempolicy.c.

Looking forward to the respin of the patches...

Lee

2008-02-13 19:13:26

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 13 Feb 2008, Lee Schermerhorn wrote:

> > > If we just want to preserve existing behavior, we can define an
> > > additional mode flag that we set in the sbinfo policy in
> > > shmem_parse_mpol() and test in mpol_new(). If we want to be able to
> > > specify existing or new behavior, we can use the same flag, but set it
> > > or not based on an additional qualifier specified via the mount option.
> > >
> >
> > Shmem areas between cpusets with disjoint mems_allowed seems like an error
> > in userspace to me and I would prefer that mpol_new() reject it outright.
>
> I don't think so, and I'd like to explain why. Perhaps this is too big
> a topic to cover in the context of this patch response. I think I'll
> start another thread. Suffice it to say here that cpusets and
> mempolicies don't constrain, in anyway, the ability of tasks in a cpuset
> to attach to shmem segments created by tasks in other, perhaps disjoint,
> cpusets. The cpusets and mempolicies DO constrain page allocations,
> however. This can result in OOM faults for tasks in cpusets whose
> mems_allowed contains no overlap with a shared policy installed by the
> creating task. This can be avoided if the task that creates the shmem
> segment faults in all of the pages and SHM_LOCKs them down. Whether
> this is sufficient for all applications is subject of the other
> discussion. In any case, I think we need to point this out in mempolicy
> and cpuset man pages.
>

I don't think we're likely to see examples of actual code written to do
this being posted to refute my comment.

I think what you said above is right and that it should be allowed.
You're advocating for allowing task A to attach to shmem segments created
by task B while task A has its own mempolicy that restricts its own
allocations but still being able to access the shmem segments of task B,
providing that task A will not fault them back in itself.

You're right, that's not a scenario that I was hoping to address to
MPOL_F_STATIC_NODES.

> > Since we're allowed to remap the node to a different node than the user
> > specified with either syscall, the current behavior is that "one node is
> > as good as another." In other words, we're trying to accomodate the mode
> > first by setting pol->v.preferred_node to some accessible node and only
> > setting that to the user-supplied node if it is available.
> >
> > If the node isn't available and the user specifically asked that it is not
> > remapped (with MPOL_F_STATIC_NODES), then I felt local allocation was best
> > compared to remapping to a node that may be unrelated to the VMA or task.
> > This preserves a sense of the current behavior that "one node is as good
> > as another," but the user specifically asked for no remap.
>
> Yeah. I went back and read the update to the mempolicy doc where you
> make it clear that this is the semantic of 'STATIC_NODES. You also
> mentioned it in the patch description, but I didn't "get it". Sorry.
>
> Still a comment in the code would, I think, help future spelunkers.
>

Ok, I'll clarify the intention in this code that we agreed was better:

case MPOL_PREFERRED:
if (!remap) {
int nid = first_node(pol->user_nodemask);

if (node_isset(nid, *newmask))
pol->v.preferred_node = nid;
else
pol->v.preferred_node = -1;
} else
pol->v.preferred_node = node_remap(pol->v.preferred_node,
*mpolmask, *newmask);
break;

2008-02-13 19:18:24

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 13 Feb 2008, Lee Schermerhorn wrote:

> > Yeah, it gets tricky. I'm not too sure about only pulling the mode and
> > flags apart at mpol_new() time because perhaps, in the future, there will
> > be flag and mode combinations that are only applicable for set_mempolicy()
> > and not for mbind(), for example. I can imagine that someday we may add a
> > flag that applies only to a task mempolicy and not to a VMA mempolicy.
>
> True. Altho' at such a time, I'd probably argue for testing for and
> rejecting invalid mode/flag combinations in the respective
> functions :-).
>

Yeah, and that would require the modes and flags to be split apart in
sys_set_mempolicy() and sys_mbind() or at least in do_set_mempolicy() or
do_mbind(). So if we see the possibility that we want to test for mode
and flag combinations that perhaps differ between the set_mempolicy() and
mbind() case, we have to do it in either of those two places. I think we
should do it there, as early as possible, like I did in my first patchset.

> OK. I'm "caving"... :-) Different views of consistency. But,
> eventually, I hope we can replace the separate mode[, flags] and
> nodemask in the 'sb_info with a mempolicy and keep the details of modes,
> flags, etc. internal to mempolicy.c.
>

I agree, I think keeping all of these implementation details inside
mm/mempolicy.c is the best practice. We'll still need to expose some of
the details, such as the parsing of '=static' in the tmpfs mount option to
add the MPOL_F_STATIC_NODES flag to the policy, but situations like that
should be rare.

For extendability, I agree that the struct shmem_sb_info member should be
a pointer to a mempolicy and not an int.

David

2008-02-13 20:30:18

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

David wrote:
> You're specifically trying to avoid having the application know about its
> cpuset placement with regard to mems at the time it sets up its mempolicy,
> right? Otherwise it could already setup this relative nodemask by
> selecting node 2, from your example above, in its mems_allowed and it
> would always be remapped appropriately.

Yes, if an application considers nodes to be interchangeable, I'm
trying to avoid having that application -have- to know its current
cpuset placement, for two reasons:

For one thing, it's racey. Its cpuset placement could change,
unbeknownst to it, between the time it queried it, and the time
that it issued the mbind or set_mempolicy call.

For the other thing, it's not always possible. If the application
is currently in a cpuset that is smaller than its preferred
configuration, it would not be possible to express its preferred
memory policies using just the smaller number of memory nodes
allowed by its current cpuset placement. How do you say "put
this on my third node" if you don't have a third node and you
can only speak of the nodes you currently have?


> So, for example, if the task is bound by mems 1-3, and it asks for
> MPOL_INTERLEAVE over 2-4, then initially the mempolicy is only effected
> over node 3 and if it's later expanded to mems 1-8, then the mempolicy is
> effected over nodes 3-5, right?
>
> And if the mems change to 3-8, the mempolicy is remapped to 5-7 even
> though 3-5 (which it already was interleaving over) is still accessible?

Yes and yes, for this cpuset relative numbering mode.

> Does MPOL_INTERLEAVE | MPOL_F_STATIC_NODES | MPOL_F_PAULS_NEW_FLAG make
> any logical sense? If it does, I think we're going to be writing some
> very complex remap code in our future.


No -- MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES (which is what I'll
likely call my new flag) are mutually exclusive.

The allowed modes and flags would be:
Choose exactly one of:
MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE
Plus zero or one of:
MPOL_F_STATIC_NODES, MPOL_F_RELATIVE_NODES
where, if you choose neither of the MPOL_F_*_NODES flags,
then you get the classic, compatible nodemask handling.

> Well, I didn't cave on anything

;) Your simple "ok" was ambiguous enough that we were able to
read into it whatever we wanted to.

But I've made my case on that issue (involving the separate or
packed policy flag field). So I probably won't say more, and
I expect to live with whatever you choose, after any further
input from Lee or others.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-13 21:58:52

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Wed, 13 Feb 2008, Paul Jackson wrote:

> Yes, if an application considers nodes to be interchangeable, I'm
> trying to avoid having that application -have- to know its current
> cpuset placement, for two reasons:
>
> For one thing, it's racey. It's cpuset placement could change,
> unbeknownst to it, between the time it queried it, and the time
> that it issued the mbind or set_mempolicy call.
>
> For the other thing, it's not always possible. If the application
> is currently in a cpuset that is smaller than it's preferred
> configuration, it would not be possible to express its preferred
> memory policies using just the smaller number of memory nodes
> allowed by its current cpuset placement. How do you say "put
> this on my third node" if you don't have a third node and you
> can only speak of the nodes you currently have?
>

So let's say, like my first example from the previous email, that you have
MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES over nodes 3-4 and your cpuset's
mems is only nodes 5-7. This would interleave over no nodes. Correct?

It seems like MPOL_F_RELATIVE_NODES is primarily designed to maintain a
certain order among the nodes it effects the mempolicy over. It comes
with the premise that the task doesn't already know it's cpuset mems
(otherwise, the current implementation without MPOL_F_STATIC_NODES would
work fine for this) so it doesn't really care what nodes it allocates
pages on, it just cares about the order.

This works for MPOL_PREFERRED and MPOL_BIND as well, right?

I don't understand the use case for this (at all), but if you have
workloads that require this type of setting then I can implement this as
part of my series. I just want to confirm that there are real world cases
backing this so that we don't have flags with highly highly specialized
cornercases.

[ If a user _does_ specify MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES
as part of their syscall, then we'll simply return -EINVAL. ]

> > Well, I didn't cave on anything
>
> ;) Your simple "ok" was ambiguous enough that we were able to
> read into it whatever we wanted to.
>
> But I've made my case on that issue (involving the separate or
> packed policy flag field). So I probably won't say more, and
> I expect to live with whatever you choose, after any further
> input from Lee or others.
>

Well, there's advantages and disadvantages to either approach.

My preference (both mode and flags stored in the same member of struct
mempolicy):

Advantages:

- completely consistent with the userspace API of passing modes
and flags together in a pointer to an int, and

- does not require additional formals to be added to several
functions, including functions outside mm/mempolicy.c.

Disadvantage:

- use of mpol_mode() throughout mm/mempolicy.c code to mask
off optional mode flags for conditionals or switch statements.

Your preference (separate mode and flags members in struct mempolicy):

Advantages:

- clearer implementation when dealing with modes: all existing
statements involving pol->policy can remain unchanged.

Disadvantages:

- requires additional formals to be added to several functions,
including functions outside mm/mempolicy.c, and

- takes additional space in struct mempolicy (two bytes) which
could eventually be used for something else.

In both cases the testing of mode flags is the same as before:

if (pol->policy & MPOL_F_STATIC_NODES) {
...
}

2008-02-14 10:09:30

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

A few more review comments on details of this patch set ...

=========

In mempolicy.h, the lines:

/*
* The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
* constants defined in enum mempolicy_mode. The upper bits represent
* optional set_mempolicy() MPOL_F_* mode flags.
*/
#define MPOL_FLAG_SHIFT (8)
#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)

/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES) /* legal set_mempolicy() MPOL_* mode flags */

could be simplified, to:

/*
* Optional flags that modify nodemask numbering.
*/
#define MPOL_F_RELATIVE_NODES (1<<14) /* remapped relative to cpuset */
#define MPOL_F_STATIC_NODES (1<<15) /* unremapped physical masks */
#define MPOL_MODE_FLAGS (MPOL_F_RELATIVE_NODES|MPOL_F_STATIC_NODES)
/* combined MPOL_F_* mask flags */

(really, that MPOL_FLAG_SHIFT is just unnecessary distracting detail.)

=========

If we ever call mpol_rebind_policy() with an MPOL_PREFERRED|MPOL_F_STATIC_NODES
policy when the preferred_node doesn't happen to be in the current cpuset,
then would the following lines lose the preferred node setting, such that
it didn't get applied again correctly if that node came back into our allowed
cpuset ?

case MPOL_PREFERRED:
if (!remap && !node_isset(pol->v.preferred_node, *newmask))
pol->v.preferred_node = -1;

=========

Instead of the double negative and the use of a new
word 'remap' meaning the opposite of STATIC, as in:

int remap;
...
remap = !(mpol_flags(pol->policy) & MPOL_F_STATIC_NODES);
...
if (!remap)

How about something more direct, as in:

int static_nodes;
...
static_nodes = mpol_flags(pol->policy) & MPOL_F_STATIC_NODES;
...
if (static_nodes)

Each additional logic inversion and additional name that isn't
trivially based on an existing name just makes this code more
difficult to read.

Of course, if we used bit fields for these additional flags,
instead of #defines and masks, the above would get even simpler,
eliminating the extra define's, and replacing the above three lines
with just the one line:

if (pol->f_static_nodes)

But I already failed with that suggestion ... grumble, grumble ...

=========

Should the mpol_equal() algorithm change, in the case of either
MPOL_F_RELATIVE_NODES or MPOL_F_STATIC_NODES, to checking
user_nodemask equality, -instead- of the "switch(mode)"
mode specific tests?

=========

Could we have mpol_to_str() mark policies which are
MPOL_F_RELATIVE_NODES or MPOL_F_STATIC_NODES? Perhaps
by adding a suffix of "|relative" or "|static" or some
such.


--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-14 10:26:56

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

Paul, responding (incorrectly) to David:
> > So, for example, if the task is bound by mems 1-3, and it asks for
> > MPOL_INTERLEAVE over 2-4, then initially the mempolicy is only effected
> > over node 3 and if it's later expanded to mems 1-8, then the mempolicy is
> > effected over nodes 3-5, right?
> >
> > And if the mems change to 3-8, the mempolicy is remapped to 5-7 even
> > though 3-5 (which it already was interleaving over) is still accessible?
>
> Yes and yes, for this cpuset relative numbering mode.

Paul, correcting himself ...

No and yes. The manner in which too many nodes (as requested in a
RELATIVE mask) are folded into too small a cpuset is not actually
that critical, so long as it doesn't come up empty. However, what
I'll be recommending, in a follow-up patch, will be folding the
larger set into the smaller one modulo the size of the smaller one.

In your example above, the requested policy asked for nodes 2-4,
so that means it is trying to lay out a memory policy with at least
five (0-4) nodes in consideration. But the cpuset is only allowing
three (1-3) nodes at the moment. So for each requested node, we take
its number modulo three to see which of the available nodes to include
in the interleave.

Given
N: how many nodes are allowed by the tasks cpuset (3, here)

n m := (n % N) r := m-th set node in allowed
requested mod 3 result (in set 1, 2, 3 allowed)
2 2 3
3 0 1
4 1 2

Hence your first example would result in an interleave over physical
nodes 1, 2, and 3 (the last column above.) ... not just over 3.

I intend to post patches you can use to lib/bitmap.c and the related
cpumask and nodemask apparatus that compute the above for you.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-14 11:12:45

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

David wrote:
> So let's say, like my first example from the previous email, that you have
> MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES over nodes 3-4 and your cpuset's
> mems is only nodes 5-7. This would interleave over no nodes. Correct?

Given what I said yesterday, that would be a correct conclusion.

However, as I just posted, what I said yesterday was wrong.

Using the same table format as I used in what I just posted, we see that
the two nodes 5 and 6 would be included in the interleave. Each requested
node gets folded modulo the size (weight) of the current mems_allowed,
into some actual node to be included in the interleave.

Given
N: how many nodes are allowed by the tasks cpuset (3, here [5,6,7])

n m := (n % N) r := m-th set node in allowed
requested mod 3 result (in set 5,6,7 allowed)
3 0 5
4 1 6

> it just cares about the order.

MPOL_F_RELATIVE_NODES is for use by jobs consisting of multiple compute
threads which care about how many CPUs they run on, how many memory nodes
they have, and which nodes are closest to which CPUs. In the typical case
all CPUs are equivalent, and all memory nodes are equivalent, except for
the basic topology issues of which nodes are near which CPUs, and in the
bigger NUMA systems (the less uniform NUMA systems, I should say), whether
all these CPUs and nodes are "close to" each other in the larger system.

> I don't understand the use case for this

The use case is the original motivating use case for cpusets, and for
my customer audience still one of the largest use cases of interest.

They are running N-way tightly coupled parallel threads, with sometimes
extreme sensitivity to data placement. The job may run for hours, even
days, with exactly N threads, on exactly N CPUs, carefully placing each
data page on the memory node nearest the CPU that will access it most
frequently. They may be using OpenMP, synchronizing many times per
second across many hundreds of threads. If -any- of the threads takes
a few per-cent longer due to poor page placement, the entire job slows
as much. If various threads, at uncorrelated times, each slow a few
per-cent, then the entire job can take twice as long. When you're
computing tomorrows weather forecast, getting the result the day after
tomorrow is rather useless.

Now, if the job is critical enough, the customer simply won't interfere
with it or move it to other CPUs or any of that nonsense. But when
running a mix of such jobs, under a batch scheduler such as PBS or LSF,
where the jobs have varying priority and varying resource needs, then
the batch scheduler will, intentionally, move less critical jobs about,
sometimes suspending them entirely, in order to provide the more
critical jobs with exactly what they need, while keeping the
utilization of the rest of the system as high as possible.

> It comes
> with the premise that the task doesn't already know it's cpuset mems
> (otherwise, the current implementation without MPOL_F_STATIC_NODES would
> work fine for this)

No.

The current "classical" implementation doesn't work adequately.

I think I've already explained this before, so will just copy my
previous explanation here. Perhaps you missed it the first time,
or perhaps something in it isn't clear enough.

=========
This second, cpuset relative, mode is required:

1) to provide a race-free means for an application to place its memory
when the application considers all physical nodes essentially
equivalent, and just wants to lay things out relative to whatever
cpuset it happens to be running in, and

2) to provide a practical means, without the need for constantly
reprobing ones current cpuset placement, for an application to
specify cpuset-relative placement to be applied whenever the
application is placed in a larger cpuset, even if the application
is presently in a smaller cpuset.

Without it, the application has to first query its current physical
cpuset placement, then calculate its desired relative placement into
terms of the current physical nodes it finds itself on, and then issue
various mbind or set_mempolicy calls using those current physical node
numbers, praying that it wasn't migrated to other physical nodes at the
same time, which would have invalidated its current placement
assumptions in its relative to physical node number calculations.

And without it, if an application is currently placed in a smaller cpuset
than it would prefer, it cannot specify how it would want its mempolicies
should it ever subsequently be moved to a larger cpuset. This leaves such
an application with little option other than to constantly reprobe its
cpuset placement, in order to know if and when it needs to reissue its
mbind and set_mempolicy calls because it gained access to a larger cpuset.
=========

We would have similar problems with remapping sched_setaffinity policies
as we do with mempolicies, except that sched_setaffinity can be applied
by one task on another. So the batch schedulers can and do fix up the
scheduler policies of migrated tasks behind their back. They cannot
do so for memory policies, because mbind and set_mempolicy can only be
issued on a task by itself.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-14 12:27:31

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

I had taken a vow of silence on this one, but couldn't resist one more
round.

David wrote:
> My preference (both mode and flags stored in the same member of struct
> mempolicy):
>
> Advantages:
>
> - completely consistent with the userspace API of passing modes
> and flags together in a pointer to an int, and

True -- the necessary overloaded ugliness of the kernel-user system
call API is propagated throughout mempolicy.c. However, I wouldn't
necessarily call that an advantage.

> - does not require additional formals to be added to several
> functions, including functions outside mm/mempolicy.c.

... but does require the use of the new mpol_flags() and mpol_mode()
macros by code in mm/shmem.c, outside of mm/mempolicy.c

> Disadvantage:
>
> - use of mpol_mode() throughout mm/mempolicy.c code to mask
> off optional mode flags for conditionals or switch statements.

This will cause a bug in the future, that escapes into the wild, when
someone forgets one of these. I'll bet on that. It's fragile, because
(1) such errors are easy to make, and (2) hard to catch.

> Your preference (separate mode and flags members in struct mempolicy):
>
> Advantages:
>
> - clearer implementation when dealing with modes: all existing
> statements involving pol->policy can remain unchanged.

Clear, robust code - that's the biggie.

> Disadvantages:
>
> - requires additional formals to be added to several functions,
> including functions outside mm/mempolicy.c, and

True -- though by this argument, we'd routinely aggregate multiple flags
and small words into single integer parameters, just to minimize the
parameter count. Putting two flags in one parameter is a false
simplification, unless required by circumstance, such as communicating
with deep space probes or across the system call boundary with existing
API's.

Across the system call boundary, we have little choice, for compatibility
reasons. But kernel internal interfaces are not so constrained, and the
ugliness at the system call boundary can be quarantined from most of the
mempolicy.c code.

> - takes additional space in struct mempolicy (two bytes) which
> could eventually be used for something else.

To be clear to others, as you know, we're not talking here about
growing the sizeof(struct mempolicy) at present, but rather about using
some currently unused bytes in the struct.

More often than not, when someone adds complexity that is not needed at
present, because it might be needed in the future, they are making it
harder to maintain the code, not easier.

The single most important thing we can do to improve future
maintainability of code is to make it more readable.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-14 19:41:17

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Thu, 14 Feb 2008, Paul Jackson wrote:

> If we ever call mpol_rebind_policy() with an MPOL_PREFERRED|MPOL_F_STATIC_NODES
> policy when the preferred_node doesn't happen to be in the current cpuset,
> then would the following lines lose the preferred node setting, such that
> it didn't get applied again correctly if that node came back into our allowed
> cpuset ?
>
> case MPOL_PREFERRED:
> if (!remap && !node_isset(pol->v.preferred_node, *newmask))
> pol->v.preferred_node = -1;
>

That's already been corrected as a result of a discussion between Lee and
myself (please see the incremental patch that I sent you privately when I
sent the patchset along).

> Should the mpol_equal() algorithm change, in the case of either
> MPOL_F_RELATIVE_NODES or MPOL_F_STATIC_NODES, to checking
> user_nodemask equality, -instead- of the "switch(mode)"
> mode specific tests?
>

That's a good question.

We'll need to decide whether mpol_equal() is determining the equality of
the currently effected mempolicy (whereas policy->user_nodemask is
irrelevant) or the whole intended mempolicy overall.

I didn't originally modify mpol_equal() because I preferred the former.
Is there a compelling case for the latter where mpol_equal() is used in
the tree that would require this change?

> Could we have mpol_to_str() mark policies which are
> MPOL_F_RELATIVE_NODES or MPOL_F_STATIC_NODES? Perhaps
> by adding a suffix of "|relative" or "|static" or some
> such.
>

I'd like to keep it in the same format as the tmpfs mount option which is
'=relative' and '=static'.

David

2008-02-14 19:46:04

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Thu, 14 Feb 2008, Paul Jackson wrote:

> No and yes. The manner in which too many nodes (as requested in a
> RELATIVE mask) are folded into too small a cpuset is not actually
> that critical, so long as it doesn't come up empty. However, what
> I'll be recommending, in a follow-up patch, will be folding the
> larger set into the smaller one modulo the size of the smaller one.
>

So basically the "relative" nodemask that is passed with
MPOL_F_RELATIVE_NODES is wrapped around the allowed nodes?

relative nodemask mems_allowed result
1,3,5 4 4
1,3,5 4-6 4-6
1,3,5 4-8 4-5,7
1,3,5 4-10 4,6,8

Is that correct?

David

2008-02-14 21:39:28

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Thu, 14 Feb 2008, Paul Jackson wrote:

> In mempolicy.h, the lines:
>
> /*
> * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
> * constants defined in enum mempolicy_mode. The upper bits represent
> * optional set_mempolicy() MPOL_F_* mode flags.
> */
> #define MPOL_FLAG_SHIFT (8)
> #define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
>
> /* Flags for set_mempolicy */
> #define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)
> #define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES) /* legal set_mempolicy() MPOL_* mode flags */
>
> could be simplified, to:
>
> /*
> * Optional flags that modify nodemask numbering.
> */
> #define MPOL_F_RELATIVE_NODES (1<<14) /* remapped relative to cpuset */
> #define MPOL_F_STATIC_NODES (1<<15) /* unremapped physical masks */
> #define MPOL_MODE_FLAGS (MPOL_F_RELATIVE_NODES|MPOL_F_STATIC_NODES)
> /* combined MPOL_F_* mask flags */
>
> (really, that MPOL_FLAG_SHIFT is just unnecessary distracting detail.)
>

It would be easy to define mpol_mode() and mpol_flags() in terms of
MPOL_MODE_FLAGS as well, yes. But without MPOL_FLAG_SHIFT it becomes
impossible to determine whether a user passed an invalid flag.

I think we're all in agreement that passing an invalid flag bit should be
rejected with -EINVAL. So to do that we need MPOL_MODE_MASK to explicitly
define the parts of the int *policy passed from set_mempolicy() that
represent the mode.

David

2008-02-15 01:44:48

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Thu, 14 Feb 2008, David Rientjes wrote:

> We'll need to decide whether mpol_equal() is determining the equality of
> the currently effected mempolicy (whereas policy->user_nodemask is
> irrelevant) or the whole intended mempolicy overall.
>

I've done the latter in the latest patchset.

Since mpol_equal() is only used for determining whether nearby vmas can be
merged, it is only logical to merge them if they share the mempolicy
intent of the user if MPOL_F_STATIC_NODES or any flag that makes sense of
policy->user_nodemask is used.

> > Could we have mpol_to_str() mark policies which are
> > MPOL_F_RELATIVE_NODES or MPOL_F_STATIC_NODES? Perhaps
> > by adding a suffix of "|relative" or "|static" or some
> > such.
> >
>
> I'd like to keep it in the same format as the tmpfs mount option which is
> '=relative' and '=static'.
>

In preparation for mode flags in addition to MPOL_F_STATIC_NODES to be
added (like MPOL_F_RELATIVE_NODES or whatever Paul decides to call it
based on the latest feedback), I've prepared mpol_to_str() to have this
format

interleave=static|relative=1-3

as viewable from /proc/pid/numa_maps. This is also how tmpfs mount
options will be specified.

[ Of course the above example can't happen since MPOL_F_STATIC_NODES and
MPOL_F_RELATIVE_NODES are disjoint, but it's sufficient for the
example. ]

2008-02-15 09:29:33

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

David wrote:
> But without MPOL_FLAG_SHIFT it becomes
> impossible to determine whether a user passed an invalid flag.

I don't think so. It's not possible to determine if exactly the low
eight bits of the 'policy' short are a valid mode, -however- that
"eight" is a spurious detail. Remove it.

Instead of specifying that the 'policy' short consists of two bytes,
one for mode and one for flags, rather specify that the policy fields
consists of some high bits for flags, and the remaining low bits
(however many remain) for the mode.

That in turn exposes the opportunity to remove a couple of checks
for "(mpol_flags(mode) & ~MPOL_MODE_FLAGS)" being zero, and that
in turn makes it obvious that the 'flags' variable in
sys_set_mempolicy() is unused.

Consider the following patch, on top of the latest you sent me a day
ago.

---
include/linux/mempolicy.h | 26 +++++++++++++++-----------
mm/mempolicy.c | 6 ------
2 files changed, 15 insertions(+), 17 deletions(-)

--- 2.6.24-mm1.orig/include/linux/mempolicy.h 2008-02-15 00:11:10.000000000 -0800
+++ 2.6.24-mm1/include/linux/mempolicy.h 2008-02-15 01:13:51.597894429 -0800
@@ -8,6 +8,14 @@
* Copyright 2003,2004 Andi Kleen SuSE Labs
*/

+/*
+ * The 'policy' field of 'struct mempolicy' has both a mode and
+ * some flags packed into it. The flags (MPOL_F_* below) occupy
+ * the high bit positions (MPOL_MODE_FLAGS), and the mempolicy
+ * modes (the "Policies" below) are encoded in the remaining low
+ * bit positions.
+ */
+
/* Policies */
enum {
MPOL_DEFAULT,
@@ -18,16 +26,12 @@ enum {
};

/*
- * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
- * constants defined in the above enum. The upper bits represent optional
- * set_mempolicy() or mbind() MPOL_F_* mode flags.
+ * Optional flags that modify nodemask numbering.
*/
-#define MPOL_FLAG_SHIFT (8)
-#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
-
-/* Flags for set_mempolicy */
-#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)
-#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES) /* legal set_mempolicy() MPOL_F_* flags */
+#define MPOL_F_RELATIVE_NODES (1<<14) /* remapped relative to cpuset */
+#define MPOL_F_STATIC_NODES (1<<15) /* unremapped physical masks */
+#define MPOL_MODE_FLAGS (MPOL_F_RELATIVE_NODES|MPOL_F_STATIC_NODES)
+ /* combined MPOL_F_* mask flags */

/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
@@ -130,12 +134,12 @@ static inline int mpol_equal(struct memp

static inline unsigned char mpol_mode(unsigned short mode)
{
- return mode & MPOL_MODE_MASK;
+ return mode & ~MPOL_MODE_FLAGS;
}

static inline unsigned short mpol_flags(unsigned short mode)
{
- return mode & ~MPOL_MODE_MASK;
+ return mode & MPOL_MODE_FLAGS;
}

/*
--- 2.6.24-mm1.orig/mm/mempolicy.c 2008-02-15 00:18:35.000000000 -0800
+++ 2.6.24-mm1/mm/mempolicy.c 2008-02-15 01:23:53.775748002 -0800
@@ -884,8 +884,6 @@ asmlinkage long sys_mbind(unsigned long

if (mpol_mode(mode) >= MPOL_MAX)
return -EINVAL;
- if (mpol_flags(mode) & ~MPOL_MODE_FLAGS)
- return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
@@ -898,13 +896,9 @@ asmlinkage long sys_set_mempolicy(int mo
{
int err;
nodemask_t nodes;
- unsigned short flags;

if (mpol_mode(mode) >= MPOL_MAX)
return -EINVAL;
- if (mpol_flags(mode) & ~MPOL_MODE_FLAGS)
- return -EINVAL;
- flags = mpol_flags(mode) & MPOL_MODE_FLAGS;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;


--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-15 10:00:40

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

> like MPOL_F_RELATIVE_NODES or whatever Paul decides to call it

MPOL_F_RELATIVE_NODES it is.

I see no compelling need for this name to track whatever
we name the relative bitmap operator. They are related,
but not the same thing.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-15 10:20:20

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

> So basically the "relative" nodemask that is passed with
> MPOL_F_RELATIVE_NODES is wrapped around the allowed nodes?
>
> relative nodemask mems_allowed result
> 1,3,5 4 4
> 1,3,5 4-6 4-6
> 1,3,5 4-8 4-5,7
> 1,3,5 4-10 4,6,8
>
> Is that correct?

By my calculation, all but the last line is correct.

We use zero-based numbering, so relative node '1' is the
'second' node, and the 'second' node in allowed nodes 4-10
is node 5, not 4. Similarly for relative nodes '3' and '5'.

So that last line should be:

> 1,3,5 4-10 5,7,9


--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-15 20:15:00

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Fri, 15 Feb 2008, Paul Jackson wrote:

> So that last line should be:
>
> > 1,3,5 4-10 5,7,9
>

What about this case where one of the relative nodes wraps around to
represent an already set node in the result?

relative mems_allowed result
1,3,6 4-8 5,7 or 5-7 ?

Neither result is immediately obvious to me logically: either your result
has less weight than your relative nodemask (seems like a bad thing) or
your relative nodemask really isn't all that relative to begin with (it's
the same result as 1-3, 6-8, 11-13, etc).

Or is this just a less-than-desired side-effect of relative nodemasks that
we're willing to live with given its other advantages?

David

2008-02-15 20:24:22

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Fri, 15 Feb 2008, Paul Jackson wrote:

> I don't think so. It's not possible to determine if exactly the low
> eight bits of the 'policy' short are a valid mode, -however- that
> "eight" is a spurious detail. Remove it.
>

Sure it is, you can determine if the low MPOL_FLAG_SHIFT bits are a valid
mode by masking off the upper bits and testing if the result is >=
MPOL_MAX.

> Instead of specifying that the 'policy' short consists of two bytes,
> one for mode and one for flags, rather specify that the policy fields
> consists of some high bits for flags, and the remaining low bits
> (however many remain) for the mode.
>

Well, it's still an implementation detail that needs to be explicitly
defined, otherwise we have no way to separate mode from flags when the
user passes in their int formal as part of either syscall.

And given the recent requirements for a perfectly formed set_mempolicy()
or mbind() syscall (as a result of the discussion about allowing non-empty
nodemasks with MPOL_DEFAULT), we need to ensure that invalid flags are not
being passed.

We should make sure to return -EINVAL to a user specifying an invalid flag
if, for example, they are using an older kernel that doesn't support what
they're asking for.

It would be possible to do all of this in both sys_set_mempolicy() and
sys_mbind() by masking off the set of possible modes and checking the
result for being >= MPOL_MAX:

if ((mode & MPOL_MODE_FLAGS) >= MPOL_MAX)
return -EINVAL;

2008-02-15 20:33:22

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Fri, 15 Feb 2008, David Rientjes wrote:

> It would be possible to do all of this in both sys_set_mempolicy() and
> sys_mbind() by masking off the set of possible modes and checking the
> result for being >= MPOL_MAX:
>
> if ((mode & MPOL_MODE_FLAGS) >= MPOL_MAX)
> return -EINVAL;

Actually, in sys_set_mempolicy() or sys_mbind() where mode is an int:

unsigned short flags;

flags = mode & MPOL_MODE_FLAGS;
mode &= ~MPOL_MODE_FLAGS;
if (mode < 0 || mode >= MPOL_MAX)
return -EINVAL;

2008-02-15 23:46:20

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

David responding to pj:
> > I don't think so. It's not possible to determine if exactly the low
> > eight bits of the 'policy' short are a valid mode, -however- that
> > "eight" is a spurious detail. Remove it.
>
> Sure it is, you can determine if the low MPOL_FLAG_SHIFT bits are a valid
> mode by masking off the upper bits and testing if the result is >=
> MPOL_MAX.

Sorry ... my response was ambiguous, so not surprisingly you read it
the other way than I intended it and missed my point.

Let me try this again.

Yes, if we drop that MPOL_FLAG_SHIFT, we can't tell if exactly the low
eight bits are valid, -however- we can still tell if the low bits not
used by our MPOL_F_* flag bits are valid, and that's sufficient.

I included a PATCH in my last reply, in order to demonstrate this.


> Well, it's still an implementation detail that needs to be explicitly
> defined, otherwise we have no way to separate mode from flags when the
> user passes in their int formal as part of either syscall.

No.

That is not an implementation detail that must be explicitly defined
in order to separate the mode from the flags.

One can separate out the high order flag bits by simply masking them off.


> We should make sure to return -EINVAL to a user specifying an invalid flag
> if, for example, they are using an older kernel that doesn't support what
> they're asking for.

I agree that we must return -EINVAL if the user specifies an invalid
flag. I have always agreed with this.

======

> It would be possible to do all of this in both sys_set_mempolicy() and
> sys_mbind() by masking off the set of possible modes and checking the
> result for being >= MPOL_MAX:

Bingo!!

That's what I've been saying all along.

I'm not quite sure how we got here; this seems to me such a sharp turn
from what went before that I fear I've misread you, but I suppose I
should just be glad we're in agreement, despite the strange journey.

You go on to suggest a couple of variations of this check, by first
masking off the high order flag bits:

if ((mode & MPOL_MODE_FLAGS) >= MPOL_MAX)
return -EINVAL;

and then, in a follow-on message, refining that line of thought with:

unsigned short flags;

flags = mode & MPOL_MODE_FLAGS;
mode &= ~MPOL_MODE_FLAGS;
if (mode < 0 || mode >= MPOL_MAX)
return -EINVAL;

Actually, I don't think you need to add either variation, as I think you
-already- have that check, in both sys_mbind() and sys_set_mempolicy(),
as the check:

if (mpol_mode(mode) >= MPOL_MAX)
return -EINVAL;

With one minor tweak (changing the return type of mpol_mode() from
uchar to ushort), here again is the PATCH of my last reply, with this
check present (actually, it's a check of yours, unchanged from what has
been in your patch for days now).

Does this PATCH do what you have in mind?

---
include/linux/mempolicy.h | 30 +++++++++++++++++-------------
mm/mempolicy.c | 6 ------
2 files changed, 17 insertions(+), 19 deletions(-)

--- 2.6.24-mm1.orig/include/linux/mempolicy.h 2008-02-15 00:11:10.000000000 -0800
+++ 2.6.24-mm1/include/linux/mempolicy.h 2008-02-15 15:16:16.031031424 -0800
@@ -8,6 +8,14 @@
* Copyright 2003,2004 Andi Kleen SuSE Labs
*/

+/*
+ * The 'policy' field of 'struct mempolicy' has both a mode and
+ * some flags packed into it. The flags (MPOL_F_* below) occupy
+ * the high bit positions (MPOL_MODE_FLAGS), and the mempolicy
+ * modes (the "Policies" below) are encoded in the remaining low
+ * bit positions.
+ */
+
/* Policies */
enum {
MPOL_DEFAULT,
@@ -18,16 +26,12 @@ enum {
};

/*
- * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
- * constants defined in the above enum. The upper bits represent optional
- * set_mempolicy() or mbind() MPOL_F_* mode flags.
+ * Optional flags that modify nodemask numbering.
*/
-#define MPOL_FLAG_SHIFT (8)
-#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
-
-/* Flags for set_mempolicy */
-#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)
-#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES) /* legal set_mempolicy() MPOL_F_* flags */
+#define MPOL_F_RELATIVE_NODES (1<<14) /* remapped relative to cpuset */
+#define MPOL_F_STATIC_NODES (1<<15) /* unremapped physical masks */
+#define MPOL_MODE_FLAGS (MPOL_F_RELATIVE_NODES|MPOL_F_STATIC_NODES)
+ /* combined MPOL_F_* mask flags */

/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
@@ -128,14 +132,14 @@ static inline int mpol_equal(struct memp

#define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)

-static inline unsigned char mpol_mode(unsigned short mode)
+static inline unsigned short mpol_mode(unsigned short mode)
{
- return mode & MPOL_MODE_MASK;
+ return mode & ~MPOL_MODE_FLAGS;
}

static inline unsigned short mpol_flags(unsigned short mode)
{
- return mode & ~MPOL_MODE_MASK;
+ return mode & MPOL_MODE_FLAGS;
}

/*
@@ -201,7 +205,7 @@ static inline int mpol_equal(struct memp

#define mpol_set_vma_default(vma) do {} while(0)

-static inline unsigned char mpol_mode(unsigned short mode)
+static inline unsigned short mpol_mode(unsigned short mode)
{
return 0;
}
--- 2.6.24-mm1.orig/mm/mempolicy.c 2008-02-15 00:18:35.000000000 -0800
+++ 2.6.24-mm1/mm/mempolicy.c 2008-02-15 08:16:52.034431591 -0800
@@ -884,8 +884,6 @@ asmlinkage long sys_mbind(unsigned long

if (mpol_mode(mode) >= MPOL_MAX)
return -EINVAL;
- if (mpol_flags(mode) & ~MPOL_MODE_FLAGS)
- return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
@@ -898,13 +896,9 @@ asmlinkage long sys_set_mempolicy(int mo
{
int err;
nodemask_t nodes;
- unsigned short flags;

if (mpol_mode(mode) >= MPOL_MAX)
return -EINVAL;
- if (mpol_flags(mode) & ~MPOL_MODE_FLAGS)
- return -EINVAL;
- flags = mpol_flags(mode) & MPOL_MODE_FLAGS;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;


--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214

2008-02-15 23:55:46

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

On Fri, 15 Feb 2008, Paul Jackson wrote:

> --- 2.6.24-mm1.orig/include/linux/mempolicy.h 2008-02-15 00:11:10.000000000 -0800
> +++ 2.6.24-mm1/include/linux/mempolicy.h 2008-02-15 15:16:16.031031424 -0800
> @@ -8,6 +8,14 @@
> * Copyright 2003,2004 Andi Kleen SuSE Labs
> */
>
> +/*
> + * The 'policy' field of 'struct mempolicy' has both a mode and
> + * some flags packed into it. The flags (MPOL_F_* below) occupy
> + * the high bit positions (MPOL_MODE_FLAGS), and the mempolicy
> + * modes (the "Policies" below) are encoded in the remaining low
> + * bit positions.
> + */
> +
> /* Policies */
> enum {
> MPOL_DEFAULT,
> @@ -18,16 +26,12 @@ enum {
> };
>
> /*
> - * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
> - * constants defined in the above enum. The upper bits represent optional
> - * set_mempolicy() or mbind() MPOL_F_* mode flags.
> + * Optional flags that modify nodemask numbering.
> */
> -#define MPOL_FLAG_SHIFT (8)
> -#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
> -
> -/* Flags for set_mempolicy */
> -#define MPOL_F_STATIC_NODES (1 << MPOL_FLAG_SHIFT)
> -#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES) /* legal set_mempolicy() MPOL_F_* flags */
> +#define MPOL_F_RELATIVE_NODES (1<<14) /* remapped relative to cpuset */
> +#define MPOL_F_STATIC_NODES (1<<15) /* unremapped physical masks */
> +#define MPOL_MODE_FLAGS (MPOL_F_RELATIVE_NODES|MPOL_F_STATIC_NODES)
> + /* combined MPOL_F_* mask flags */
>
> /* Flags for get_mempolicy */
> #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
> @@ -128,14 +132,14 @@ static inline int mpol_equal(struct memp
>
> #define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
>
> -static inline unsigned char mpol_mode(unsigned short mode)
> +static inline unsigned short mpol_mode(unsigned short mode)
> {
> - return mode & MPOL_MODE_MASK;
> + return mode & ~MPOL_MODE_FLAGS;
> }
>
> static inline unsigned short mpol_flags(unsigned short mode)
> {
> - return mode & ~MPOL_MODE_MASK;
> + return mode & MPOL_MODE_FLAGS;
> }
>
> /*
> @@ -201,7 +205,7 @@ static inline int mpol_equal(struct memp
>
> #define mpol_set_vma_default(vma) do {} while(0)
>
> -static inline unsigned char mpol_mode(unsigned short mode)
> +static inline unsigned short mpol_mode(unsigned short mode)
> {
> return 0;
> }
> --- 2.6.24-mm1.orig/mm/mempolicy.c 2008-02-15 00:18:35.000000000 -0800
> +++ 2.6.24-mm1/mm/mempolicy.c 2008-02-15 08:16:52.034431591 -0800
> @@ -884,8 +884,6 @@ asmlinkage long sys_mbind(unsigned long
>
> if (mpol_mode(mode) >= MPOL_MAX)
> return -EINVAL;
> - if (mpol_flags(mode) & ~MPOL_MODE_FLAGS)
> - return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> @@ -898,13 +896,9 @@ asmlinkage long sys_set_mempolicy(int mo
> {
> int err;
> nodemask_t nodes;
> - unsigned short flags;
>
> if (mpol_mode(mode) >= MPOL_MAX)
> return -EINVAL;
> - if (mpol_flags(mode) & ~MPOL_MODE_FLAGS)
> - return -EINVAL;
> - flags = mpol_flags(mode) & MPOL_MODE_FLAGS;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
>

There's been significant changes in this area since my last posting, but I
agree that doing a slight variation of this is better.

On that topic, I am ready to post the updated patchset but I'd like to do
it with your bitmap_onto() patch so that I can fully implement and test
MPOL_F_RELATIVE_NODES. Do you know when the patch that adds
bitmap_onto(), which is a name I think I noticed you liking, will be
available?

David

2008-02-16 00:11:29

by Paul Jackson

[permalink] [raw]
Subject: Re: [patch 3/4] mempolicy: add MPOL_F_STATIC_NODES flag

> Do you know when the patch that adds
> bitmap_onto(), which is a name I think I noticed you liking, will be
> available?

Thanks in good part to your various questions regarding it, I realized
a significant error in the central piece of code in that patch, the
routine lib/bitmap.c:bitmap_onto(). I should be done in a few hours
reworking it, and will post it then.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.940.382.4214