Commit-ID: 3a0fea961b98d1838f35dba51a639d40f4a5589f
Gitweb: http://git.kernel.org/tip/3a0fea961b98d1838f35dba51a639d40f4a5589f
Author: Peter Zijlstra <[email protected]>
AuthorDate: Thu, 8 Mar 2012 17:56:08 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 18 May 2012 08:16:27 +0200
sched/numa: Introduce sys_numa_{t,m}bind()
Now that we have a NUMA process scheduler, provide a syscall
interface for finer granularity NUMA balancing. In particular
this allows setting up NUMA groups of threads and vmas within
a process.
For this we introduce two new syscalls:
sys_numa_tbind(int tid, int ng_id, unsigned long flags);
Bind a thread to a numa group, query its binding or create a new group:
sys_numa_tbind(tid, -1, 0); // create new group, return new ng_id
sys_numa_tbind(tid, -2, 0); // returns existing ng_id
sys_numa_tbind(tid, ng_id, 0); // set ng_id
Returns:
-ESRCH tid->task resolution failed
-EINVAL task didn't have an ng_id, or flags was wrong
-EPERM we don't have privileges over tid
sys_numa_mbind(unsigned long addr, unsigned long len, int ng_id, unsigned long flags);
Bind a memory region to a numa group.
sys_numa_mbind(addr, len, ng_id, 0);
create a non-mergable vma over [addr,addr+len) and assign an mpol
binding it to the numa group identified by ng_id.
These system calls together can create thread and vma groups which
will be scheduled as a single entity.
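A minimal userspace usage sketch (illustrative only -- it assumes the
x86-64 syscall numbers 312/313 added below and uses raw syscall(2),
since no libc wrappers exist for these calls):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define __NR_numa_mbind 312
#define __NR_numa_tbind 313

int main(void)
{
        size_t len = 64UL << 20;        /* 64M anonymous scratch area */
        void *addr;
        long ng_id;

        /* tid == 0 means the calling thread, ng_id == -1 creates a new group */
        ng_id = syscall(__NR_numa_tbind, 0, -1, 0UL);
        if (ng_id < 0) {
                perror("numa_tbind");
                return 1;
        }

        addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* tie the region's memory placement to the same group */
        if (syscall(__NR_numa_mbind, (unsigned long)addr, len,
                    (int)ng_id, 0UL) < 0) {
                perror("numa_mbind");
                return 1;
        }

        printf("thread and region bound to numa group %ld\n", ng_id);
        return 0;
}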
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Suresh Siddha <[email protected]>
Cc: Paul Turner <[email protected]>
Cc: Dan Smith <[email protected]>
Cc: Bharata B Rao <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Linus Torvalds <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/syscalls/syscall_32.tbl | 2 +
arch/x86/syscalls/syscall_64.tbl | 2 +
include/asm-generic/unistd.h | 6 +-
include/linux/mempolicy.h | 27 ++
include/linux/mm.h | 16 +
include/linux/sched.h | 2 +
include/linux/syscalls.h | 3 +
kernel/exit.c | 1 +
kernel/sched/numa.c | 646 ++++++++++++++++++++++++++++++++++++++
kernel/sys_ni.c | 4 +
mm/memory.c | 5 +-
mm/mempolicy.c | 8 +-
12 files changed, 718 insertions(+), 4 deletions(-)
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 29f9f05..38954c5 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,5 @@
346 i386 setns sys_setns
347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
+349 i386 numa_mbind sys_numa_mbind compat_sys_numa_mbind
+350 i386 numa_tbind sys_numa_tbind compat_sys_numa_tbind
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dd29a9e..63c5285 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -318,6 +318,8 @@
309 common getcpu sys_getcpu
310 64 process_vm_readv sys_process_vm_readv
311 64 process_vm_writev sys_process_vm_writev
+312 64 numa_mbind sys_numa_mbind
+313 64 numa_tbind sys_numa_tbind
#
# x32-specific system call numbers start at 512 to avoid cache impact
# for native 64-bit operation.
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index 991ef01..4dfef49 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -691,9 +691,13 @@ __SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \
#define __NR_process_vm_writev 271
__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
compat_sys_process_vm_writev)
+#define __NR_numa_mbind 272
+__SC_COMP(__NR_numa_mbind, sys_numa_mbind, compat_sys_numa_mbind)
+#define __NR_numa_tbind 273
+__SC_COMP(__NR_numa_tbind, sys_numa_tbind, compat_sys_numa_tbind)
#undef __NR_syscalls
-#define __NR_syscalls 272
+#define __NR_syscalls 274
/*
* All syscalls below here should go away really,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index af2983b..6e1029f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -79,6 +79,8 @@ enum mpol_rebind_step {
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/migrate.h>
+#include <linux/list.h>
+#include <linux/sched.h>
struct mm_struct;
@@ -110,6 +112,10 @@ struct mempolicy {
atomic_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
+ struct numa_group *numa_group;
+ struct list_head ng_entry;
+ struct vm_area_struct *vma;
+ struct rcu_head rcu;
union {
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave/bind */
@@ -397,6 +403,27 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
}
#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_NUMA
+
+extern void __numa_task_exit(struct task_struct *);
+extern void numa_vma_link(struct vm_area_struct *, struct vm_area_struct *);
+extern void numa_vma_unlink(struct vm_area_struct *);
+
+static inline void numa_task_exit(struct task_struct *p)
+{
+ if (p->numa_group)
+ __numa_task_exit(p);
+}
+
+#else /* CONFIG_NUMA */
+
+static inline void numa_task_exit(struct task_struct *p) { }
+static inline void numa_vma_link(struct vm_area_struct *new, struct vm_area_struct *old) { }
+static inline void numa_vma_unlink(struct vm_area_struct *vma) { }
+
+#endif /* CONFIG_NUMA */
+
#endif /* __KERNEL__ */
#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a89817e..eda8271 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1076,19 +1076,35 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
return (unsigned long)val;
}
+#ifdef CONFIG_NUMA
+extern void __numa_add_rss_counter(struct vm_area_struct *, int, long);
+
+static inline
+void numa_add_rss_counter(struct vm_area_struct *vma, int member, long value)
+{
+ if (vma->vm_policy) /* XXX sodding include dependencies */
+ __numa_add_rss_counter(vma, member, value);
+}
+#else /* !CONFIG_NUMA */
+static inline void numa_add_rss_counter(struct vm_area_struct *vma, int member, long value) { }
+#endif /* CONFIG_NUMA */
+
static inline void add_rss_counter(struct vm_area_struct *vma, int member, long value)
{
atomic_long_add(value, &vma->vm_mm->rss_stat.count[member]);
+ numa_add_rss_counter(vma, member, value);
}
static inline void inc_rss_counter(struct vm_area_struct *vma, int member)
{
atomic_long_inc(&vma->vm_mm->rss_stat.count[member]);
+ numa_add_rss_counter(vma, member, 1);
}
static inline void dec_rss_counter(struct vm_area_struct *vma, int member)
{
atomic_long_dec(&vma->vm_mm->rss_stat.count[member]);
+ numa_add_rss_counter(vma, member, -1);
}
static inline unsigned long get_mm_rss(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6e535..024a5f9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1512,6 +1512,8 @@ struct task_struct {
short il_next;
short pref_node_fork;
int node;
+ struct numa_group *numa_group;
+ struct list_head ng_entry;
#endif
struct rcu_head rcu;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3de3acb..9c6f47a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -857,5 +857,8 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
const struct iovec __user *rvec,
unsigned long riovcnt,
unsigned long flags);
+asmlinkage long sys_numa_mbind(unsigned long addr, unsigned long len,
+ int ng_id, unsigned long flags);
+asmlinkage long sys_numa_tbind(int tid, int ng_id, unsigned long flags);
#endif
diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b42..a0c1b54 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1018,6 +1018,7 @@ void do_exit(long code)
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
task_unlock(tsk);
+ numa_task_exit(tsk);
#endif
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
diff --git a/kernel/sched/numa.c b/kernel/sched/numa.c
index 1d2837d..af123aa 100644
--- a/kernel/sched/numa.c
+++ b/kernel/sched/numa.c
@@ -14,6 +14,7 @@
#include <linux/mempolicy.h>
#include <linux/kthread.h>
+#include <linux/compat.h>
#include "sched.h"
@@ -842,3 +843,648 @@ static __init int numa_init(void)
return 0;
}
early_initcall(numa_init);
+
+
+/*
+ * numa_group bits
+ */
+
+#include <linux/idr.h>
+#include <linux/srcu.h>
+#include <linux/syscalls.h>
+
+struct numa_group {
+ spinlock_t lock;
+ int id;
+
+ struct mm_rss_stat rss;
+
+ struct list_head tasks;
+ struct list_head vmas;
+
+ const struct cred *cred;
+ atomic_t ref;
+
+ struct numa_entity numa_entity;
+
+ struct rcu_head rcu;
+};
+
+static struct srcu_struct ng_srcu;
+
+static DEFINE_MUTEX(numa_group_idr_lock);
+static DEFINE_IDR(numa_group_idr);
+
+static inline struct numa_group *ne_ng(struct numa_entity *ne)
+{
+ return container_of(ne, struct numa_group, numa_entity);
+}
+
+static inline bool ng_tryget(struct numa_group *ng)
+{
+ return atomic_inc_not_zero(&ng->ref);
+}
+
+static inline void ng_get(struct numa_group *ng)
+{
+ atomic_inc(&ng->ref);
+}
+
+static void __ng_put_rcu(struct rcu_head *rcu)
+{
+ struct numa_group *ng = container_of(rcu, struct numa_group, rcu);
+
+ put_cred(ng->cred);
+ kfree(ng);
+}
+
+static void __ng_put(struct numa_group *ng)
+{
+ mutex_lock(&numa_group_idr_lock);
+ idr_remove(&numa_group_idr, ng->id);
+ mutex_unlock(&numa_group_idr_lock);
+
+ WARN_ON(!list_empty(&ng->tasks));
+ WARN_ON(!list_empty(&ng->vmas));
+
+ dequeue_ne(&ng->numa_entity);
+
+ call_rcu(&ng->rcu, __ng_put_rcu);
+}
+
+static inline void ng_put(struct numa_group *ng)
+{
+ if (atomic_dec_and_test(&ng->ref))
+ __ng_put(ng);
+}
+
+/*
+ * numa_ops
+ */
+
+static unsigned long numa_group_mem_load(struct numa_entity *ne)
+{
+ struct numa_group *ng = ne_ng(ne);
+
+ return atomic_long_read(&ng->rss.count[MM_ANONPAGES]);
+}
+
+static unsigned long numa_group_cpu_load(struct numa_entity *ne)
+{
+ struct numa_group *ng = ne_ng(ne);
+ unsigned long load = 0;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &ng->tasks, ng_entry)
+ load += p->numa_contrib;
+ rcu_read_unlock();
+
+ return load;
+}
+
+static void numa_group_mem_migrate(struct numa_entity *ne, int node)
+{
+ struct numa_group *ng = ne_ng(ne);
+ struct vm_area_struct *vma;
+ struct mempolicy *mpol;
+ struct mm_struct *mm;
+ int idx;
+
+ /*
+ * Horrid code this..
+ *
+ * The main problem is that ng->lock nests inside mmap_sem [
+ * numa_vma_{,un}link() gets called under mmap_sem ]. But here we need
+ * to iterate that list and acquire mmap_sem for each entry.
+ *
+ * We start here with no locks held. numa_vma_unlink() is used to add
+ * an SRCU delayed reference count to the mpols. This allows us to do
+ * lockless iteration of the list.
+ *
+ * Once we have an mpol we need to acquire mmap_sem, this too isn't
+ * straight fwd, take ng->lock to pin mpol->vma due to its
+ * serialization against numa_vma_unlink(). While that vma pointer is
+ * stable the vma->vm_mm pointer must be good too, so acquire an extra
+ * reference to the mm.
+ *
+ * This reference keeps mm stable so we can drop ng->lock and acquire
+ * mmap_sem. After which mpol->vma is stable again since the memory map
+ * is stable. So verify ->vma is still good (numa_vma_unlink clears it)
+ * and the mm is still the same (paranoia, can't see how that could
+ * happen).
+ */
+
+ idx = srcu_read_lock(&ng_srcu);
+ list_for_each_entry_rcu(mpol, &ng->vmas, ng_entry) {
+ nodemask_t mask = nodemask_of_node(node);
+
+ spin_lock(&ng->lock); /* pin mpol->vma */
+ vma = mpol->vma;
+ if (!vma) {
+ spin_unlock(&ng->lock);
+ continue;
+ }
+ mm = vma->vm_mm;
+ atomic_inc(&mm->mm_users); /* pin mm */
+ spin_unlock(&ng->lock);
+
+ down_read(&mm->mmap_sem);
+ vma = mpol->vma;
+ if (!vma)
+ goto unlock_next;
+
+ mpol_rebind_policy(mpol, &mask, MPOL_REBIND_ONCE);
+ lazy_migrate_vma(vma, node);
+unlock_next:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+ srcu_read_unlock(&ng_srcu, idx);
+}
+
+static void numa_group_cpu_migrate(struct numa_entity *ne, int node)
+{
+ struct numa_group *ng = ne_ng(ne);
+ struct task_struct *p;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &ng->tasks, ng_entry)
+ sched_setnode(p, node);
+ rcu_read_unlock();
+}
+
+static bool numa_group_can_migrate(struct numa_entity *ne, int node)
+{
+ struct numa_group *ng = ne_ng(ne);
+ struct task_struct *t;
+ bool allowed = false;
+ u64 runtime = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(t, &ng->tasks, ng_entry) {
+ allowed = __task_can_migrate(t, &runtime, node);
+ if (!allowed)
+ break;
+ }
+ rcu_read_unlock();
+
+ /*
+ * Don't bother migrating memory if there's less than 1 second
+ * of runtime on the tasks.
+ */
+ return allowed && runtime > NSEC_PER_SEC;
+}
+
+static bool numa_group_tryget(struct numa_entity *ne)
+{
+ /*
+ * See process_tryget(), similar but against ng_put().
+ */
+ return ng_tryget(ne_ng(ne));
+}
+
+static void numa_group_put(struct numa_entity *ne)
+{
+ ng_put(ne_ng(ne));
+}
+
+static const struct numa_ops numa_group_ops = {
+ .mem_load = numa_group_mem_load,
+ .cpu_load = numa_group_cpu_load,
+
+ .mem_migrate = numa_group_mem_migrate,
+ .cpu_migrate = numa_group_cpu_migrate,
+
+ .can_migrate = numa_group_can_migrate,
+
+ .tryget = numa_group_tryget,
+ .put = numa_group_put,
+};
+
+static struct numa_group *lock_p_ng(struct task_struct *p)
+{
+ struct numa_group *ng;
+
+ for (;;) {
+ ng = ACCESS_ONCE(p->numa_group);
+ if (!ng)
+ return NULL;
+
+ spin_lock(&ng->lock);
+ if (p->numa_group == ng)
+ break;
+ spin_unlock(&ng->lock);
+ }
+
+ return ng;
+}
+
+void __numa_task_exit(struct task_struct *p)
+{
+ struct numa_group *ng;
+
+ ng = lock_p_ng(p);
+ if (ng) {
+ list_del_rcu(&p->ng_entry);
+ p->numa_group = NULL;
+ spin_unlock(&ng->lock);
+
+ ng_put(ng);
+ }
+}
+
+/*
+ * memory (vma) accounting/tracking
+ *
+ * We assume a 1:1 relation between vmas and mpols and keep a list of mpols in
+ * the numa_group, and a vma backlink in the mpol.
+ */
+
+void numa_vma_link(struct vm_area_struct *new, struct vm_area_struct *old)
+{
+ struct numa_group *ng = NULL;
+
+ if (old && old->vm_policy)
+ ng = old->vm_policy->numa_group;
+
+ if (!ng && new->vm_policy)
+ ng = new->vm_policy->numa_group;
+
+ if (!ng)
+ return;
+
+ ng_get(ng);
+ new->vm_policy->numa_group = ng;
+ new->vm_policy->vma = new;
+
+ spin_lock(&ng->lock);
+ list_add_rcu(&new->vm_policy->ng_entry, &ng->vmas);
+ spin_unlock(&ng->lock);
+}
+
+void __numa_add_rss_counter(struct vm_area_struct *vma, int member, long value)
+{
+ /*
+ * Since the caller passes the vma argument, the caller is responsible
+ * for making sure the vma is stable, hence the ->vm_policy->numa_group
+ * dereference is safe. (caller usually has vma->vm_mm->mmap_sem for
+ * reading).
+ */
+ if (vma->vm_policy->numa_group)
+ atomic_long_add(value, &vma->vm_policy->numa_group->rss.count[member]);
+}
+
+static void __mpol_put_rcu(struct rcu_head *rcu)
+{
+ struct mempolicy *mpol = container_of(rcu, struct mempolicy, rcu);
+ mpol_put(mpol);
+}
+
+void numa_vma_unlink(struct vm_area_struct *vma)
+{
+ struct mempolicy *mpol;
+ struct numa_group *ng;
+
+ if (!vma)
+ return;
+
+ mpol = vma->vm_policy;
+ if (!mpol)
+ return;
+
+ ng = mpol->numa_group;
+ if (!ng)
+ return;
+
+ spin_lock(&ng->lock);
+ list_del_rcu(&mpol->ng_entry);
+ /*
+ * Ridiculous, see numa_group_mem_migrate.
+ */
+ mpol->vma = NULL;
+ mpol_get(mpol);
+ call_srcu(&ng_srcu, &mpol->rcu, __mpol_put_rcu);
+ spin_unlock(&ng->lock);
+
+ ng_put(ng);
+}
+
+/*
+ * syscall bits
+ */
+
+#define MS_ID_GET -2
+#define MS_ID_NEW -1
+
+static struct numa_group *ng_create(struct task_struct *p)
+{
+ struct numa_group *ng;
+ int node, err;
+
+ ng = kzalloc(sizeof(*ng), GFP_KERNEL);
+ if (!ng)
+ goto fail;
+
+ err = idr_pre_get(&numa_group_idr, GFP_KERNEL);
+ if (!err)
+ goto fail_alloc;
+
+ mutex_lock(&numa_group_idr_lock);
+ err = idr_get_new(&numa_group_idr, ng, &ng->id);
+ mutex_unlock(&numa_group_idr_lock);
+
+ if (err)
+ goto fail_alloc;
+
+ spin_lock_init(&ng->lock);
+ atomic_set(&ng->ref, 1);
+ ng->cred = get_task_cred(p);
+ INIT_LIST_HEAD(&ng->tasks);
+ INIT_LIST_HEAD(&ng->vmas);
+ init_ne(&ng->numa_entity, &numa_group_ops);
+
+ dequeue_ne(&p->mm->numa);
+ node = find_idlest_node(tsk_home_node(p));
+ enqueue_ne(&ng->numa_entity, node);
+
+ return ng;
+
+fail_alloc:
+ kfree(ng);
+fail:
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * More or less equal to ptrace_may_access(); XXX
+ */
+static int ng_allowed(struct numa_group *ng, struct task_struct *p)
+{
+ const struct cred *cred = ng->cred, *tcred;
+
+ rcu_read_lock();
+ tcred = __task_cred(p);
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ goto ok;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+ goto ok;
+ rcu_read_unlock();
+ return -EPERM;
+
+ok:
+ rcu_read_unlock();
+ return 0;
+}
+
+static struct numa_group *ng_lookup(int ng_id, struct task_struct *p)
+{
+ struct numa_group *ng;
+
+ rcu_read_lock();
+again:
+ ng = idr_find(&numa_group_idr, ng_id);
+ if (!ng) {
+ rcu_read_unlock();
+ return ERR_PTR(-EINVAL);
+ }
+ if (ng_allowed(ng, p)) {
+ rcu_read_unlock();
+ return ERR_PTR(-EPERM);
+ }
+ if (!ng_tryget(ng))
+ goto again;
+ rcu_read_unlock();
+
+ return ng;
+}
+
+static int ng_task_assign(struct task_struct *p, int ng_id)
+{
+ struct numa_group *old_ng, *ng;
+
+ ng = ng_lookup(ng_id, p);
+ if (IS_ERR(ng))
+ return PTR_ERR(ng);
+
+ old_ng = lock_p_ng(p);
+ if (old_ng) {
+ /*
+ * Special numa_group that assists in serializing the
+ * p->numa_group hand-over. Assume concurrent ng_task_assign()
+ * invocation, only one can remove the old_ng, but both need
+ * to serialize against RCU.
+ *
+ * Therefore we cannot clear p->numa_group, this would lead to
+ * the second not observing old_ng and thus missing the RCU
+ * sync.
+ *
+ * We also cannot set p->numa_group to ng, since then we'd
+ * try to remove ourselves from a list we're not on yet --
+ * double list_del_rcu() invocation.
+ *
+ * Solve this by using this special intermediate numa_group,
+ * we set p->numa_group to this object so that the second
+ * observes a !NULL numa_group, however we skip the
+ * list_del_rcu() when we find this special group avoiding the
+ * double delete.
+ */
+ static struct numa_group __ponies = {
+ .lock = __SPIN_LOCK_UNLOCKED(__ponies.lock),
+ };
+
+ if (likely(old_ng != &__ponies)) {
+ list_del_rcu(&p->ng_entry);
+ p->numa_group = &__ponies;
+ }
+ spin_unlock(&old_ng->lock);
+
+ if (unlikely(old_ng == &__ponies))
+ old_ng = NULL; /* avoid ng_put() */
+
+ /*
+ * We have to wait for the old ng_entry users to go away before
+ * we can re-use the link entry for the new list.
+ */
+ synchronize_rcu();
+ }
+
+ spin_lock(&ng->lock);
+ p->numa_group = ng;
+ list_add_rcu(&p->ng_entry, &ng->tasks);
+ spin_unlock(&ng->lock);
+
+ sched_setnode(p, ng->numa_entity.node);
+
+ if (old_ng)
+ ng_put(old_ng);
+
+ return ng_id;
+}
+
+static struct task_struct *find_get_task(pid_t tid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ if (!tid)
+ p = current;
+ else
+ p = find_task_by_vpid(tid);
+
+ if (p->flags & PF_EXITING)
+ p = NULL;
+
+ if (p)
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (!p)
+ return ERR_PTR(-ESRCH);
+
+ return p;
+}
+
+/*
+ * Bind a thread to a numa group or query its binding or create a new group.
+ *
+ * sys_numa_tbind(tid, -1, 0); // create new group, return new ng_id
+ * sys_numa_tbind(tid, -2, 0); // returns existing ng_id
+ * sys_numa_tbind(tid, ng_id, 0); // set ng_id
+ *
+ * Returns:
+ * -ESRCH tid->task resolution failed
+ * -EINVAL task didn't have an ng_id, or flags was wrong
+ * -EPERM we don't have privileges over tid
+ *
+ */
+SYSCALL_DEFINE3(numa_tbind, int, tid, int, ng_id, unsigned long, flags)
+{
+ struct task_struct *p = find_get_task(tid);
+ struct numa_group *ng = NULL;
+ int orig_ng_id = ng_id;
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ if (flags) {
+ ng_id = -EINVAL;
+ goto out;
+ }
+
+ switch (ng_id) {
+ case MS_ID_GET:
+ ng_id = -EINVAL;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ ng_id = ng->id;
+ rcu_read_unlock();
+ break;
+
+ case MS_ID_NEW:
+ ng = ng_create(p);
+ if (IS_ERR(ng)) {
+ ng_id = PTR_ERR(ng);
+ break;
+ }
+ ng_id = ng->id;
+ /* fall through */
+
+ default:
+ ng_id = ng_task_assign(p, ng_id);
+ if (ng && orig_ng_id < 0)
+ ng_put(ng);
+ break;
+ }
+
+out:
+ put_task_struct(p);
+ return ng_id;
+}
+
+/*
+ * Bind a memory region to a numa group.
+ *
+ * sys_numa_mbind(addr, len, ng_id, 0);
+ *
+ * create a non-mergable vma over [addr,addr+len) and assign a mpol binding it
+ * to the numa group identified by ng_id.
+ *
+ */
+SYSCALL_DEFINE4(numa_mbind, unsigned long, addr, unsigned long, len,
+ int, ng_id, unsigned long, flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct mempolicy *mpol;
+ struct numa_group *ng;
+ nodemask_t mask;
+ int err = 0;
+
+ if (flags)
+ return -EINVAL;
+
+ if (addr & ~PAGE_MASK)
+ return -EINVAL;
+
+ ng = ng_lookup(ng_id, current);
+ if (IS_ERR(ng))
+ return PTR_ERR(ng);
+
+ mask = nodemask_of_node(ng->numa_entity.node);
+ mpol = mpol_new(MPOL_BIND, 0, &mask);
+ if (!mpol) {
+ ng_put(ng);
+ return -ENOMEM;
+ }
+ mpol->flags |= MPOL_MF_LAZY;
+ mpol->numa_group = ng;
+
+ down_write(&mm->mmap_sem);
+ err = mpol_do_mbind(addr, len, mpol, MPOL_BIND,
+ &mask, MPOL_MF_MOVE|MPOL_MF_LAZY);
+ up_write(&mm->mmap_sem);
+ mpol_put(mpol);
+ ng_put(ng);
+
+ if (!err) {
+ /*
+ * There's a small overlap between ng and mm here, we only
+ * remove the mm after we associate the VMAs with the ng. Since
+ * lazy_migrate_vma() checks the mempolicy hierarchy this works
+ * out fine.
+ */
+ dequeue_ne(&mm->numa);
+ }
+
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+
+asmlinkage long compat_sys_numa_mbind(compat_ulong_t addr, compat_ulong_t len,
+ compat_int_t ng_id, compat_ulong_t flags)
+{
+ return sys_numa_mbind(addr, len, ng_id, flags);
+}
+
+asmlinkage long compat_sys_numa_tbind(compat_int_t tid, compat_int_t ng_id,
+ compat_ulong_t flags)
+{
+ return sys_numa_tbind(tid, ng_id, flags);
+}
+
+#endif /* CONFIG_COMPAT */
+
+static __init int numa_group_init(void)
+{
+ init_srcu_struct(&ng_srcu);
+ return 0;
+}
+early_initcall(numa_group_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16..3f5a59f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -103,6 +103,10 @@ cond_syscall(sys_set_mempolicy);
cond_syscall(compat_sys_mbind);
cond_syscall(compat_sys_get_mempolicy);
cond_syscall(compat_sys_set_mempolicy);
+cond_syscall(sys_numa_mbind);
+cond_syscall(compat_sys_numa_mbind);
+cond_syscall(sys_numa_tbind);
+cond_syscall(compat_sys_numa_tbind);
cond_syscall(sys_add_key);
cond_syscall(sys_request_key);
cond_syscall(sys_keyctl);
diff --git a/mm/memory.c b/mm/memory.c
index 6b39551c..4614be0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -144,9 +144,10 @@ static void add_rss_counter_fast(struct vm_area_struct *vma, int member, int val
{
struct task_struct *task = current;
- if (likely(task->mm == vma->vm_mm))
+ if (likely(task->mm == vma->vm_mm)) {
task->rss_stat.count[member] += val;
- else
+ numa_add_rss_counter(vma, member, val);
+ } else
add_rss_counter(vma, member, val);
}
#define inc_rss_counter_fast(vma, member) add_rss_counter_fast(vma, member, 1)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index df29149..6cdfdba 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -291,12 +291,13 @@ struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ policy = kmem_cache_alloc(policy_cache, GFP_KERNEL | __GFP_ZERO);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
+ INIT_LIST_HEAD(&policy->ng_entry);
return policy;
}
@@ -611,6 +612,9 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
if (!err) {
mpol_get(new);
vma->vm_policy = new;
+ numa_vma_link(vma, NULL);
+ if (old)
+ numa_vma_unlink(old->vma);
mpol_put(old);
}
return err;
@@ -2063,11 +2067,13 @@ int vma_dup_policy(struct vm_area_struct *new, struct vm_area_struct *old)
if (IS_ERR(mpol))
return PTR_ERR(mpol);
vma_set_policy(new, mpol);
+ numa_vma_link(new, old);
return 0;
}
void vma_put_policy(struct vm_area_struct *vma)
{
+ numa_vma_unlink(vma);
mpol_put(vma_policy(vma));
}
On Fri, 18 May 2012, Rik van Riel wrote:
> I like your code for handling smaller processes in NUMA
> systems, but we do need to have a serious discussion on
> how to handle processes that do not fit in one node.
The home node seems to be associated with a thread and not a process. So
we would be able to have multiple home nodes per process.
The whole NUMA policy thing is already quite complex and this will
increase that complexity somewhat. Wish we could simplify things somehow.
On Fri, 2012-05-18 at 10:25 -0500, Christoph Lameter wrote:
> On Fri, 18 May 2012, Rik van Riel wrote:
>
> > I like your code for handling smaller processes in NUMA
> > systems, but we do need to have a serious discussion on
> > how to handle processes that do not fit in one node.
>
> The home node seems to be associated with a thread and not a process. So
> we would be able to have multiple home nodes per process.
It's set the same for every thread in a process, unless you use the new
system calls to carve it up in pieces.
> The whole NUMA policy thing is already quite complex and this will
> increase that complexity somewhat. Wish we could simplify things somehow.
Most of that is due to existing interfaces, I'm afraid we cannot
simplify without reducing those :/
On Fri, 2012-05-18 at 11:14 -0400, Rik van Riel wrote:
>
> I am not convinced this is the right way forward.
Well, you don't have to use it.. but you can.
> While this may work well for programs written in languages
> with pointers, and for virtual machines, I do not see how
> eg. a JVM could provide useful hints to the kernel, because
> the Java program running on top has no idea about the
> memory addresses of its objects, and the Java language has
> no way to hint which thread will be the predominant user
> of an object.
This is one of the many reasons why you'll never see me use Java or any
other 'managed' language.
> I like your code for handling smaller processes in NUMA
> systems, but we do need to have a serious discussion on
> how to handle processes that do not fit in one node.
>
> The more I think about it, the more Andrea's code looks
> like it might be the more flexible way forward.
I still have serious concerns about his approach; it very much assumes
there's a temporal page<->thread relation to exploit. This might not at
all be true for some programs (including JVM) that have hardly any data
separation and just point chase their way around the entire object set.
Consider a database where each thread is basically scanning the global
data-set at random.
Furthermore, virtual machines actively counter this assumption: a guest
scheduler will move tasks between vcpus at will -- for the host this
effectively scrambles any page<->thread (vcpu) relation that might have
existed.
I've also yet to see a coherent patch-set from Andrea with coherent
changelogs and comments (this very much precludes PDFs of any kind).
And you know I detest the way he cobbled his way around the scheduler
instead of integrating it properly. And I'll very much not accept
anything like the spaghetti code anywhere near the scheduler I have to
maintain. (I recently looked at the THP code and if that's the standard
I'm not having it).
That said, I'll leave the door open and will consider something like his
scanning stuff as an optional add-on on top of this.
I very much believe in doing the simple thing first, and this is that,
this is much like what the userspace daemons try to do and many of the
traditional Unixes have also done (albeit many of those didn't go the
way of home-node migration).
> Another topic to discuss is whether we want lazy
> migrate-on-fault, or if we want to keep the program
> spend its time running, using another (idle) core to
> do the migration in the background.
I've also said many times over that I absolutely detest all the async
stuff because it messes up accounting. And until someone comes up with a
sane means of sorting that I'll stick to migrate-on-fault.
You also assume there's idle time elsewhere. This isn't true in general.
On Fri, 18 May 2012, Peter Zijlstra wrote:
> Most of that is due to existing interfaces, I'm afraid we cannot
> simplify without reducing those :/
I think some change to those would also be appropriate. Those have grown
over a long time and there are some areas of problematic integration with
the scheduler, numa policies and cpusets etc.
On Fri, 2012-05-18 at 17:35 +0200, Peter Zijlstra wrote:
> I've also said many times over that I absolutely detest all the async
> stuff because it messes up accounting. And until someone comes up with a
> sane means of sorting that I'll stick to migrate-on-fault.
The other nice advantage of migrate-on-fault is that you don't have to
play lifetime games with vmas. This much simplifies that aspect.
Again, simple things first.
On 05/18/2012 06:42 AM, tip-bot for Peter Zijlstra wrote:
> Now that we have a NUMA process scheduler, provide a syscall
> interface for finer granularity NUMA balancing. In particular
> this allows setting up NUMA groups of threads and vmas within
> a process.
>
> For this we introduce two new syscalls:
>
> sys_numa_tbind(int tid, int ng_id, unsigned long flags);
>
> Bind a thread to a numa group, query its binding or create a new group:
>
> sys_numa_tbind(tid, -1, 0); // create new group, return new ng_id
> sys_numa_tbind(tid, -2, 0); // returns existing ng_id
> sys_numa_tbind(tid, ng_id, 0); // set ng_id
I am not convinced this is the right way forward.
While this may work well for programs written in languages
with pointers, and for virtual machines, I do not see how
eg. a JVM could provide useful hints to the kernel, because
the Java program running on top has no idea about the
memory addresses of its objects, and the Java language has
no way to hint which thread will be the predominant user
of an object.
I like your code for handling smaller processes in NUMA
systems, but we do need to have a serious discussion on
how to handle processes that do not fit in one node.
The more I think about it, the more Andrea's code looks
like it might be the more flexible way forward.
Another topic to discuss is whether we want lazy
migrate-on-fault, or if we want to keep the program
spend its time running, using another (idle) core to
do the migration in the background.
--
All rights reversed
On Fri, 2012-05-18 at 10:37 -0500, Christoph Lameter wrote:
> On Fri, 18 May 2012, Peter Zijlstra wrote:
>
> > Most of that is due to existing interfaces, I'm afraid we cannot
> > simplify without reducing those :/
>
> I think some change to those would also be appropriate. Those have grown
> over a long time and there are some areas of problematic integration with
> the scheduler, numa policies and cpusets etc.
You're free to explore that avenue.. But afaict all these interfaces are
in active use.
And yes, some of it is rather awkward, but such is the price we pay for
backwards compatibility.
On Fri, 18 May 2012, Peter Zijlstra wrote:
> On Fri, 2012-05-18 at 17:35 +0200, Peter Zijlstra wrote:
> > I've also said many times over that I absolutely detest all the async
> > stuff because it messes up accounting. And until someone comes up with a
> > sane means of sorting that I'll stick to migrate-on-fault.
>
> The other nice advantage of migrate-on-fault is that you don't have to
> play lifetime games with vmas. This much simplifies that aspect.
The problem with migrate on fault in the past has been that there was no
consistent benefit from the overhead added to the system. Useless page
migration is a bit expensive.
On 05/18/2012 11:35 AM, Peter Zijlstra wrote:
> On Fri, 2012-05-18 at 11:14 -0400, Rik van Riel wrote:
>> While this may work well for programs written in languages
>> with pointers, and for virtual machines, I do not see how
>> eg. a JVM could provide useful hints to the kernel, because
>> the Java program running on top has no idea about the
>> memory addresses of its objects, and the Java language has
>> no way to hint which thread will be the predominant user
>> of an object.
>
> This is one of the many reasons why you'll never see me use Java or any
> other 'managed' language.
You are hardly a typical user, though :)
Whether we like it or not, managed runtimes are here,
and people are using them in droves.
I believe that the kernel should be able to handle
NUMA placement for such uses.
>> I like your code for handling smaller processes in NUMA
>> systems, but we do need to have a serious discussion on
>> how to handle processes that do not fit in one node.
>>
>> The more I think about it, the more Andrea's code looks
>> like it might be the more flexible way forward.
>
> I still have serious concerns about his approach; it very much assumes
> there's a temporal page<->thread relation to exploit. This might not at
> all be true for some programs (including JVM) that have hardly any data
> separation and just point chase their way around the entire object set.
Neither his approach nor your approach will be able to
help these workloads. I do not see how that should be
counted against Andrea's approach, though, since it
does seem to be useful for sane workloads.
> I've also yet to see a coherent patch-set from Andrea with coherent
> changelogs and comments (this very much precludes PDFs of any kind).
>
> And you know I detest the way he cobbled his way around the scheduler
> instead of integrating it properly. And I'll very much not accept
> anything like the spaghetti code anywhere near the scheduler I have to
> maintain. (I recently looked at the THP code and if that's the standard
> I'm not having it).
>
> That said, I'll leave the door open and will consider something like his
> scanning stuff as an optional add-on on top of this.
No argument there. Your scheduler integration is nicer,
and Andrea's code could stand some documentation.
> I very much believe in doing the simple thing first, and this is that,
Leave out your syscalls (which might not be useful for
managed runtimes), and you actually have the simple
thing :)
I am all in favor of simplicity, and doing one thing
at a time.
--
All rights reversed
On Fri, 2012-05-18 at 10:47 -0500, Christoph Lameter wrote:
> On Fri, 18 May 2012, Peter Zijlstra wrote:
>
> > On Fri, 2012-05-18 at 17:35 +0200, Peter Zijlstra wrote:
> > > I've also said many times over that I absolutely detest all the async
> > > stuff because it messes up accounting. And until someone comes up with a
> > > sane means of sorting that I'll stick to migrate-on-fault.
> >
> > The other nice advantage of migrate-on-fault is that you don't have to
> > play lifetime games with vmas. This much simplifies that aspect.
>
> The problem with migrate on fault in the past has been that there was no
> consistent benefit from the overhead added to the system. Useless page
> migration is a bit expensive.
I'm not sure I follow.. having the page local is a win, presuming you
can limit the migration rate to something low in relation to the cost of
remote accesses.
How does it matter how you migrate?
On Fri, 18 May 2012, Peter Zijlstra wrote:
> On Fri, 2012-05-18 at 10:47 -0500, Christoph Lameter wrote:
> > On Fri, 18 May 2012, Peter Zijlstra wrote:
> >
> > > On Fri, 2012-05-18 at 17:35 +0200, Peter Zijlstra wrote:
> > > > I've also said many times over that I absolutely detest all the async
> > > > stuff because it messes up accounting. And until someone comes up with a
> > > > sane means of sorting that I'll stick to migrate-on-fault.
> > >
> > > The other nice advantage of migrate-on-fault is that you don't have to
> > > play lifetime games with vmas. This much simplifies that aspect.
> >
> > The problem with migrate on fault in the past has been that there was no
> > consistent benefit from the overhead added to the system. Useless page
> > migration is a bit expensive.
>
> I'm not sure I follow.. having the page local is a win, presuming you
> can limit the migration rate to something low in relation to the cost of
> remote accesses.
Having the page local is a win if there are a sufficient number of
accesses to amortize the effort to move the page. Given the expensive
nature of page migration there would need to be a large number of accesses
to a page to justify the effort.
> How does it matter how you migrate?
Migrate on fault incurs two types of costs:
1. Unmapping. This results in additional faults to reestablish the ptes.
2. Actual lazy migrate. More faults. Now the page needs to be copied to
the new node and the actual migration work is done.
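Stated as a rough, illustrative inequality, lazy migration pays off only
when

        n_remote_misses * (remote_access_cost - local_access_cost) >
                unmap_cost + extra_fault_cost + copy_cost

where only references that actually miss the caches and go to remote
memory count on the left-hand side.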
On Fri, 2012-05-18 at 11:00 -0500, Christoph Lameter wrote:
> Having the page local is a win if there are a sufficient number of
> accesses to amortize the effort to move the page. Given the expensive
> nature of page migration there would need to be a large number of accesses
> to a page to justify the effort.
Right, but this is true of any migration scheme and not specific to MoF.
> > How does it matter how you migrate?
>
> Migrate on fault incurs two types of costs:
>
> 1. Unmapping. This results in additional faults to reestablish the ptes.
>
> 2. Actual lazy migrate. More faults. Now the page needs to be copied to
> the new node and the actual migration work is done.
Nah, only the 1 fault is extra. Regular migration already needs to unmap
and copy and reinstate, so the only extra work is the fault to trigger
it.
On Fri, 2012-05-18 at 11:48 -0400, Rik van Riel wrote:
> Whether we like it or not, managed runtimes are here,
> and people are using them in droves.
>
> I believe that the kernel should be able to handle
> NUMA placement for such uses.
Possibly, but it would also help if runtimes grew the capability to
express such relations. That said.. I never said we shouldn't/couldn't
help them eventually.
> > I still have serious concerns about his approach; it very much assumes
> > there's a temporal page<->thread relation to exploit. This might not at
> > all be true for some programs (including JVM) that have hardly any data
> > separation and just point chase their way around the entire object set.
>
> Neither his approach or your approach will be able to
> help these workloads. I do not see how that should be
> counted against Andrea's approach, though, since it
> does seem to be useful for sane workloads.
Well, if you have a sane workload you often already have strong data
separation in your application, so telling the kernel about it isn't too
much bother, right?
The thing is, I've been traumatized by too much exposure to the
auto-parallelization crazies. I've never seen auto parallelization worth
using, despite lots of presumably smart people wanting to make it
happen.
> > I very much believe in doing the simple thing first, and this is that,
>
> Leave out your syscalls (which might not be useful for
> managed runtimes), and you actually have the simple
> thing :)
Right, but the virt people could actually trivially use those, and vnuma
doesn't have the scrambling issue outlined earlier since the guest kernel
would also try to keep home-node affinity.
Avi already said patching kvm would be like 5 minutes work.
It also absolutely avoids the false sharing issue otherwise present with
per-cpu memory, since you explicitly tell it where it belongs.
On Fri, 18 May 2012, Peter Zijlstra wrote:
> On Fri, 2012-05-18 at 11:00 -0500, Christoph Lameter wrote:
>
> > Having the page local is a win if there are a sufficient number of
> > accesses to amortize the effort to move the page. Given the expensive
> > nature of page migration there would need to be a large number of accesses
> > to a page to justify the effort.
>
> Right, but this is true of any migration scheme and not specific to MoF.
Yes and the concern about auto migration schemes not improving performance
in general applies to all migration schemes.
On Fri, May 18, 2012 at 6:14 PM, Rik van Riel <[email protected]> wrote:
> I am not convinced this is the right way forward.
>
> While this may work well for programs written in languages
> with pointers, and for virtual machines, I do not see how
> eg. a JVM could provide useful hints to the kernel, because
> the Java program running on top has no idea about the
> memory addresses of its objects, and the Java language has
> no way to hint which thread will be the predominant user
> of an object.
True.
OTOH, while autonuma seems to be better for the JVM, numa sched is a
significant improvement over current code:
http://lkml.indiana.edu/hypermail/linux/kernel/1204.0/01200.html
Why is that?
* Peter Zijlstra <[email protected]> wrote:
> [...]
>
> That said, I'll leave the door open and will consider
> something like his scanning stuff as an optional add-on on top
> of this.
>
> I very much believe in doing the simple thing first, and this
> is that, this is much like what the userspace daemons try to
> do and many of the traditional Unixes have also done (albeit
> many of those didn't go the way of home-node migration).
Probably because those Unices didn't survive long enough to reach
NUMA machines in earnest. SMP was a big deal already.
Thanks,
Ingo
* Peter Zijlstra <[email protected]> wrote:
> > > I very much believe in doing the simple thing first, and
> > > this is that,
> >
> > Leave out your syscalls (which might not be useful for
> > managed runtimes), and you actually have the simple thing :)
>
> Right, but the virt people could actually trivially use those,
> and vnuma doesn't have the scrambling issue outlined earlier
> since the guest kernel would also try to keep home-node
> affinity.
>
> Avi already said patching kvm would be like 5 minutes work.
These APIs also match what user-space numa daemons started doing
already.
> It also absolutely avoids the false sharing issue otherwise
> present with per-cpu memory, since you explicitly tell it
> where it belongs.
The grouping is also a natural extension to task and memory
affinities and groups in general.
It also allows us to turn auto-migration off by default, which
is a plus in my book. Without enough numbers I'm not convinced
that we really *want* auto-discovery turned on all the time, for
all workloads. The thing is, in practice most workloads that
matter are short-run and even trivial forms of CPU migration
doesnt ever happen for bursts of activity. We place them and
that's it.
Managed runtimes on the other hand can be expected to know about
and manage their locality - they do it anyway, by running guest
scheduler(s). So this patch-set gives them the ability to
express locality in a simple way, without the host kernel
scanning actively.
We can auto-scan on top of this, if the numbers support it, but
in the simple case where both the guest and the host are smart
then simply expressing locality and telling each other is vastly
superior to any scanning method.
Thanks,
Ingo
On Fri, 18 May 2012, tip-bot for Peter Zijlstra wrote:
> Commit-ID: 3a0fea961b98d1838f35dba51a639d40f4a5589f
> Gitweb: http://git.kernel.org/tip/3a0fea961b98d1838f35dba51a639d40f4a5589f
> Author: Peter Zijlstra <[email protected]>
> AuthorDate: Thu, 8 Mar 2012 17:56:08 +0100
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Fri, 18 May 2012 08:16:27 +0200
>
> sched/numa: Introduce sys_numa_{t,m}bind()
>
This depends on 931ea9d1a6e0 ("rcu: Implement per-domain single-threaded
call_srcu() state machine") from core/rcu.
* David Rientjes <[email protected]> wrote:
> On Fri, 18 May 2012, tip-bot for Peter Zijlstra wrote:
>
> > Commit-ID: 3a0fea961b98d1838f35dba51a639d40f4a5589f
> > Gitweb: http://git.kernel.org/tip/3a0fea961b98d1838f35dba51a639d40f4a5589f
> > Author: Peter Zijlstra <[email protected]>
> > AuthorDate: Thu, 8 Mar 2012 17:56:08 +0100
> > Committer: Ingo Molnar <[email protected]>
> > CommitDate: Fri, 18 May 2012 08:16:27 +0200
> >
> > sched/numa: Introduce sys_numa_{t,m}bind()
> >
>
> This depends on 931ea9d1a6e0 ("rcu: Implement per-domain
> single-threaded call_srcu() state machine") from core/rcu.
Indeed ...
I'll rebase it to a (by that time probably upstream) srcu commit
after the merge window, once we have more fixes, have
incorporated suggestions, etc. - but it's still essentially an
RFC topic: [*]
Fundamentally, do people agree with the 'single home node'
approach to begin with? We could turn it into a node mask,
but that complicates things.
For example if there's a hierarchy of nodes, low latency and
high latency ones, then it might be valid to limit to a high
level (high latency) node and not specify the lower level node -
while the kernel would still know about the lower level nodes as
well.
Managing locality in a non-trivial cache hierarchy is hard :-/
Thanks,
Ingo
[*] I should probably move this to the tip:RFC/sched/numa
branch, to make it clear what the status of the branch is,
from the commit notification emails.
On Mon, 21 May 2012, Ingo Molnar wrote:
> Fundamentally, do people agree with the 'single home node'
> approach to begin with? We could turn it into a node mask,
> but that complicates things.
>
I merged core/rcu locally to get a kernel that builds but ran into a
divide by zero:
[ 0.602181] divide error: 0000 [#1] SMP
[ 0.606159] CPU 0
[ 0.608003] Modules linked in:
[ 0.611266]
[ 0.612767] Pid: 1, comm: swapper/0 Not tainted 3.4.0 #1
[ 0.620912] RIP: 0010:[<ffffffff810af9ab>] [<ffffffff810af9ab>] update_sd_lb_stats+0x38b/0x740
[ 0.629630] RSP: 0018:ffff880275471870 EFLAGS: 00010056
[ 0.634927] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[ 0.642042] RDX: 0000000000000000 RSI: ffff880475969ca0 RDI: 0000000000000000
[ 0.649181] RBP: ffff880275471970 R08: ffff88027545ff60 R09: 0000000000000000
[ 0.656294] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000020
[ 0.663419] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 0.670559] FS: 0000000000000000(0000) GS:ffff88027fc00000(0000) knlGS:0000000000000000
[ 0.678633] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 0.684364] CR2: 0000000000000000 CR3: 000000000180b000 CR4: 00000000000007f0
[ 0.691499] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 0.698628] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 0.705755] Process swapper/0 (pid: 1, threadinfo ffff880275470000, task ffff88027545b800)
[ 0.714001] Stack:
[ 0.716021] 000000000000000b 0000000000012000 0000000000000000 ffff88087fd92000
[ 0.723455] ffff880275471b5c 00000000754718b8 fffffffffffffff8 ffff880275471ab0
[ 0.730884] 00000000003f5e37 00000000ffffffff ffff880275471988 ffff88027545ff60
[ 0.738320] Call Trace:
[ 0.740772] [<ffffffff810afda5>] find_busiest_group+0x45/0x480
[ 0.746692] [<ffffffff810b02da>] load_balance+0xfa/0x6e0
[ 0.752084] [<ffffffff810ae820>] ? active_load_balance_cpu_stop+0x1d0/0x1d0
[ 0.759127] [<ffffffff810b0c7c>] idle_balance+0xcc/0x130
[ 0.764529] [<ffffffff81530686>] __schedule+0x756/0x760
[ 0.769829] [<ffffffff81530749>] schedule+0x29/0x70
[ 0.774795] [<ffffffff8152ee45>] schedule_timeout+0x1b5/0x2e0
[ 0.780623] [<ffffffff8106986e>] ? physflat_send_IPI_mask+0xe/0x10
[ 0.786885] [<ffffffff810655e6>] ? native_smp_send_reschedule+0x46/0x60
[ 0.793584] [<ffffffff810a6fd0>] ? resched_task+0x60/0x70
[ 0.799052] [<ffffffff810a7565>] ? check_preempt_curr+0x75/0xa0
[ 0.805051] [<ffffffff8152fdb6>] wait_for_common+0xc6/0x160
[ 0.810716] [<ffffffff810aa420>] ? try_to_wake_up+0x2a0/0x2a0
[ 0.816541] [<ffffffff8152ff2d>] wait_for_completion+0x1d/0x20
[ 0.822461] [<ffffffff81099fe9>] kthread_create_on_node+0xa9/0x130
[ 0.828714] [<ffffffff8152ac86>] ? alternate_node_alloc+0x9d/0xa6
[ 0.834890] [<ffffffff81093cd0>] ? worker_maybe_bind_and_lock.isra.27+0xd0/0xd0
[ 0.842266] [<ffffffff81160201>] ? kmem_freepages.isra.41+0x91/0x150
[ 0.848701] [<ffffffff81095d2d>] __alloc_workqueue_key+0x1dd/0x3f0
[ 0.854963] [<ffffffff818c7d26>] ? sched_init_smp+0x300/0x30f
[ 0.860787] [<ffffffff818c9891>] cpuset_init_smp+0x40/0x50
[ 0.866356] [<ffffffff818b2c49>] kernel_init+0xb9/0x1ad
[ 0.871662] [<ffffffff8153a0d4>] kernel_thread_helper+0x4/0x10
[ 0.877573] [<ffffffff818b2b90>] ? start_kernel+0x37e/0x37e
[ 0.883216] [<ffffffff8153a0d0>] ? gs_change+0xb/0xb
On Mon, 21 May 2012, David Rientjes wrote:
> [ 0.602181] divide error: 0000 [#1] SMP
> [ 0.606159] CPU 0
> [ 0.608003] Modules linked in:
> [ 0.611266]
> [ 0.612767] Pid: 1, comm: swapper/0 Not tainted 3.4.0 #1
> [ 0.620912] RIP: 0010:[<ffffffff810af9ab>] [<ffffffff810af9ab>] update_sd_lb_stats+0x38b/0x740
This is
4ec4412e kernel/sched/fair.c 3876) if (local_group) {
bd939f45 kernel/sched/fair.c 3877) if (env->idle != CPU_NEWLY_IDLE) {
04f733b4 kernel/sched/fair.c 3878) if (balance_cpu != env->dst_cpu) {
4ec4412e kernel/sched/fair.c 3879) *balance = 0;
4ec4412e kernel/sched/fair.c 3880) return;
4ec4412e kernel/sched/fair.c 3881) }
bd939f45 kernel/sched/fair.c 3882) update_group_power(env->sd, env->dst_cpu);
4ec4412e kernel/sched/fair.c 3883) } else if (time_after_eq(jiffies, group->sgp->next_update))
bd939f45 kernel/sched/fair.c 3884) update_group_power(env->sd, env->dst_cpu);
1e3c88bd kernel/sched_fair.c 3885) }
1e3c88bd kernel/sched_fair.c 3886)
1e3c88bd kernel/sched_fair.c 3887) /* Adjust by relative CPU power of the group */
9c3f75cb kernel/sched_fair.c 3888) sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
the divide by group->sgp->power. This doesn't happen when reverting back
to sched/urgent at 30b4e9eb783d ("sched: Fix KVM and ia64 boot crash due
to sched_groups circular linked list assumption"). Let me know if you'd
like a bisect if the problem isn't immediately obvious.
On Mon, 2012-05-21 at 19:42 -0700, David Rientjes wrote:
> On Mon, 21 May 2012, David Rientjes wrote:
>
> > [ 0.602181] divide error: 0000 [#1] SMP
> > [ 0.606159] CPU 0
> > [ 0.608003] Modules linked in:
> > [ 0.611266]
> > [ 0.612767] Pid: 1, comm: swapper/0 Not tainted 3.4.0 #1
> > [ 0.620912] RIP: 0010:[<ffffffff810af9ab>] [<ffffffff810af9ab>] update_sd_lb_stats+0x38b/0x740
>
> This is
>
> 4ec4412e kernel/sched/fair.c 3876) if (local_group) {
> bd939f45 kernel/sched/fair.c 3877) if (env->idle != CPU_NEWLY_IDLE) {
> 04f733b4 kernel/sched/fair.c 3878) if (balance_cpu != env->dst_cpu) {
> 4ec4412e kernel/sched/fair.c 3879) *balance = 0;
> 4ec4412e kernel/sched/fair.c 3880) return;
> 4ec4412e kernel/sched/fair.c 3881) }
> bd939f45 kernel/sched/fair.c 3882) update_group_power(env->sd, env->dst_cpu);
> 4ec4412e kernel/sched/fair.c 3883) } else if (time_after_eq(jiffies, group->sgp->next_update))
> bd939f45 kernel/sched/fair.c 3884) update_group_power(env->sd, env->dst_cpu);
> 1e3c88bd kernel/sched_fair.c 3885) }
> 1e3c88bd kernel/sched_fair.c 3886)
> 1e3c88bd kernel/sched_fair.c 3887) /* Adjust by relative CPU power of the group */
> 9c3f75cb kernel/sched_fair.c 3888) sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
>
> the divide of group->sgp->power. This doesn't happen when reverting back
> to sched/urgent at 30b4e9eb783d ("sched: Fix KVM and ia64 boot crash due
> to sched_groups circular linked list assumption"). Let me know if you'd
> like a bisect if the problem isn't immediately obvious.
I'm fairly sure you'll hit cb83b629b with your bisect (I've got one more
report on this).
So the code in build_sched_domains() initializes the group->sgp->power
stuff through init_sched_groups_power(), which ends up calling
update_cpu_power() for every individual cpu and update_group_power() for
groups.
Now update_cpu_power() should ensure ->power isn't ever 0 -- it sets it
to 1 in that case -- and update_group_power() computes a straight sum of
powers which, all being assumed >0, should also result in >0.
Only after we initialize the power in build_sched_domains() do we
install the domains, so we should never hit the above.
Now clearly we do, so there's a hole somewhere.. let me carefully read
all that.
The below appears to contain a bug, not sure it's the one you're
triggering, but who knows. Lemme stare more.
---
Subject: sched: Make sure to not re-read variables after validation
We could re-read rq->rt_avg after we validated it was smaller than
total, invalidating the check and resulting in an unintended negative.
Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/sched/fair.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index de49ed5..54dca4d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3697,15 +3697,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
+
+ total = sched_avg_period() + (rq->clock - age_stamp);
- if (unlikely(total < rq->rt_avg)) {
+ if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
On Tue, 2012-05-22 at 14:04 +0200, Peter Zijlstra wrote:
> The below appears to contain a bug, not sure its the one you're
> triggering, but who knows. Lemme stare more.
OK, nevermind that patch (not that it isn't a good one), but I've found
the problem. build_overlap_sched_groups() is crummy.. just no idea how
to go about fixing it quite yet though.
/me goes kick his gray cells some more.
On Tue, 2012-05-22 at 17:00 +0200, Peter Zijlstra wrote:
> /me goes kick his gray cells some more.
OK, does the below make your machine happy? If not, please share the
complete topology setup of that thing (lscpu or similar).
---
Subject: sched: Fix SD_OVERLAP
SD_OVERLAP exists to allow overlapping groups; overlapping groups
appear in NUMA topologies that aren't fully connected.
The typical result of not fully connected NUMA is that each cpu (or
rather node) will have different spans for a particular distance.
However due to how sched domains are traversed -- only the first cpu
in the mask goes one level up -- the next level only cares about the
spans of the cpus that went up.
Due to this two things were observed to be broken:
- build_overlap_sched_groups() -- since it's possible the cpu we're
building the groups for exists in multiple (or all) groups, the
selection criteria of the first group didn't ensure there was a cpu
for which it was true that cpumask_first(span) == cpu. Thus load-
balancing would terminate.
- update_group_power() -- assumed that the cpu span of the first
group of the domain was covered by all groups of the child domain.
The above explains why this isn't true, so deal with it.
Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/sched/core.c | 7 +++++--
kernel/sched/fair.c | 25 ++++++++++++++++++++-----
2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18eed17..29217a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6007,11 +6007,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ sg->sgp = *per_cpu_ptr(sdd->sgp, i);
atomic_inc(&sg->sgp->ref);
- if (cpumask_test_cpu(cpu, sg_span))
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ cpumask_first(sg_span) == cpu) {
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
groups = sg;
+ }
if (!first)
first = sg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index de49ed5..7466b7a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3768,11 +3768,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
power = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg))
+ power += power_of(cpu);
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ power += group->sgp->power;
+ group = group->next;
+ } while (group != child->groups);
+ }
sdg->sgp->power = power;
}
On Wed, 23 May 2012, Peter Zijlstra wrote:
> On Tue, 2012-05-22 at 17:00 +0200, Peter Zijlstra wrote:
> > /me goes kick his gray cells some more.
>
> OK, does the below make your machine happy? If not, please share the
> complete topology setup of that thing (lscpu or similar).
>
Same divide by zero. I'd be happy to run a debugging patch if you can
come up with one.
$ grep -E 'processor|core|sibling|physical id|apicid|cpuid' /proc/cpuinfo | sed 's/processor/\nprocessor/'
processor : 0
physical id : 0
siblings : 4
core id : 0
cpu cores : 4
apicid : 4
initial apicid : 0
cpuid level : 5
processor : 1
physical id : 0
siblings : 4
core id : 1
cpu cores : 4
apicid : 5
initial apicid : 1
cpuid level : 5
processor : 2
physical id : 0
siblings : 4
core id : 2
cpu cores : 4
apicid : 6
initial apicid : 2
cpuid level : 5
processor : 3
physical id : 0
siblings : 4
core id : 3
cpu cores : 4
apicid : 7
initial apicid : 3
cpuid level : 5
processor : 4
physical id : 1
siblings : 4
core id : 0
cpu cores : 4
apicid : 8
initial apicid : 4
cpuid level : 5
processor : 5
physical id : 1
siblings : 4
core id : 1
cpu cores : 4
apicid : 9
initial apicid : 5
cpuid level : 5
processor : 6
physical id : 1
siblings : 4
core id : 2
cpu cores : 4
apicid : 10
initial apicid : 6
cpuid level : 5
processor : 7
physical id : 1
siblings : 4
core id : 3
cpu cores : 4
apicid : 11
initial apicid : 7
cpuid level : 5
processor : 8
physical id : 2
siblings : 4
core id : 0
cpu cores : 4
apicid : 12
initial apicid : 8
cpuid level : 5
processor : 9
physical id : 2
siblings : 4
core id : 1
cpu cores : 4
apicid : 13
initial apicid : 9
cpuid level : 5
processor : 10
physical id : 2
siblings : 4
core id : 2
cpu cores : 4
apicid : 14
initial apicid : 10
cpuid level : 5
processor : 11
physical id : 2
siblings : 4
core id : 3
cpu cores : 4
apicid : 15
initial apicid : 11
cpuid level : 5
processor : 12
physical id : 3
siblings : 4
core id : 0
cpu cores : 4
apicid : 16
initial apicid : 12
cpuid level : 5
processor : 13
physical id : 3
siblings : 4
core id : 1
cpu cores : 4
apicid : 17
initial apicid : 13
cpuid level : 5
processor : 14
physical id : 3
siblings : 4
core id : 2
cpu cores : 4
apicid : 18
initial apicid : 14
cpuid level : 5
processor : 15
physical id : 3
siblings : 4
core id : 3
cpu cores : 4
apicid : 19
initial apicid : 15
cpuid level : 5
On Wed, 2012-05-23 at 17:58 -0700, David Rientjes wrote:
> Same divide by zero. I'd be happy to run a debugging patch if you can
> come up with one.
>
> $ grep -E 'processor|core|sibling|physical id|apicid|cpuid' /proc/cpuinfo | sed 's/processor/\nprocessor/'
Curious, that looks like a 4-socket, 4-core machine without HT. Is this
some Core2-era Xeon setup or so?
What does the node distance table on that thing look like?
cat /sys/devices/system/node/node*/distance
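(For reference, with illustrative numbers rather than output from your
machine: a fully connected 4-node box prints a symmetric table with just two
distinct distances, e.g.

	10 20 20 20
	20 10 20 20
	20 20 10 20
	20 20 20 10

whereas a not-fully-connected topology has three or more distinct distances
per row, which is the kind of topology where the overlapping groups discussed
above show up.)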
Anyway, could you boot that machine with
CONFIG_SCHED_DEBUG
CONFIG_FTRACE
and the following added to the boot parameters:
"sched_debug debug ftrace_dump_on_oops ftrace=nop"
That should dump the ftrace buffer (to which the trace_printk() statements
go) to the console when it explodes.
If you could then send me the complete console output (privately if it's
too big).
NOTE this patch includes the previous patches so you should be able to
apply it to a clean tree.
---
arch/x86/mm/numa.c | 6 ++----
kernel/sched/core.c | 40 +++++++++++++++++++++++++++++++---------
kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++++++---------
lib/vsprintf.c | 5 +++++
4 files changed, 79 insertions(+), 22 deletions(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa0..3f16071 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -751,7 +751,6 @@ int early_cpu_to_node(int cpu)
void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
struct cpumask *mask;
- char buf[64];
if (node == NUMA_NO_NODE) {
/* early_cpu_to_node() already emits a warning and trace */
@@ -769,10 +768,9 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable)
else
cpumask_clear_cpu(cpu, mask);
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %pc\n",
enable ? "numa_add_cpu" : "numa_remove_cpu",
- cpu, node, buf);
+ cpu, node, mask);
return;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18eed17..eee020c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5537,9 +5537,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
- char str[256];
- cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5552,7 +5550,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s level %s\n", str, sd->name);
+ printk(KERN_CONT "span %pc level %s\n", sched_domain_span(sd), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5593,9 +5591,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
- cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
- printk(KERN_CONT " %s", str);
+ printk(KERN_CONT " %pc", sched_group_cpus(group));
if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
group->sgp->power);
@@ -6005,13 +6001,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
} else
cpumask_set_cpu(i, sg_span);
+ trace_printk(" group: cpu (%d) span (%pc)\n", cpu, sg_span);
+
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ sg->sgp = *per_cpu_ptr(sdd->sgp, i);
atomic_inc(&sg->sgp->ref);
- if (cpumask_test_cpu(cpu, sg_span))
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ cpumask_first(sg_span) == cpu) {
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
groups = sg;
+ }
if (!first)
first = sg;
@@ -6125,6 +6126,9 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sg = sg->next;
} while (sg != sd->groups);
+ trace_printk("groups init: cpu (%d) domain (%pc)\n", cpu,
+ sched_domain_span(sd));
+
if (cpu != group_first_cpu(sg))
return;
@@ -6421,6 +6425,7 @@ static void sched_init_numa(void)
sched_domains_numa_distance[level++] = next_distance;
sched_domains_numa_levels = level;
curr_distance = next_distance;
+ trace_printk("numa: found distance: %d\n", next_distance);
} else break;
}
/*
@@ -6446,7 +6451,7 @@ static void sched_init_numa(void)
return;
for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
if (!mask)
return;
@@ -6458,6 +6463,9 @@ static void sched_init_numa(void)
cpumask_or(mask, mask, cpumask_of_node(k));
}
+
+ trace_printk("numa: level (%d) node (%d) mask (%pc)\n",
+ i, j, mask);
}
}
@@ -6484,6 +6492,8 @@ static void sched_init_numa(void)
};
}
+ trace_printk("numa: %d levels of numa goodness added!\n", j);
+
sched_domain_topology = tl;
}
#else
@@ -6621,6 +6631,8 @@ static int build_sched_domains(const struct cpumask *cpu_map,
sd = NULL;
for (tl = sched_domain_topology; tl->init; tl++) {
sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ trace_printk("domain: cpu (%d) span (%pc)\n",
+ i, sched_domain_span(sd));
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
@@ -6636,6 +6648,8 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ struct sched_group *sg;
+
sd->span_weight = cpumask_weight(sched_domain_span(sd));
if (sd->flags & SD_OVERLAP) {
if (build_overlap_sched_groups(sd, i))
@@ -6644,6 +6658,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
if (build_sched_groups(sd, i))
goto error;
}
+
+ sg = sd->groups;
+ do {
+ trace_printk("groups: cpu (%d) domain (%pc) group (%pc)\n",
+ i, sched_domain_span(sd),
+ sched_group_cpus(sg));
+ sg = sg->next;
+ } while (sg != sd->groups);
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index de49ed5..77a48ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3697,15 +3697,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
- if (unlikely(total < rq->rt_avg)) {
+ total = sched_avg_period() + (rq->clock - age_stamp);
+
+ if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3763,18 +3770,43 @@ void update_group_power(struct sched_domain *sd, int cpu)
if (!child) {
update_cpu_power(sd, cpu);
+ trace_printk("power: cpu (%d) : %d\n", cpu, sdg->sgp->power);
return;
}
power = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ int i;
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(i, sched_group_cpus(sdg)) {
+ power += power_of(i);
+ trace_printk("power: cpu (%d) cpu (%d) inc (%ld) : %ld\n",
+ cpu, i, power_of(i), power);
+ }
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ power += group->sgp->power;
+ trace_printk("power: cpu (%d) group (%pc) inc (%d) : %ld\n",
+ cpu, sched_group_cpus(group),
+ group->sgp->power, power);
+ group = group->next;
+ } while (group != child->groups);
+ }
sdg->sgp->power = power;
+ trace_printk("power: cpu (%d) group (%pc) : %ld\n",
+ cpu, sched_group_cpus(sdg), power);
}
/*
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index abbabec..3b880ae 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -25,6 +25,7 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
+#include <linux/cpumask.h>
#include <net/addrconf.h>
#include <asm/page.h> /* for PAGE_SIZE */
@@ -857,6 +858,7 @@ int kptr_restrict __read_mostly;
* correctness of the format string and va_list arguments.
* - 'K' For a kernel pointer that should be hidden from unprivileged users
* - 'NF' For a netdev_features_t
+ * - 'c' For a cpumask list
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -941,6 +943,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return netdev_feature_string(buf, end, ptr, spec);
}
break;
+ case 'c':
+ return buf + cpulist_scnprintf(buf, end - buf, ptr);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
@@ -1175,6 +1179,7 @@ int format_decode(const char *fmt, struct printf_spec *spec)
* %pI6c print an IPv6 address as specified by RFC 5952
* %pU[bBlL] print a UUID/GUID in big or little endian using lower or upper
* case.
+ * %pc print a cpumask as comma-separated list
* %n is ignored
*
* The return value is the number of characters which would
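As a usage sketch of the %pc extension above (print_domain_span() is a
hypothetical helper; the real examples are the printk() conversions earlier
in this patch), a cpumask pointer can now be handed straight to printk()
instead of being formatted into a temporary buffer with cpulist_scnprintf()
first:

	static void print_domain_span(struct sched_domain *sd)
	{
		/* prints the span as a cpu list, e.g. "0-3,8-11" */
		printk(KERN_DEBUG "domain span: %pc\n", sched_domain_span(sd));
	}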
Commit-ID: b654f7de41b0e3903ee2b51d3b8db77fe52ce728
Gitweb: http://git.kernel.org/tip/b654f7de41b0e3903ee2b51d3b8db77fe52ce728
Author: Peter Zijlstra <[email protected]>
AuthorDate: Tue, 22 May 2012 14:04:28 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 30 May 2012 14:02:24 +0200
sched: Make sure to not re-read variables after validation
We could re-read rq->rt_avg after we validated it was smaller than
total, invalidating the check and resulting in an unintended negative.
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: David Rientjes <[email protected]>
Link: http://lkml.kernel.org/r/1337688268.9698.29.camel@twins
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 15 +++++++++++----
1 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f0380d4..2b449a7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
+
+ total = sched_avg_period() + (rq->clock - age_stamp);
- if (unlikely(total < rq->rt_avg)) {
+ if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
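The hunk above, reduced to the essential pattern in a standalone userspace
sketch (illustrative only, not additional kernel code): snapshot a value that
another thread may update, then do both the check and the arithmetic on that
snapshot, so rt_avg cannot grow between the comparison and the subtraction
and make the unsigned result wrap to the "unintended negative" the changelog
mentions.

	#include <stdint.h>

	volatile uint64_t rt_avg;		/* updated concurrently elsewhere */

	uint64_t scale(uint64_t total)
	{
		uint64_t avg = rt_avg;		/* single read, like ACCESS_ONCE() */

		if (total < avg)
			return 0;		/* checked against the snapshot */
		return total - avg;		/* same snapshot, no re-read */
	}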
Commit-ID: 74a5ce20e6eeeb3751340b390e7ac1d1d07bbf55
Gitweb: http://git.kernel.org/tip/74a5ce20e6eeeb3751340b390e7ac1d1d07bbf55
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 23 May 2012 18:00:43 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 30 May 2012 14:02:24 +0200
sched: Fix SD_OVERLAP
SD_OVERLAP exists to allow overlapping groups; overlapping groups
appear in NUMA topologies that aren't fully connected.
The typical result of not fully connected NUMA is that each cpu (or
rather node) will have different spans for a particular distance.
However, due to how sched domains are traversed -- only the first cpu
in the mask goes one level up -- the next level only cares about the
spans of the cpus that went up.
Due to this, two things were observed to be broken:
- build_overlap_sched_groups() -- since it's possible the cpu we're
building the groups for exists in multiple (or all) groups, the
selection criterion for the first group didn't ensure there was a cpu
for which it was true that cpumask_first(span) == cpu. Thus
load-balancing would terminate.
- update_group_power() -- assumed that the cpu span of the first
group of the domain was covered by all groups of the child domain.
The above explains why this isn't true, so deal with it.
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: David Rientjes <[email protected]>
Link: http://lkml.kernel.org/r/1337788843.9783.14.camel@laptop
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/core.c | 7 +++++--
kernel/sched/fair.c | 25 ++++++++++++++++++++-----
2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5573361..3a69374 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6030,11 +6030,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ sg->sgp = *per_cpu_ptr(sdd->sgp, i);
atomic_inc(&sg->sgp->ref);
- if (cpumask_test_cpu(cpu, sg_span))
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ cpumask_first(sg_span) == cpu) {
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
groups = sg;
+ }
if (!first)
first = sg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d1..f0380d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3574,11 +3574,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
power = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg))
+ power += power_of(cpu);
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ power += group->sgp->power;
+ group = group->next;
+ } while (group != child->groups);
+ }
sdg->sgp->power = power;
}
On Fri, 2012-05-25 at 10:35 +0200, Peter Zijlstra wrote:
> What does the node distance table on that thing look like?
The below makes it boot and I think it even balances right. But I'm not
happy with the patch, as I think it can be done more simply. The resulting
domain setup isn't minimal either.
---
Subject:
From: Peter Zijlstra <[email protected]>
Date: Thu May 31 14:47:33 CEST 2012
Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/sched.h | 11 ++++++++
kernel/sched/core.c | 64 +++++++++++++++++++++++++++++++++++++++++++-------
kernel/sched/fair.c | 5 ++-
kernel/sched/sched.h | 2 +
4 files changed, 72 insertions(+), 10 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -878,6 +878,8 @@ struct sched_group_power {
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
};
struct sched_group {
@@ -902,6 +904,15 @@ static inline struct cpumask *sched_grou
return to_cpumask(sg->cpumask);
}
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgp->cpumask);
+}
+
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6008,6 +6008,44 @@ struct sched_domain_topology_level {
struct sd_data data;
};
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -6026,6 +6064,12 @@ build_overlap_sched_groups(struct sched_
if (cpumask_test_cpu(i, covered))
continue;
+ child = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ continue;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(cpu));
@@ -6033,8 +6077,6 @@ build_overlap_sched_groups(struct sched_
goto fail;
sg_span = sched_group_cpus(sg);
-
- child = *per_cpu_ptr(sdd->sd, i);
if (child->child) {
child = child->child;
cpumask_copy(sg_span, sched_domain_span(child));
@@ -6044,13 +6086,18 @@ build_overlap_sched_groups(struct sched_
cpumask_or(covered, covered, sg_span);
sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- atomic_inc(&sg->sgp->ref);
+ if (atomic_inc_return(&sg->sgp->ref) == 1)
+ build_group_mask(sd, sg);
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- cpumask_first(sg_span) == cpu) {
- WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+ group_balance_cpu(sg) == cpu)
groups = sg;
- }
if (!first)
first = sg;
@@ -6123,6 +6170,7 @@ build_sched_groups(struct sched_domain *
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
+ cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6164,7 +6212,7 @@ static void init_sched_groups_power(int
sg = sg->next;
} while (sg != sd->groups);
- if (cpu != group_first_cpu(sg))
+ if (cpu != group_balance_cpu(sg))
return;
update_group_power(sd, cpu);
@@ -6572,7 +6620,7 @@ static int __sdt_alloc(const struct cpum
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power),
+ sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgp)
return -ENOMEM;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3846,7 +3846,7 @@ static inline void update_sg_lb_stats(st
int i;
if (local_group)
- balance_cpu = group_first_cpu(group);
+ balance_cpu = group_balance_cpu(group);
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
@@ -3861,7 +3861,8 @@ static inline void update_sg_lb_stats(st
/* Bias balancing toward cpus of our domain */
if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
+ if (idle_cpu(i) && !first_idle_cpu &&
+ cpumask_test_cpu(i, sched_group_mask(group))) {
first_idle_cpu = 1;
balance_cpu = i;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -542,6 +542,8 @@ DECLARE_PER_CPU(struct sched_domain *, s
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_node);
+extern int group_balance_cpu(struct sched_group *sg);
+
#endif /* CONFIG_SMP */
#include "stats.h"
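A usage sketch of group_balance_cpu() (maybe_update_power() is a hypothetical
wrapper; the real caller is the init_sched_groups_power() hunk above): at
each domain level only the canonical balance cpu, i.e. the first cpu that is
both in the group's span and in its iteration mask, proceeds with the
per-group work; every other cpu returns early.

	static void maybe_update_power(struct sched_domain *sd, int cpu)
	{
		if (cpu != group_balance_cpu(sd->groups))
			return;			/* another cpu owns this group */

		update_group_power(sd, cpu);	/* only the balance cpu gets here */
	}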