Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751760AbdGaTfJ (ORCPT ); Mon, 31 Jul 2017 15:35:09 -0400 Received: from shelob.surriel.com ([96.67.55.147]:45750 "EHLO shelob.surriel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751013AbdGaTfF (ORCPT ); Mon, 31 Jul 2017 15:35:05 -0400 From: riel@redhat.com To: linux-kernel@vger.kernel.org Cc: peterz@infradead.org, mgorman@suse.de, mingo@kernel.org, jhladky@redhat.com, lvenanci@redhat.com Subject: [RHEL-ALT-7.4 PATCH 2/2] sched,numa: scale scan period with tasks in group and shared/private Date: Mon, 31 Jul 2017 15:28:47 -0400 Message-Id: <20170731192847.23050-3-riel@redhat.com> X-Mailer: git-send-email 2.9.4 In-Reply-To: <20170731192847.23050-1-riel@redhat.com> References: <20170731192847.23050-1-riel@redhat.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 5680 Lines: 189 From: Rik van Riel <riel@redhat.com> Running 80 tasks in the same group, or as threads of the same process, results in the memory getting scanned 80x as fast as it would be if a single task were using the memory. This really hurts some workloads. Scale the scan period by the number of tasks in the NUMA group, and the shared / private ratio, so the average rate at which memory in the group is scanned corresponds roughly to the rate at which a single task would scan its memory. 
Signed-off-by: Rik van Riel <riel@redhat.com> --- kernel/sched/fair.c | 111 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 86 insertions(+), 25 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8481b24a834..ca3b1e8e21b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1076,6 +1076,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + int active_nodes; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long max_faults_cpu; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +static inline unsigned long group_faults_priv(struct numa_group *ng); +static inline unsigned long group_faults_shared(struct numa_group *ng); + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -1112,13 +1135,47 @@ static unsigned int task_scan_min(struct task_struct *p) return max_t(unsigned int, floor, scan); } +static unsigned int task_scan_start(struct task_struct *p) +{ + unsigned long smin = task_scan_min(p); + unsigned long period = smin; + + /* Scale the initial scan period with the amount of shared memory. 
*/ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + } + + return max(smin, period); +} + static unsigned int task_scan_max(struct task_struct *p) { - unsigned int smin = task_scan_min(p); - unsigned int smax; + unsigned long smin = task_scan_min(p); + unsigned long smax; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + unsigned long period = smax; + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + + smax = max(smax, period); + } + return max(smin, smax); } @@ -1134,26 +1191,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - int active_nodes; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long max_faults_cpu; - /* - * Faults_cpu is used to decide whether memory should move - * towards the CPU. As a consequence, these stats are weighted - * more by CPU use than by memory faults. - */ - unsigned long *faults_cpu; - unsigned long faults[0]; -}; - /* Shared or private faults. 
*/ #define NR_NUMA_HINT_FAULT_TYPES 2 @@ -1203,6 +1240,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +static inline unsigned long group_faults_priv(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + + return faults; +} + +static inline unsigned long group_faults_shared(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; + } + + return faults; +} + /* * A node triggering more than 1/3 as many NUMA faults as the maximum is * considered part of a numa group's pseudo-interleaving set. Migrations @@ -1813,7 +1874,7 @@ static int task_numa_migrate(struct task_struct *p) * Reset the scan period if the task is being rescheduled on an * alternative node to recheck if the tasks is now properly placed. */ - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); @@ -2464,7 +2525,7 @@ void task_numa_work(struct callback_head *work) if (p->numa_scan_period == 0) { p->numa_scan_period_max = task_scan_max(p); - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); } next_scan = now + msecs_to_jiffies(p->numa_scan_period); @@ -2591,7 +2652,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now > curr->node_stamp + period) { if (!curr->node_stamp) - curr->numa_scan_period = task_scan_min(curr); + curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { -- 2.9.4