Subject: Re: [PATCH RESEND] sched: prefer an idle cpu vs an idle sibling for BALANCE_WAKE
From: Mike Galbraith
To: Josef Bacik
Cc: Peter Zijlstra, riel@redhat.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
 morten.rasmussen@arm.com, kernel-team
Date: Sat, 04 Jul 2015 17:57:42 +0200
Message-ID: <1436025462.17152.37.camel@gmail.com>
In-Reply-To: <1435905658.6418.52.camel@gmail.com>
References: <1432761736-22093-1-git-send-email-jbacik@fb.com>
 <20150528102127.GD3644@twins.programming.kicks-ass.net>
 <20150528110514.GR18673@twins.programming.kicks-ass.net>
 <1434087305.3674.26.camel@gmail.com>
 <5581B70D.2000800@fb.com>
 <1434588939.3444.25.camel@gmail.com>
 <55823F33.7040005@fb.com>
 <1434600765.3393.9.camel@gmail.com>
 <55957871.7080906@fb.com>
 <1435905658.6418.52.camel@gmail.com>

On Fri, 2015-07-03 at 08:40 +0200, Mike Galbraith wrote:
> Hm.  Seems what this load should like best is if we detect 1:N, skip all
> of the routine gyrations, ie move the N (workers) infrequently, expend
> search cycles frequently only on the 1 (dispatch).
>
> Ponder..

Since it was too hot to do outside chores (any excuse will do ;)...

If we're (read /me) on track, the below should help.  Per my tracing, it
may want a wee bit of toning down actually, though when I trace virgin
source I expect to see the same, namely Xorg and friends having
"wide-load" tattooed across their hindquarters earlier than they should.
It doesn't seem to hurt anything, but then demolishing a single llc box
is a tad more difficult than demolishing a NUMA box.

sched: beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU.  While looking at wake_wide(),
I noticed that it doesn't pay attention to the wakeup of the 1:N waker,
returning 1 only when waking one of its N minions.

Correct that, and give the user the option to do an expensive balance
IFF select_idle_sibling() doesn't find an idle CPU, and IFF the wakee
is the 1:N dispatcher of work, thus worth some extra effort.

Not-Signed-off-by: Mike Galbraith
---
 kernel/sched/fair.c     |   89 +++++++++++++++++++++++++-----------------------
 kernel/sched/features.h |    6 +++
 2 files changed, 54 insertions(+), 41 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -666,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *c
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int cpu, void *clear);
 static unsigned long task_h_load(struct task_struct *p);
 
 static inline void __update_task_entity_contrib(struct sched_entity *se);
 
@@ -1375,7 +1375,7 @@ static void task_numa_compare(struct tas
         * Call select_idle_sibling to maybe find a better one.
         */
        if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu, NULL);
 
 assign:
        task_numa_assign(env, cur, imp);
@@ -4730,26 +4730,30 @@ static long effective_load(struct task_g
 
 #endif
 
+/*
+ * Detect 1:N waker/wakee relationship via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.  In order
+ * to determine whether we should let the load spread vs consolidating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.  With
+ * both conditions met, we can be relatively sure that we are seeing a 1:N
+ * relationship, and that load size exceeds socket size.
+ */
 static int wake_wide(struct task_struct *p)
 {
-       int factor = this_cpu_read(sd_llc_size);
-
-       /*
-        * Yeah, it's the switching-frequency, could means many wakee or
-        * rapidly switch, use factor here will just help to automatically
-        * adjust the loose-degree, so bigger node will lead to more pull.
-        */
-       if (p->wakee_flips > factor) {
-               /*
-                * wakee is somewhat hot, it needs certain amount of cpu
-                * resource, so if waker is far more hot, prefer to leave
-                * it alone.
-                */
-               if (current->wakee_flips > (factor * p->wakee_flips))
-                       return 1;
+       unsigned long waker_flips = current->wakee_flips;
+       unsigned long wakee_flips = p->wakee_flips;
+       int factor = this_cpu_read(sd_llc_size), ret = 1;
+
+       if (waker_flips < wakee_flips) {
+               swap(waker_flips, wakee_flips);
+               /* Tell the caller that we're waking a 1:N waker */
+               ret += sched_feat(WAKE_WIDE_BALANCE);
        }
-
-       return 0;
+       if (wakee_flips < factor || waker_flips < wakee_flips * factor)
+               return 0;
+       return ret;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4765,6 @@ static int wake_affine(struct sched_doma
        unsigned long weight;
        int balanced;
 
-       /*
-        * If we wake multiple tasks be careful to not bounce
-        * ourselves around too much.
-        */
-       if (wake_wide(p))
-               return 0;
-
        idx       = sd->wake_idx;
        this_cpu  = smp_processor_id();
        prev_cpu  = task_cpu(p);
@@ -4935,20 +4932,22 @@ find_idlest_cpu(struct sched_group *grou
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int target, void *clear)
 {
        struct sched_domain *sd;
        struct sched_group *sg;
        int i = task_cpu(p);
 
        if (idle_cpu(target))
-               return target;
+               goto done;
 
        /*
         * If the prevous cpu is cache affine and idle, don't be stupid.
         */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+       if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
+               target = i;
+               goto done;
+       }
 
        /*
         * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -4973,7 +4972,11 @@ static int select_idle_sibling(struct ta
                        sg = sg->next;
                } while (sg != sd->groups);
        }
+       return target;
 done:
+       if (clear)
+               *(void **)clear = 0;
+
        return target;
 }
 /*
@@ -5021,14 +5024,19 @@ select_task_rq_fair(struct task_struct *
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
-       int new_cpu = cpu;
-       int want_affine = 0;
+       int new_cpu = prev_cpu;
+       int want_affine = 0, want_balance = 0;
        int sync = wake_flags & WF_SYNC;
 
-       if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
-
        rcu_read_lock();
+       if (sd_flag & SD_BALANCE_WAKE) {
+               want_affine = wake_wide(p);
+               want_balance = want_affine > 1;
+               want_affine = !want_affine && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               if (!want_affine && !want_balance)
+                       goto select;
+       }
+
        for_each_domain(cpu, tmp) {
                if (!(tmp->flags & SD_LOAD_BALANCE))
                        continue;
@@ -5043,23 +5051,23 @@ select_task_rq_fair(struct task_struct *
                        break;
                }
 
-               if (tmp->flags & sd_flag)
+               if (tmp->flags & sd_flag || want_balance)
                        sd = tmp;
        }
 
        if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-               prev_cpu = cpu;
+               new_cpu = cpu;
 
        if (sd_flag & SD_BALANCE_WAKE) {
-               new_cpu = select_idle_sibling(p, prev_cpu);
-               goto unlock;
+select:
+               new_cpu = select_idle_sibling(p, new_cpu, &sd);
        }
 
        while (sd) {
                struct sched_group *group;
                int weight;
 
-               if (!(sd->flags & sd_flag)) {
+               if (!(sd->flags & sd_flag) && !want_balance) {
                        sd = sd->child;
                        continue;
                }
@@ -5089,7 +5097,6 @@ select_task_rq_fair(struct task_struct *
                }
                /* while loop will break here if sd == NULL */
        }
-unlock:
        rcu_read_unlock();
 
        return new_cpu;

--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -96,3 +96,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
+
+/*
+ * Perform expensive full wake balance for 1:N wakers when the
+ * selected cpu is not completely idle.
+ */
+SCHED_FEAT(WAKE_WIDE_BALANCE, false)
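
P.S. for anyone who wants to play with the flip heuristic outside the kernel:
a quick userspace sketch of the check above.  The llc_size, the flip counts
and the wake_wide_balance variable (standing in for sched_feat(WAKE_WIDE_BALANCE))
are all made-up illustration values, not anything traced.

/* Userspace model of the wake_wide() flip-frequency check, illustrative only. */
#include <stdio.h>

static int wake_wide_balance = 1;	/* stand-in for sched_feat(WAKE_WIDE_BALANCE) */

static void swap_ul(unsigned long *a, unsigned long *b)
{
	unsigned long t = *a; *a = *b; *b = t;
}

/* 0: stay affine, 1: wake wide, 2: wide and we're waking the 1:N dispatcher */
static int wake_wide(unsigned long waker_flips, unsigned long wakee_flips,
		     unsigned long llc_size)
{
	int ret = 1;

	if (waker_flips < wakee_flips) {
		/* waking someone who flips more than we do: the 1:N waker */
		swap_ul(&waker_flips, &wakee_flips);
		ret += wake_wide_balance;
	}
	if (wakee_flips < llc_size || waker_flips < wakee_flips * llc_size)
		return 0;
	return ret;
}

int main(void)
{
	unsigned long llc = 8;	/* pretend llc_size == 8 */

	/* dispatcher waking a worker: dispatcher flips huge, worker flips low */
	printf("worker wakeup   -> %d\n", wake_wide(1000, 10, llc));
	/* worker waking the dispatcher: flips get swapped, balance bit added */
	printf("dispatch wakeup -> %d\n", wake_wide(10, 1000, llc));
	/* two chatty 1:1 buddies with similar flip rates stay affine */
	printf("1:1 buddies     -> %d\n", wake_wide(12, 10, llc));
	return 0;
}

With those numbers, waking a minion returns 1 (go wide), waking the dispatcher
returns 2 (go wide and do the full balance if select_idle_sibling() comes up
empty), and 1:1 buddies return 0 (stay affine).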