BLD patch for Linux-3.18. Changes since previous release:
* Contains changes to address issue at the time of wakeup
of rt tasks, it was reported by Peter Junos <[email protected]>.
BLD has some positive impact on vmlinux size too, the following shows
the picture:
$ cat size.mainline
text data bss dec hex filename
12769041 2056008 11722752 26547801 1951659 vmlinux
$ cat size.bld
text data bss dec hex filename
12755462 2056040 11722752 26534254 194e16e vmlinux
and the config could be found here:
https://raw.githubusercontent.com/rmullick/bld-patches/master/config.benchmark-3.17
After previous release, Mike Galbraith shows that on systems BLD can
reduce throughput significantly, due to L2 misses and where no L3 is
available, that issue is yet to address, I lack those kind of systems,
so it might take some time.
Thanks,
Rakib
---
diff --git a/init/Kconfig b/init/Kconfig
index 2081a4d..becfd85 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -36,6 +36,15 @@ config BROKEN_ON_SMP
depends on BROKEN || !SMP
default y
+config BLD
+ bool "An alternate CPU load distribution technique for task scheduler"
+ depends on SMP
+ default y
+ help
+ This is an alternate CPU load distribution technique based for task
+ scheduler based on The Barbershop Load Distribution algorithm. Not
+ suitable for NUMA, should work well on SMP.
+
config INIT_ENV_ARG_LIMIT
int
default 32 if !UML
diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h
new file mode 100644
index 0000000..f1f9fba
--- /dev/null
+++ b/kernel/sched/bld.h
@@ -0,0 +1,215 @@
+#ifdef CONFIG_BLD
+
+static DEFINE_RWLOCK(rt_list_lock);
+static LIST_HEAD(rt_rq_head);
+static LIST_HEAD(cfs_rq_head);
+static DEFINE_RWLOCK(cfs_list_lock);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+#else
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+ return rt_rq->rq;
+}
+#else
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+ return container_of(rt_rq, struct rq, rt);
+}
+#endif
+
+static int select_cpu_for_wakeup(int task_type, struct cpumask *mask)
+{
+ int cpu = smp_processor_id(), i;
+ unsigned long load, varload;
+ struct rq *rq;
+
+ if (task_type) {
+ varload = ULONG_MAX;
+ for_each_cpu(i, mask) {
+ rq = cpu_rq(i);
+ load = rq->cfs.load.weight;
+ if (load < varload) {
+ varload = load;
+ cpu = i;
+ }
+ }
+ } else {
+ /* Here's an attempt to get a CPU within the mask where
+ * we can preempt easily. To achieve this we tried to
+ * maintain a lowbit, which indicate the lowest bit set on
+ * array bitmap. Since all CPUs contains high priority
+ * kernel threads therefore we eliminate 0, so it might not
+ * be right every time, but it's just an indicator.
+ */
+ varload = 1;
+
+ for_each_cpu(i, mask) {
+ rq = cpu_rq(i);
+ load = rq->rt.lowbit;
+ if (load >= varload) {
+ varload = load;
+ cpu = i;
+ }
+ }
+ }
+
+ return cpu;
+}
+
+static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags)
+{
+ struct cfs_rq *cfs;
+ unsigned long flags;
+ unsigned int cpu = smp_processor_id();
+
+ read_lock_irqsave(&cfs_list_lock, flags);
+ list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) {
+ cpu = cpu_of(rq_of_cfs(cfs));
+ if (cpu_online(cpu))
+ break;
+ }
+ read_unlock_irqrestore(&cfs_list_lock, flags);
+ return cpu;
+}
+
+static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags)
+{
+ struct rt_rq *rt;
+ unsigned long flags;
+ unsigned int cpu = smp_processor_id();
+
+ read_lock_irqsave(&rt_list_lock, flags);
+ list_for_each_entry(rt, &rt_rq_head, bld_rt_list) {
+ cpu = cpu_of(rq_of_rt(rt));
+ if (cpu_online(cpu))
+ break;
+ }
+ read_unlock_irqrestore(&rt_list_lock, flags);
+ return cpu;
+}
+
+static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags)
+{
+ unsigned int cpu = smp_processor_id(), want_affine = 0;
+ struct cpumask *tmpmask;
+
+ if (p->nr_cpus_allowed == 1)
+ return task_cpu(p);
+
+ if (sd_flags & SD_BALANCE_WAKE) {
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ want_affine = 1;
+ }
+ }
+
+ if (want_affine)
+ tmpmask = tsk_cpus_allowed(p);
+ else
+ tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd);
+
+ if (rt_task(p))
+ cpu = select_cpu_for_wakeup(0, tmpmask);
+ else
+ cpu = select_cpu_for_wakeup(1, tmpmask);
+
+ return cpu;
+}
+
+static void track_load_rt(struct rq *rq, struct task_struct *p)
+{
+ unsigned long flag;
+ int firstbit;
+ struct rt_rq *first;
+ struct rt_prio_array *array = &rq->rt.active;
+
+ first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list);
+ firstbit = sched_find_first_bit(array->bitmap);
+
+ /* Maintaining rt.lowbit */
+ if (firstbit > 0 && firstbit <= rq->rt.lowbit)
+ rq->rt.lowbit = firstbit;
+
+ if (rq->rt.lowbit < first->lowbit) {
+ write_lock_irqsave(&rt_list_lock, flag);
+ list_del(&rq->rt.bld_rt_list);
+ list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head);
+ write_unlock_irqrestore(&rt_list_lock, flag);
+ }
+}
+
+static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags)
+{
+ unsigned int cpu;
+
+ if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1)))
+ cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags);
+ else {
+ if (rt_task(p))
+ cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags);
+ else
+ cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags);
+ }
+
+ return cpu;
+}
+
+static void bld_track_load_activate(struct rq *rq, struct task_struct *p)
+{
+ unsigned long flag;
+ if (rt_task(p)) {
+ track_load_rt(rq, p);
+ } else {
+ if (rq->cfs.pos != 2) {
+ struct cfs_rq *last;
+ last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list);
+ if (rq->cfs.load.weight >= last->load.weight) {
+ write_lock_irqsave(&cfs_list_lock, flag);
+ list_del(&rq->cfs.bld_cfs_list);
+ list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+ rq->cfs.pos = 2; last->pos = 1;
+ write_unlock_irqrestore(&cfs_list_lock, flag);
+ }
+ }
+ }
+}
+
+static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p)
+{
+ unsigned long flag;
+ if (rt_task(p)) {
+ track_load_rt(rq, p);
+ } else {
+ if (rq->cfs.pos != 0) {
+ struct cfs_rq *first;
+ first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list);
+ if (rq->cfs.load.weight <= first->load.weight) {
+ write_lock_irqsave(&cfs_list_lock, flag);
+ list_del(&rq->cfs.bld_cfs_list);
+ list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+ rq->cfs.pos = 0; first->pos = 1;
+ write_unlock_irqrestore(&cfs_list_lock, flag);
+ }
+ }
+ }
+}
+#else
+static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p)
+{
+}
+#endif /* CONFIG_BLD */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 89e7283..bd702c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -24,6 +24,8 @@
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
* Thomas Gleixner, Mike Kravetz
+ * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate
+ * CPU load distribution technique for kernel scheduler by Rakib Mullick.
*/
#include <linux/mm.h>
@@ -86,6 +88,7 @@
#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
+#include "bld.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -840,6 +843,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ if (!dl_task(p))
+ bld_track_load_activate(rq, p);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -847,6 +852,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
+ if (!dl_task(p))
+ bld_track_load_deactivate(rq, p);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1407,7 +1414,14 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
+#ifndef CONFIG_BLD
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+#else
+ if (dl_task(p))
+ cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags);
+ else
+ cpu = bld_get_cpu(p, sd_flags, wake_flags);
+#endif
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1577,7 +1591,11 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
+#ifndef CONFIG_BLD
if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+#else
+ if (llist_empty(&this_rq()->wake_list))
+#endif
return;
/*
@@ -1599,13 +1617,16 @@ void scheduler_ipi(void)
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
+#ifndef CONFIG_BLD
if (unlikely(got_nohz_idle_kick())) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
+#endif
irq_exit();
}
+#ifndef CONFIG_BLD
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -1618,6 +1639,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
}
}
+#endif
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
+
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -1636,18 +1664,13 @@ void wake_up_if_idle(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
}
-
-bool cpus_share_cache(int this_cpu, int that_cpu)
-{
- return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
-}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu)
{
struct rq *rq = cpu_rq(cpu);
-#if defined(CONFIG_SMP)
+#if defined(CONFIG_SMP) && !defined(CONFIG_BLD)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
@@ -1966,7 +1989,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ __set_task_cpu(p, cpu);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2451,7 +2474,14 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifndef CONFIG_BLD
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+#else
+ if (dl_task(p))
+ dest_cpu = task_cpu(p);
+ else
+ dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0);
+#endif
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2539,8 +2569,10 @@ void scheduler_tick(void)
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
+#ifndef CONFIG_BLD
trigger_load_balance(rq);
#endif
+#endif
rq_last_tick_reset(rq);
}
@@ -7126,6 +7158,15 @@ void __init sched_init(void)
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
+#ifdef CONFIG_BLD
+ INIT_LIST_HEAD(&rq->cfs.bld_cfs_list);
+ list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+ rq->cfs.pos = 0;
+
+ INIT_LIST_HEAD(&rq->rt.bld_rt_list);
+ list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head);
+ rq->rt.lowbit = INT_MAX;
+#endif
}
set_load_weight(&init_task);
@@ -7166,6 +7207,9 @@ void __init sched_init(void)
init_sched_fair_class();
scheduler_running = 1;
+#ifdef CONFIG_BLD
+ printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n");
+#endif
}
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef2b104..ea45f76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4164,6 +4164,7 @@ static void task_waking_fair(struct task_struct *p)
record_wakee(p);
}
+#ifndef CONFIG_BLD
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
@@ -4619,6 +4620,7 @@ unlock:
return new_cpu;
}
+#endif /* CONFIG_BLD */
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
@@ -4914,6 +4916,7 @@ simple:
return p;
idle:
+#ifndef CONFIG_BLD
new_tasks = idle_balance(rq);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
@@ -4925,7 +4928,7 @@ idle:
if (new_tasks > 0)
goto again;
-
+#endif
return NULL;
}
@@ -7107,12 +7110,39 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
+#ifndef CONFIG_BLD
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
+static inline void nohz_balance_exit_idle(int cpu)
+{
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+ /*
+ * Completely isolated CPUs don't ever set, so we must test.
+ */
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
+ clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+ }
+}
+
+static int sched_ilb_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DYING:
+ nohz_balance_exit_idle(smp_processor_id());
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
static inline int find_new_ilb(void)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -7150,20 +7180,7 @@ static void nohz_balancer_kick(void)
smp_send_reschedule(ilb_cpu);
return;
}
-
-static inline void nohz_balance_exit_idle(int cpu)
-{
- if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- /*
- * Completely isolated CPUs don't ever set, so we must test.
- */
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
- clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- }
-}
+#endif /* CONFIG_BLD */
static inline void set_cpu_sd_state_busy(void)
{
@@ -7205,6 +7222,7 @@ unlock:
*/
void nohz_balance_enter_idle(int cpu)
{
+#ifndef CONFIG_BLD
/*
* If this cpu is going down, then nothing needs to be done.
*/
@@ -7223,23 +7241,10 @@ void nohz_balance_enter_idle(int cpu)
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-}
-
-static int sched_ilb_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DYING:
- nohz_balance_exit_idle(smp_processor_id());
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
+#endif
}
#endif
-static DEFINE_SPINLOCK(balancing);
-
/*
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
@@ -7249,6 +7254,9 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
+#ifndef CONFIG_BLD
+static DEFINE_SPINLOCK(balancing);
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -7497,6 +7505,7 @@ void trigger_load_balance(struct rq *rq)
nohz_balancer_kick();
#endif
}
+#endif /* CONFIG_BLD */
static void rq_online_fair(struct rq *rq)
{
@@ -7942,7 +7951,9 @@ const struct sched_class fair_sched_class = {
.put_prev_task = put_prev_task_fair,
#ifdef CONFIG_SMP
+#ifndef CONFIG_BLD
.select_task_rq = select_task_rq_fair,
+#endif
.migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
@@ -7982,6 +7993,7 @@ void print_cfs_stats(struct seq_file *m, int cpu)
__init void init_sched_fair_class(void)
{
+#ifndef CONFIG_BLD
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
@@ -7991,5 +8003,5 @@ __init void init_sched_fair_class(void)
cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
-
+#endif /* BLD */
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 20bca39..03a0ed3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
+#ifndef CONFIG_BLD
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
@@ -1348,6 +1349,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
out:
return cpu;
}
+#endif /* CONFIG_BLD */
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
@@ -2111,7 +2113,9 @@ const struct sched_class rt_sched_class = {
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
+#ifndef CONFIG_BLD
.select_task_rq = select_task_rq_rt,
+#endif
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2df8ef0..79fce51 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -366,9 +366,8 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
@@ -392,6 +391,11 @@ struct cfs_rq {
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_BLD
+ struct list_head bld_cfs_list;
+ char pos;
+#endif
};
static inline int rt_bandwidth_enabled(void)
@@ -425,12 +429,16 @@ struct rt_rq {
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
+ struct rq *rq;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
- struct rq *rq;
struct task_group *tg;
#endif
+#ifdef CONFIG_BLD
+ struct list_head bld_rt_list;
+ int lowbit;
+#endif
};
/* Deadline class' related fields in a runqueue */