Commit-ID: f27dde8deef33c9e58027df11ceab2198601d6a6
Gitweb: http://git.kernel.org/tip/f27dde8deef33c9e58027df11ceab2198601d6a6
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 14 Aug 2013 14:55:31 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 25 Sep 2013 14:07:49 +0200
sched: Add NEED_RESCHED to the preempt_count
In order to combine the preemption and need_resched test we need to
fold the need_resched information into the preempt_count value.
Since the NEED_RESCHED flag is set across CPUs this needs to be an
atomic operation, however we very much want to avoid making
preempt_count atomic, therefore we keep the existing TIF_NEED_RESCHED
infrastructure in place but at 3 sites test it and fold its value into
preempt_count; namely:
- resched_task() when setting TIF_NEED_RESCHED on the current task
- scheduler_ipi() when resched_task() sets TIF_NEED_RESCHED on a
  remote task; it follows that up with a reschedule IPI and we can
  modify the CPU-local preempt_count from there.
- cpu_idle_loop() for when resched_task() found tsk_is_polling().
We use an inverted bitmask to indicate need_resched so that a 0 means
both need_resched and !atomic.
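As an illustration -- a minimal userspace sketch, not part of this patch
and with made-up names -- of why the inverted bit lets a single
decrement-and-test cover both "preempt count is zero" and "need to
reschedule":

#include <stdbool.h>
#include <stdio.h>

#define DEMO_NEED_RESCHED 0x80000000u

/* stored inverted: bit set means "no resched needed" */
static unsigned int demo_count = DEMO_NEED_RESCHED; /* preemptible, idle */

static void demo_set_need_resched(void)   { demo_count &= ~DEMO_NEED_RESCHED; }
static void demo_clear_need_resched(void) { demo_count |=  DEMO_NEED_RESCHED; }
static void demo_preempt_disable(void)    { demo_count++; }

static bool demo_preempt_enable(void)
{
	/* hits zero iff the count dropped to 0 AND need_resched is set */
	return --demo_count == 0;
}

int main(void)
{
	demo_preempt_disable();
	demo_set_need_resched();
	printf("%d\n", demo_preempt_enable()); /* 1: should reschedule */

	demo_preempt_disable();
	demo_clear_need_resched();
	printf("%d\n", demo_preempt_enable()); /* 0: nothing to do */
	return 0;
}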
Also remove the barrier() in preempt_enable() between
preempt_enable_no_resched() and preempt_check_resched() to avoid
having to reload the preemption value and allow the compiler to use
the flags of the previous decrement. I couldn't come up with any sane
reason for this barrier() to be there as preempt_enable_no_resched()
already has a barrier() before doing the decrement.
Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/preempt.h | 47 ++++++++++++++++++++++++++++++++++++++++++-----
include/linux/sched.h | 7 +++++--
kernel/cpu/idle.c | 7 +++++++
kernel/sched/core.c | 20 +++++++++++++++-----
4 files changed, 69 insertions(+), 12 deletions(-)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index eaac52a..92e3418 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,9 +10,19 @@
#include <linux/linkage.h>
#include <linux/list.h>
+/*
+ * We use the MSB mostly because it's available; see <linux/preempt_mask.h> for
+ * the other bits -- can't include that header due to inclusion hell.
+ */
+#define PREEMPT_NEED_RESCHED 0x80000000
+
+/*
+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
+ * that think a non-zero value indicates we cannot preempt.
+ */
static __always_inline int preempt_count(void)
{
- return current_thread_info()->preempt_count;
+ return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
}
static __always_inline int *preempt_count_ptr(void)
@@ -20,11 +30,40 @@ static __always_inline int *preempt_count_ptr(void)
return &current_thread_info()->preempt_count;
}
+/*
+ * We now lose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
+ * alternative is losing a reschedule. Better schedule too often -- also this
+ * should be a very rare operation.
+ */
static __always_inline void preempt_count_set(int pc)
{
*preempt_count_ptr() = pc;
}
+/*
+ * We fold the NEED_RESCHED bit into the preempt count such that
+ * preempt_enable() can decrement and test for needing to reschedule with a
+ * single instruction.
+ *
+ * We invert the actual bit, so that when the decrement hits 0 we know we both
+ * need to resched (the bit is cleared) and can resched (no preempt count).
+ */
+
+static __always_inline void set_preempt_need_resched(void)
+{
+ *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline void clear_preempt_need_resched(void)
+{
+ *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline bool test_preempt_need_resched(void)
+{
+ return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
+}
+
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
extern void add_preempt_count(int val);
extern void sub_preempt_count(int val);
@@ -42,7 +81,7 @@ asmlinkage void preempt_schedule(void);
#define preempt_check_resched() \
do { \
- if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+ if (unlikely(!*preempt_count_ptr())) \
preempt_schedule(); \
} while (0)
@@ -52,7 +91,7 @@ void preempt_schedule_context(void);
#define preempt_check_resched_context() \
do { \
- if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+ if (unlikely(!*preempt_count_ptr())) \
preempt_schedule_context(); \
} while (0)
#else
@@ -88,7 +127,6 @@ do { \
#define preempt_enable() \
do { \
preempt_enable_no_resched(); \
- barrier(); \
preempt_check_resched(); \
} while (0)
@@ -116,7 +154,6 @@ do { \
#define preempt_enable_notrace() \
do { \
preempt_enable_no_resched_notrace(); \
- barrier(); \
preempt_check_resched_context(); \
} while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e783ec5..9fa151f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,6 +22,7 @@ struct sched_param {
#include <linux/errno.h>
#include <linux/nodemask.h>
#include <linux/mm_types.h>
+#include <linux/preempt.h>
#include <asm/page.h>
#include <asm/ptrace.h>
@@ -434,7 +435,9 @@ struct task_cputime {
* We include PREEMPT_ACTIVE to avoid cond_resched() from working
* before the scheduler is active -- see should_resched().
*/
-#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE)
+#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE + PREEMPT_NEED_RESCHED)
+#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED)
+#define PREEMPT_DISABLED (1 + PREEMPT_NEED_RESCHED)
/**
* struct thread_group_cputimer - thread group interval timer counts
@@ -2408,7 +2411,7 @@ static inline int signal_pending_state(long state, struct task_struct *p)
static inline int need_resched(void)
{
- return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+ return unlikely(test_preempt_need_resched());
}
/*
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index c261409..988573a 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -105,6 +105,13 @@ static void cpu_idle_loop(void)
__current_set_polling();
}
arch_cpu_idle_exit();
+ /*
+ * We need to test and propagate the TIF_NEED_RESCHED
+ * bit here because we might not have sent the
+ * reschedule IPI to idle tasks.
+ */
+ if (tif_need_resched())
+ set_preempt_need_resched();
}
tick_nohz_idle_exit();
schedule_preempt_disabled();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe89afa..ee61f5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -525,8 +525,10 @@ void resched_task(struct task_struct *p)
set_tsk_need_resched(p);
cpu = task_cpu(p);
- if (cpu == smp_processor_id())
+ if (cpu == smp_processor_id()) {
+ set_preempt_need_resched();
return;
+ }
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
@@ -1391,6 +1393,14 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ if (tif_need_resched())
+ set_preempt_need_resched();
+
if (llist_empty(&this_rq()->wake_list)
&& !tick_nohz_full_cpu(smp_processor_id())
&& !got_nohz_idle_kick())
@@ -1714,7 +1724,7 @@ void sched_fork(struct task_struct *p)
#endif
#ifdef CONFIG_PREEMPT_COUNT
/* Want to start with kernel preemption disabled. */
- task_thread_info(p)->preempt_count = 1;
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED;
#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
@@ -2425,6 +2435,7 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
rq->skip_clock_update = 0;
if (likely(prev != next)) {
@@ -2536,11 +2547,10 @@ EXPORT_SYMBOL(preempt_schedule);
*/
asmlinkage void __sched preempt_schedule_irq(void)
{
- struct thread_info *ti = current_thread_info();
enum ctx_state prev_state;
/* Catch callers which need to be fixed */
- BUG_ON(ti->preempt_count || !irqs_disabled());
+ BUG_ON(preempt_count() || !irqs_disabled());
prev_state = exception_enter();
@@ -4207,7 +4217,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
- task_thread_info(idle)->preempt_count = 0;
+ task_thread_info(idle)->preempt_count = PREEMPT_ENABLED;
/*
* The idle tasks have their own, simple scheduling class:
On Wed, Sep 25, 2013 at 09:38:38AM -0700, tip-bot for Peter Zijlstra wrote:
> Commit-ID: f27dde8deef33c9e58027df11ceab2198601d6a6
> Gitweb: http://git.kernel.org/tip/f27dde8deef33c9e58027df11ceab2198601d6a6
> Author: Peter Zijlstra <[email protected]>
> AuthorDate: Wed, 14 Aug 2013 14:55:31 +0200
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Wed, 25 Sep 2013 14:07:49 +0200
>
> sched: Add NEED_RESCHED to the preempt_count
Hi Peter,
FYI, here we found a pigz regression by this commit.
Here is a list of all the stats changed by this commit and its parent 4a2b4b222743bb07fedf985b884550f2ca067ea9:
f27dde8deef33c9e58027df11ce 4a2b4b222743bb07fedf985b884
pigz.throughput [ 1.3953 - 2.4574 ] [ 391.49 - 392.43 ]
vmstat.cpu.id [ 99 - 99 ] [ 11 - 12 ]
vmstat.system.cs [ 968 - 1159 ] [ 63025 - 63666 ]
iostat.cpu.user [ 0.28527 - 0.51661 ] [ 86.299 - 86.544 ]
time.user_time [ 196.66 - 280.86 ] [ 16712 - 16759 ]
time.percent_of_cpu_this_job_got [ 124 - 143 ] [ 5642 - 5658 ]
time.elapsed_time [ 338.69 - 398.91 ] [ 300.2 - 300.23 ]
time.involuntary_context_switches [ 3184 - 4753 ] [ 5.536e+05 - 5.9042e+05 ]
time.voluntary_context_switches [ 84559 - 1.2223e+05 ] [ 1.1008e+07 - 1.1092e+07 ]
lock_stat.clockevents_lock.contentions [ 1.4125e+05 - 1.6922e+05 ] [ 1.6487e+06 - 1.679e+06 ]
lock_stat.clockevents_lock.contentions.clockevents_notify [ 2.8253e+05 - 3.3844e+05 ] [ 3.2975e+06 - 3.358e+06 ]
lock_stat.&pipe->mutex/1.contentions.pipe_write [ 2533 - 3198 ] [ 27814 - 28802 ]
lock_stat.jiffies_lock.contentions.tick_do_update_jiffies64 [ 1008 - 1822 ] [ 31272 - 33448 ]
lock_stat.jiffies_lock.contentions [ 504 - 911 ] [ 15636 - 16724 ]
iostat.cpu.idle [ 99.217 - 99.446 ] [ 12.081 - 12.271 ]
lock_stat.&rq->lock.contentions [ 7878 - 9593 ] [ 1.7954e+05 - 1.8646e+05 ]
lock_stat.rcu_node_1.contentions.force_qs_rnp [ 2200 - 2564 ] [ 33544 - 35368 ]
lock_stat.&(&futex_queues[i].lock)->rlock.contentions.futex_wake [ 9993 - 16537 ] [ 1.3794e+06 - 1.4482e+06 ]
lock_stat.rcu_node_1.contentions [ 7854 - 9260 ] [ 2.4984e+05 - 2.6247e+05 ]
lock_stat.&(&futex_queues[i].lock)->rlock.contentions [ 8779 - 14582 ] [ 1.2373e+06 - 1.301e+06 ]
lock_stat.&pipe->mutex/1.contentions [ 3572 - 4125 ] [ 1.4362e+05 - 1.507e+05 ]
lock_stat.rcu_node_1.contentions.rcu_process_callbacks [ 9511 - 11254 ] [ 4.5502e+05 - 4.7776e+05 ]
lock_stat.&pipe->mutex/1.contentions.pipe_read [ 2163 - 2683 ] [ 1.3208e+05 - 1.3934e+05 ]
vmstat.system.in [ 713 - 799 ] [ 27310 - 27526 ]
lock_stat.&pipe->mutex/1.contentions.pipe_lock_nested [ 1802 - 2219 ] [ 1.2618e+05 - 1.334e+05 ]
lock_stat.&(&futex_queues[i].lock)->rlock.contentions.futex_wait_setup [ 6447 - 10781 ] [ 9.3156e+05 - 9.9482e+05 ]
iostat.cpu.system [ 0.25493 - 0.29317 ] [ 1.3752 - 1.4608 ]
vmstat.cpu.us [ 0 - 0 ] [ 86 - 86 ]
lock_stat.&rq->lock.contentions.try_to_wake_up [ 4928 - 6018 ] [ 1.1351e+05 - 1.5691e+05 ]
lock_stat.&rq->lock.contentions.__schedule [ 3036 - 3658 ] [ 81508 - 1.3994e+05 ]
lock_stat.&(&futex_queues[i].lock)->rlock/1.contentions.futex_requeue [ 57 - 114 ] [ 9139 - 16865 ]
lock_stat.&(&futex_queues[i].lock)->rlock/1.contentions [ 57 - 114 ] [ 9135 - 16860 ]
lock_stat.&(&futex_queues[i].lock)->rlock/1.contentions.futex_wait_setup [ 55 - 100 ] [ 8253 - 15331 ]
lock_stat.&(&futex_queues[i].lock)->rlock.contentions.futex_requeue [ 753 - 1810 ] [ 83288 - 1.6982e+05 ]
time.minor_page_faults [ 66817 - 67950 ] [ 69059 - 69092 ]
And here are text plots for the above stats:
O for 4a2b4b222743bb07fedf985b884550f2ca067ea9
* for f27dde8deef33c9e58027df11ceab2198601d6a6
pigz.throughput
400 O+------O------O-------O-------O------O-------O-------O------O-------O
| |
350 ++ |
300 ++ |
| |
250 ++ |
| |
200 ++ |
| |
150 ++ |
100 ++ |
| |
50 ++ |
| |
0 *+------*------*-------*-------*------*-------*-------*------*-------*
time.user_time
18000 ++-----------------------------------------------------------------+
O O O O O O O O O O
16000 ++ |
14000 ++ |
| |
12000 ++ |
10000 ++ |
| |
8000 ++ |
6000 ++ |
| |
4000 ++ |
2000 ++ |
| |
0 *+-----*-------*------*-------*------*-------*------*-------*------*
time.percent_of_cpu_this_job_got
6000 ++------------------------------------------------------------------+
O O O O O O O O O O
5000 ++ |
| |
| |
4000 ++ |
| |
3000 ++ |
| |
2000 ++ |
| |
| |
1000 ++ |
| |
0 *+------*------*-------*------*-------*------*-------*------*-------*
time.elapsed_time
400 ++------*--------------*---------------------------------------------+
390 ++ ... . .. .. |
| .. . .. . |
380 *+ .. . . |
370 ++ .. .. |
| * ..*.. |
360 ++ *..... .... .. |
350 ++ . ....*. .. |
340 ++ *... |
| *.......*
330 ++ |
320 ++ |
| |
310 ++ |
300 O+------O------O-------O-------O------O-------O-------O------O-------O
time.minor_page_faults
69500 ++-----------------------------------------------------------------+
| O O O O O O O O
69000 O+ O |
68500 ++ |
| |
68000 ++ ....*......*..... ...*
*......*... .. .*. *... |
67500 ++ *..... ... . .. |
| . .. .. . |
67000 ++ *. . . |
66500 ++ . .. |
| . |
66000 ++ * |
| |
65500 ++-----------------------------------------------------------------+
time.voluntary_context_switches
1.2e+07 ++---------------------------------------------------------------+
O O O O O O O O O O
1e+07 ++ |
| |
| |
8e+06 ++ |
| |
6e+06 ++ |
| |
4e+06 ++ |
| |
| |
2e+06 ++ |
| |
0 *+-----*------*-------*------*------*------*-------*------*------*
time.involuntary_context_switches
700000 ++----------------------------------------------------------------+
| |
600000 O+ O O |
| O O O O O O O
500000 ++ |
| |
400000 ++ |
| |
300000 ++ |
| |
200000 ++ |
| |
100000 ++ |
| |
0 *+-----*-------*------*------*-------*------*------*-------*------*
vmstat.system.in
30000 ++-----------------------------------------------------------------+
O O O O O O O O O O
25000 ++ |
| |
| |
20000 ++ |
| |
15000 ++ |
| |
10000 ++ |
| |
| |
5000 ++ |
| |
0 *+-----*-------*------*-------*------*-------*------*-------*------*
vmstat.system.cs
70000 ++-----------------------------------------------------------------+
O O O O O O O O O O
60000 ++ |
| |
50000 ++ |
| |
40000 ++ |
| |
30000 ++ |
| |
20000 ++ |
| |
10000 ++ |
| |
0 *+-----*-------*------*-------*------*-------*------*-------*------*
vmstat.cpu.us
90 ++--------------------------------------------------------------------+
O O O O O O O O O O
80 ++ |
70 ++ |
| |
60 ++ |
50 ++ |
| |
40 ++ |
30 ++ |
| |
20 ++ |
10 ++ |
| |
0 *+------*-------*------*-------*-------*-------*------*-------*-------*
vmstat.cpu.id
100 *+------*------*-------*-------*------*-------*-------*------*-------*
| |
90 ++ |
80 ++ |
| |
70 ++ |
60 ++ |
| |
50 ++ |
40 ++ |
| |
30 ++ |
20 ++ |
| |
10 O+------O------O-------O-------O------O-------O-------O------O-------O
lock_stat.clockevents_lock.contentions
1.8e+06 ++---------------------------------------------------------------+
O O O O O O O O O O
1.6e+06 ++ |
1.4e+06 ++ |
| |
1.2e+06 ++ |
1e+06 ++ |
| |
800000 ++ |
600000 ++ |
| |
400000 ++ |
200000 ++ |
*......*......*.......*......*......*......*.......*......*......*
0 ++---------------------------------------------------------------+
lock_stat.clockevents_lock.contentions.clockevents_notify
3.5e+06 ++---------------------------------------------------------------+
O O O O O O O O O O
3e+06 ++ |
| |
2.5e+06 ++ |
| |
2e+06 ++ |
| |
1.5e+06 ++ |
| |
1e+06 ++ |
| |
500000 ++ |
*......*......*.......*......*......*......*.......*......*......*
0 ++---------------------------------------------------------------+
lock_stat.&rq->lock.contentions
200000 ++----------------------------------------------------------------+
180000 O+ O O O O O O O O |
| O
160000 ++ |
140000 ++ |
| |
120000 ++ |
100000 ++ |
80000 ++ |
| |
60000 ++ |
40000 ++ |
| |
20000 *+.....*.......*......*......*.......*......*......*.......*......*
0 ++----------------------------------------------------------------+
lock_stat.&rq->lock.contentions.try_to_wake_up
160000 ++-------------O---------------------O----------------------------+
| |
140000 ++ |
120000 ++ O O O O |
O O O
100000 ++ |
| |
80000 ++ |
| |
60000 ++ O |
40000 ++ |
| |
20000 ++ |
| ...*.......*...... ...*....... ...*
0 *+--------------------*------*-------*------*--------------*------+
lock_stat.&rq->lock.contentions.__schedule
160000 O+----------------------------------------------------------------+
| |
140000 ++ O |
120000 ++ O |
| |
100000 ++ O O O O
| |
80000 ++ O O O |
| |
60000 ++ |
40000 ++ |
| |
20000 ++ |
| |
0 *+-----*-------*------*------*-------*------*------*-------*------*
lock_stat.&(&futex_queues[i].lock)->rlock.contentions
1.4e+06 ++---------------------------------------------------------------+
O O O O O O O O
1.2e+06 ++ O O |
| |
1e+06 ++ |
| |
800000 ++ |
| |
600000 ++ |
| |
400000 ++ |
| |
200000 ++ |
| |
0 *+-----*------*-------*------*------*------*-------*------*------*
lock_stat.&(&futex_queues[i].lock)->rlock.contentions.futex_wake
1.6e+06 ++---------------------------------------------------------------+
O O O O O
1.4e+06 ++ O O O O O |
| |
1.2e+06 ++ |
1e+06 ++ |
| |
800000 ++ |
| |
600000 ++ |
400000 ++ |
| |
200000 ++ |
| |
0 *+-----*------*-------*------*------*------*-------*------*------*
lock_stat.&(&futex_queues[i].lock)->rlock.contentions.futex_wait_setup
1.2e+06 ++---------------------------------------------------------------+
| |
1e+06 ++ O O |
O O O O O O O O
| |
800000 ++ |
| |
600000 ++ |
| |
400000 ++ |
| |
| |
200000 ++ |
| |
0 *+-----*------*-------*------*------*------*-------*------*------*
lock_stat.&(&futex_queues[i].lock)->rlock.contentions.futex_requeue
180000 ++----------------------------------------------------------------+
O O O O O
160000 ++ O |
140000 ++ |
| O |
120000 ++ |
100000 ++ O |
| |
80000 ++ O O |
60000 ++ |
| |
40000 ++ |
20000 ++ |
| |
0 *+-----*-------*------*------*-------*------*------*-------*------*
lock_stat.rcu_node_1.contentions
300000 ++----------------------------------------------------------------+
| |
250000 O+ O O O O O O O O
| O |
| |
200000 ++ |
| |
150000 ++ |
| |
100000 ++ |
| |
| |
50000 ++ |
| |
0 *+-----*-------*------*------*-------*------*------*-------*------*
lock_stat.rcu_node_1.contentions.rcu_process_callbacks
500000 ++----------------------------------------------------------------+
450000 O+ O O O O O O O O O
| |
400000 ++ |
350000 ++ |
| |
300000 ++ |
250000 ++ |
200000 ++ |
| |
150000 ++ |
100000 ++ |
| |
50000 ++ |
0 *+-----*-------*------*------*-------*------*------*-------*------*
lock_stat.rcu_node_1.contentions.force_qs_rnp
40000 ++-----------------------------------------------------------------+
| |
35000 O+ O O O O O O O O
30000 ++ O |
| |
25000 ++ |
| |
20000 ++ |
| |
15000 ++ |
10000 ++ |
| |
5000 ++ |
*......*.......*......*.......*......*.......*......*.......*......*
0 ++-----------------------------------------------------------------+
lock_stat.&pipe->mutex/1.contentions
160000 ++-----------------------------------O----------------------------+
| O O O O O O O O
140000 O+ |
120000 ++ |
| |
100000 ++ |
| |
80000 ++ |
| |
60000 ++ |
40000 ++ |
| |
20000 ++ |
| |
0 *+-----*-------*------*------*-------*------*------*-------*------*
lock_stat.&pipe->mutex/1.contentions.pipe_lock_nested
140000 ++-----------------------------------O----------------------------+
O O O O O O O O O
120000 ++ |
| |
100000 ++ |
| |
80000 ++ |
| |
60000 ++ |
| |
40000 ++ |
| |
20000 ++ |
| |
0 *+-----*-------*------*------*-------*------*------*-------*------*
lock_stat.&pipe->mutex/1.contentions.pipe_read
160000 ++----------------------------------------------------------------+
| O |
140000 ++ O O O O O O O O
120000 O+ |
| |
100000 ++ |
| |
80000 ++ |
| |
60000 ++ |
40000 ++ |
| |
20000 ++ |
| |
0 *+-----*-------*------*------*-------*------*------*-------*------*
lock_stat.&pipe->mutex/1.contentions.pipe_write
30000 ++-----------------------------------------------------------------+
O O O O O O O O O O
25000 ++ |
| |
| |
20000 ++ |
| |
15000 ++ |
| |
10000 ++ |
| |
| |
5000 ++ ...*....... |
*......*.......*......*.......*......*.......*... *......*
0 ++-----------------------------------------------------------------+
lock_stat.jiffies_lock.contentions
18000 ++-----------------------------------------------------------------+
O O O O O O |
16000 ++ O O O O
14000 ++ |
| |
12000 ++ |
10000 ++ |
| |
8000 ++ |
6000 ++ |
| |
4000 ++ |
2000 ++ |
*......*....... ...*....... ...*....... ...*.......*......*
0 ++-------------*--------------*--------------*---------------------+
lock_stat.jiffies_lock.contentions.tick_do_update_jiffies64
35000 ++-----------------------------------------------------------------+
O O O O O O O O O
30000 ++ O |
| |
25000 ++ |
| |
20000 ++ |
| |
15000 ++ |
| |
10000 ++ |
| |
5000 ++ |
*......*.......*......*....... ...*....... ...*.......*......*
0 ++----------------------------*--------------*---------------------+
lock_stat.&(&futex_queues[i].lock)->rlock/1.contentions
18000 ++-----------------------------------------------------------------+
| O O |
16000 ++ |
14000 O+ O |
| O |
12000 ++ O O |
10000 ++ O
| O |
8000 ++ O |
6000 ++ |
| |
4000 ++ |
2000 ++ |
| |
0 *+-----*-------*------*-------*------*-------*------*-------*------*
lock_stat.&(&futex_queues[i].lock)->rlock/1.contentions.futex_requeue
18000 ++-----------------------------------------------------------------+
| O O |
16000 ++ |
14000 O+ O O |
| |
12000 ++ O O |
10000 ++ O
| O |
8000 ++ O |
6000 ++ |
| |
4000 ++ |
2000 ++ |
| |
0 *+-----*-------*------*-------*------*-------*------*-------*------*
lock_stat.&(&futex_queues[i].lock)->rlock/1.contentions.futex_wait_setup
16000 ++-------------O---------------------------------------------------+
| O |
14000 O+ |
12000 ++ O |
| |
10000 ++ O O O
| O |
8000 ++ O O |
| |
6000 ++ |
4000 ++ |
| |
2000 ++ |
| |
0 *+-----*-------*------*-------*------*-------*------*-------*------*
iostat.cpu.user
90 ++--------------------------------------------------------------------+
O O O O O O O O O O
80 ++ |
70 ++ |
| |
60 ++ |
50 ++ |
| |
40 ++ |
30 ++ |
| |
20 ++ |
10 ++ |
| |
0 *+------*-------*------*-------*-------*-------*------*-------*-------*
iostat.cpu.system
1.6 ++-------------------------------------------------------------------+
| O O O |
1.4 O+ O O O O O O
| |
1.2 ++ |
| |
1 ++ |
| |
0.8 ++ |
| |
0.6 ++ |
| |
0.4 ++ |
*.......*......*.......*.......*......*.......*.......*...... ....*
0.2 ++-----------------------------------------------------------*-------+
iostat.cpu.idle
100 *+------*------*-------*-------*------*-------*-------*------*-------*
| |
90 ++ |
80 ++ |
| |
70 ++ |
60 ++ |
| |
50 ++ |
40 ++ |
| |
30 ++ |
20 ++ |
| |
10 O+------O------O-------O-------O------O-------O-------O------O-------O
And here is the bisect log:
---
:040000 040000 138132af1aff14d343d8355837e7f98b6e92cfe1 333aa1c56d624a9991ba13cef1dc08b9976d666a M include
:040000 040000 b47be033e17c9dc558db7bdd3fab1a67b5a8596d 3ba4f3b71ee73a57f09204cba4247133479c12b6 M kernel
bisect run success
# bad: [62a27650cea1a65e4b951d0b58d089c9d206b2d7] Merge remote-tracking branch 'radeon-alex/drm-fixes-3.12' into kbuild_tmp
# good: [272b98c6455f00884f0350f775c5342358ebb73f] Linux 3.12-rc1
git bisect start '62a27650cea1a65e4b951d0b58d089c9d206b2d7' '272b98c6455f00884f0350f775c5342358ebb73f' '--'
# good: [d0060a777b3d1fd37bdeeabbc6af4a177fa1ad6d] drm/i915: clean up and simplify i9xx_crtc_mode_set wrt PLL handling
git bisect good d0060a777b3d1fd37bdeeabbc6af4a177fa1ad6d
# good: [57a42192e61e7d0074fa6027e2565461bfd60147] staging: dgap: driver.c: removes smatch warning "redundant null check"
git bisect good 57a42192e61e7d0074fa6027e2565461bfd60147
# bad: [d2d55f430eac6bd20dcb3b8b11bfb3018d3fadef] Merge remote-tracking branch 'arm-perf/aarch64' into kbuild_tmp
git bisect bad d2d55f430eac6bd20dcb3b8b11bfb3018d3fadef
# bad: [324f7d868098d2fc547f9b76a77a610d53ca61b5] Merge remote-tracking branch 'smack/next' into kbuild_tmp
git bisect bad 324f7d868098d2fc547f9b76a77a610d53ca61b5
# bad: [1a338ac32ca630f67df25b4a16436cccc314e997] sched, x86: Optimize the preempt_schedule() call
git bisect bad 1a338ac32ca630f67df25b4a16436cccc314e997
# good: [f48627e686a69f5215cb0761e731edb3d9859dd9] sched/balancing: Periodically decay max cost of idle balance
git bisect good f48627e686a69f5215cb0761e731edb3d9859dd9
# good: [4a2b4b222743bb07fedf985b884550f2ca067ea9] sched: Introduce preempt_count accessor functions
git bisect good 4a2b4b222743bb07fedf985b884550f2ca067ea9
# bad: [01028747559ac6c6f642a7bbd2875cc4f66b2feb] sched: Create more preempt_count accessors
git bisect bad 01028747559ac6c6f642a7bbd2875cc4f66b2feb
# bad: [a787870924dbd6f321661e06d4ec1c7a408c9ccf] sched, arch: Create asm/preempt.h
git bisect bad a787870924dbd6f321661e06d4ec1c7a408c9ccf
# bad: [f27dde8deef33c9e58027df11ceab2198601d6a6] sched: Add NEED_RESCHED to the preempt_count
git bisect bad f27dde8deef33c9e58027df11ceab2198601d6a6
# first bad commit: [f27dde8deef33c9e58027df11ceab2198601d6a6] sched: Add NEED_RESCHED to the preempt_count
Please feel free to ask for more data if necessary.
Thanks.
--yliu
On Fri, Sep 27, 2013 at 05:14:27PM +0800, Yuanhan Liu wrote:
> On Wed, Sep 25, 2013 at 09:38:38AM -0700, tip-bot for Peter Zijlstra wrote:
> > Commit-ID: f27dde8deef33c9e58027df11ceab2198601d6a6
> > Gitweb: http://git.kernel.org/tip/f27dde8deef33c9e58027df11ceab2198601d6a6
> > Author: Peter Zijlstra <[email protected]>
> > AuthorDate: Wed, 14 Aug 2013 14:55:31 +0200
> > Committer: Ingo Molnar <[email protected]>
> > CommitDate: Wed, 25 Sep 2013 14:07:49 +0200
> >
> > sched: Add NEED_RESCHED to the preempt_count
>
> Hi Peter,
>
> FYI, here we found a pigz regression by this commit.
I just can't help myself; but pigs aren't supposed to fly anyway! :-)
> Here is a list all of changed stats by this commit and it's parent 4a2b4b222743bb07fedf985b884550f2ca067ea9:
>
> time.involuntary_context_switches [ 3184 - 4753 ] [ 5.536e+05 - 5.9042e+05 ]
> time.voluntary_context_switches [ 84559 - 1.2223e+05 ] [ 1.1008e+07 - 1.1092e+07 ]
Gah, that looks like it completely wrecked preemptions.
What kind of PREEMPT Kconfig options do you use? Or rather; could you
attach your .config?
On Fri, Sep 27, 2013 at 01:57:22PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 27, 2013 at 05:14:27PM +0800, Yuanhan Liu wrote:
> > On Wed, Sep 25, 2013 at 09:38:38AM -0700, tip-bot for Peter Zijlstra wrote:
> > > Commit-ID: f27dde8deef33c9e58027df11ceab2198601d6a6
> > > Gitweb: http://git.kernel.org/tip/f27dde8deef33c9e58027df11ceab2198601d6a6
> > > Author: Peter Zijlstra <[email protected]>
> > > AuthorDate: Wed, 14 Aug 2013 14:55:31 +0200
> > > Committer: Ingo Molnar <[email protected]>
> > > CommitDate: Wed, 25 Sep 2013 14:07:49 +0200
> > >
> > > sched: Add NEED_RESCHED to the preempt_count
> >
> > Hi Peter,
> >
> > FYI, here we found a pigz regression by this commit.
>
> I just can't help myself; but pigs aren't supposed to fly anyway! :-)
Aha!
> > Here is a list all of changed stats by this commit and it's parent 4a2b4b222743bb07fedf985b884550f2ca067ea9:
> >
> > time.involuntary_context_switches [ 3184 - 4753 ] [ 5.536e+05 - 5.9042e+05 ]
> > time.voluntary_context_switches [ 84559 - 1.2223e+05 ] [ 1.1008e+07 - 1.1092e+07 ]
>
> Gah, that looks like it completely wrecked preemptions.
>
> What kind of PREEMPT Kconfig options do you use? Or rather; could you
> attach your .config?
Attached.
Thanks,
Fengguang
On Fri, Sep 27, 2013 at 08:13:11PM +0800, Fengguang Wu wrote:
> On Fri, Sep 27, 2013 at 01:57:22PM +0200, Peter Zijlstra wrote:
> > On Fri, Sep 27, 2013 at 05:14:27PM +0800, Yuanhan Liu wrote:
> > > On Wed, Sep 25, 2013 at 09:38:38AM -0700, tip-bot for Peter Zijlstra wrote:
> > > > Commit-ID: f27dde8deef33c9e58027df11ceab2198601d6a6
> > > > Gitweb: http://git.kernel.org/tip/f27dde8deef33c9e58027df11ceab2198601d6a6
> > > > Author: Peter Zijlstra <[email protected]>
> > > > AuthorDate: Wed, 14 Aug 2013 14:55:31 +0200
> > > > Committer: Ingo Molnar <[email protected]>
> > > > CommitDate: Wed, 25 Sep 2013 14:07:49 +0200
> > > >
> > > > sched: Add NEED_RESCHED to the preempt_count
> > >
> > > Hi Peter,
> > >
> > > FYI, here we found a pigz regression by this commit.
> >
> > I just can't help myself; but pigs aren't supposed to fly anyway! :-)
>
> Aha!
> > > Here is a list all of changed stats by this commit and it's parent 4a2b4b222743bb07fedf985b884550f2ca067ea9:
> > >
> > > time.involuntary_context_switches [ 3184 - 4753 ] [ 5.536e+05 - 5.9042e+05 ]
> > > time.voluntary_context_switches [ 84559 - 1.2223e+05 ] [ 1.1008e+07 - 1.1092e+07 ]
> >
> > Gah, that looks like it completely wrecked preemptions.
> > What kind of PREEMPT Kconfig options do you use? Or rather; could you
> > attach your .config?
>
> Attached.
Sorry it's this one.
Thanks,
Fengguang
Subject: ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
From: Peter Zijlstra <[email protected]>
Date: Fri Sep 27 17:11:00 CEST 2013
Since we now have two need_resched states; trace the two so we can
observe discrepancies.
Cc: Steven Rostedt <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/trace/trace.c | 3 ++-
kernel/trace/trace.h | 1 +
kernel/trace/trace_output.c | 13 +++++++++++--
3 files changed, 14 insertions(+), 3 deletions(-)
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trac
#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
- (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+ (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_PREEMPT_RESCHED = 0x20,
};
#define TRACE_BUF_SIZE 1024
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,17 @@ int trace_print_lat_fmt(struct trace_seq
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
- need_resched =
- (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
+
+ if ((entry->flags & TRACE_FLAG_NEED_RESCHED) &&
+ (entry->flags & TRACE_FLAG_PREEMPT_RESCHED))
+ need_resched = 'N';
+ else if (entry->flags & TRACE_FLAG_NEED_RESCHED)
+ need_resched = 'n';
+ else if (entry->flags & TRACE_FLAG_PREEMPT_RESCHED)
+ need_resched = 'p';
+ else
+ need_resched = '.';
+
hardsoft_irq =
(hardirq && softirq) ? 'H' :
hardirq ? 'h' :
Subject: sched: Revert need_resched() to look at TIF_NEED_RESCHED
From: Peter Zijlstra <[email protected]>
Date: Fri Sep 27 17:20:30 CEST 2013
Yuanhan reported a serious throughput regression in his pigz
benchmark. Using the ftrace patch I found that several idle paths
need more TLC before we can switch the generic need_resched() over to
preempt_need_resched.
The preemption paths benefit most from preempt_need_resched and do
indeed use it; all other need_resched() users don't really care that
much so reverting need_resched() back to tif_need_resched() is the
simple and safe solution.
Reported-by: Yuanhan Liu <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/x86/include/asm/preempt.h | 8 --------
include/asm-generic/preempt.h | 8 --------
include/linux/sched.h | 5 +++++
3 files changed, 5 insertions(+), 16 deletions(-)
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -80,14 +80,6 @@ static __always_inline bool __preempt_co
}
/*
- * Returns true when we need to resched -- even if we can not.
- */
-static __always_inline bool need_resched(void)
-{
- return unlikely(test_preempt_need_resched());
-}
-
-/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(void)
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -85,14 +85,6 @@ static __always_inline bool __preempt_co
}
/*
- * Returns true when we need to resched -- even if we can not.
- */
-static __always_inline bool need_resched(void)
-{
- return unlikely(test_preempt_need_resched());
-}
-
-/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(void)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2577,6 +2577,11 @@ static inline bool __must_check current_
}
#endif
+static __always_inline bool need_resched(void)
+{
+ return unlikely(tif_need_resched());
+}
+
/*
* Thread group CPU time accounting.
*/
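To illustrate the idle-path problem the changelog above refers to, here is
a hedged userspace toy model (not kernel code, all names made up): a remote
resched of a polling idle CPU sets only TIF_NEED_RESCHED and skips the IPI,
so nothing folds the bit into the preempt_count, and a need_resched() based
on the preempt_count misses the wakeup while the TIF-based one sees it.

#include <stdbool.h>
#include <stdio.h>

#define TOY_PREEMPT_NEED_RESCHED 0x80000000u

static unsigned int toy_preempt_count = TOY_PREEMPT_NEED_RESCHED; /* idle, preemptible */
static bool toy_tif_need_resched;
static bool toy_cpu_is_polling = true; /* the idle loop polls instead of taking IPIs */

static bool need_resched_from_preempt_count(void)
{
	return !(toy_preempt_count & TOY_PREEMPT_NEED_RESCHED);
}

static bool need_resched_from_tif(void)
{
	return toy_tif_need_resched;
}

/* remote resched: always sets TIF, but only non-polling targets get the IPI/fold */
static void toy_remote_resched(void)
{
	toy_tif_need_resched = true;
	if (!toy_cpu_is_polling)
		toy_preempt_count &= ~TOY_PREEMPT_NEED_RESCHED;
}

int main(void)
{
	toy_remote_resched();
	printf("preempt_count view: %d\n", need_resched_from_preempt_count()); /* 0: wakeup missed */
	printf("TIF view:           %d\n", need_resched_from_tif());           /* 1: wakeup seen */
	return 0;
}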
Commit-ID: 75f93fed50c2abadbab6ef546b265f51ca975b27
Gitweb: http://git.kernel.org/tip/75f93fed50c2abadbab6ef546b265f51ca975b27
Author: Peter Zijlstra <[email protected]>
AuthorDate: Fri, 27 Sep 2013 17:30:03 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 28 Sep 2013 10:04:47 +0200
sched: Revert need_resched() to look at TIF_NEED_RESCHED
Yuanhan reported a serious throughput regression in his pigz
benchmark. Using the ftrace patch I found that several idle
paths need more TLC before we can switch the generic
need_resched() over to preempt_need_resched.
The preemption paths benefit most from preempt_need_resched and
do indeed use it; all other need_resched() users don't really
care that much so reverting need_resched() back to
tif_need_resched() is the simple and safe solution.
Reported-by: Yuanhan Liu <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Fengguang Wu <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: [email protected]
Cc: Linus Torvalds <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/preempt.h | 8 --------
include/asm-generic/preempt.h | 8 --------
include/linux/sched.h | 5 +++++
3 files changed, 5 insertions(+), 16 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 1de41690..8729723 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -80,14 +80,6 @@ static __always_inline bool __preempt_count_dec_and_test(void)
}
/*
- * Returns true when we need to resched -- even if we can not.
- */
-static __always_inline bool need_resched(void)
-{
- return unlikely(test_preempt_need_resched());
-}
-
-/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(void)
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index 5dc14ed..ddf2b42 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -85,14 +85,6 @@ static __always_inline bool __preempt_count_dec_and_test(void)
}
/*
- * Returns true when we need to resched -- even if we can not.
- */
-static __always_inline bool need_resched(void)
-{
- return unlikely(test_preempt_need_resched());
-}
-
-/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b09798b..2ac5285 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2577,6 +2577,11 @@ static inline bool __must_check current_clr_polling_and_test(void)
}
#endif
+static __always_inline bool need_resched(void)
+{
+ return unlikely(tif_need_resched());
+}
+
/*
* Thread group CPU time accounting.
*/
On Fri, Sep 27, 2013 at 05:29:08PM +0200, Peter Zijlstra wrote:
> Subject: ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
> From: Peter Zijlstra <[email protected]>
> Date: Fri Sep 27 17:11:00 CEST 2013
>
> Since we now have two need_resched states; trace the two so we can
> observe discrepancies.
>
> Cc: Steven Rostedt <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>
Steve, if you're done conferencing.. any objections?
> ---
> kernel/trace/trace.c | 3 ++-
> kernel/trace/trace.h | 1 +
> kernel/trace/trace_output.c | 13 +++++++++++--
> 3 files changed, 14 insertions(+), 3 deletions(-)
>
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trac
> #endif
> ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
> ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
> - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
> + (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
> + (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
> }
> EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
>
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -124,6 +124,7 @@ enum trace_flag_type {
> TRACE_FLAG_NEED_RESCHED = 0x04,
> TRACE_FLAG_HARDIRQ = 0x08,
> TRACE_FLAG_SOFTIRQ = 0x10,
> + TRACE_FLAG_PREEMPT_RESCHED = 0x20,
> };
>
> #define TRACE_BUF_SIZE 1024
> --- a/kernel/trace/trace_output.c
> +++ b/kernel/trace/trace_output.c
> @@ -618,8 +618,17 @@ int trace_print_lat_fmt(struct trace_seq
> (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
> (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
> '.';
> - need_resched =
> - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
> +
> + if ((entry->flags & TRACE_FLAG_NEED_RESCHED) &&
> + (entry->flags & TRACE_FLAG_PREEMPT_RESCHED))
> + need_resched = 'N';
> + else if (entry->flags & TRACE_FLAG_NEED_RESCHED)
> + need_resched = 'n';
> + else if (entry->flags & TRACE_FLAG_PREEMPT_RESCHED)
> + need_resched = 'p';
> + else
> + need_resched = '.';
> +
> hardsoft_irq =
> (hardirq && softirq) ? 'H' :
> hardirq ? 'h' :
On Fri, 4 Oct 2013 10:09:09 +0200
Peter Zijlstra <[email protected]> wrote:
> On Fri, Sep 27, 2013 at 05:29:08PM +0200, Peter Zijlstra wrote:
> > Subject: ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
> > From: Peter Zijlstra <[email protected]>
> > Date: Fri Sep 27 17:11:00 CEST 2013
> >
> > Since we now have two need_resched states; trace the two so we can
> > observe discrepancies.
> >
> > Cc: Steven Rostedt <[email protected]>
> > Signed-off-by: Peter Zijlstra <[email protected]>
>
> Steve, if you're done conferencing.. any objections?
>
Taking a quick look, my only objection so far is that the change log is
rather flimsy. What are the two states and what are their dependencies?
In other words, what do these flags in the trace actually mean?
Probably need to add comments in the code and/or update the
Documentation section.
-- Steve
On Fri, Oct 04, 2013 at 10:53:42AM -0400, Steven Rostedt wrote:
> On Fri, 4 Oct 2013 10:09:09 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > On Fri, Sep 27, 2013 at 05:29:08PM +0200, Peter Zijlstra wrote:
> > > Subject: ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
> > > From: Peter Zijlstra <[email protected]>
> > > Date: Fri Sep 27 17:11:00 CEST 2013
> > >
> > > Since we now have two need_resched states; trace the two so we can
> > > observe discrepancies.
> > >
> > > Cc: Steven Rostedt <[email protected]>
> > > Signed-off-by: Peter Zijlstra <[email protected]>
> >
> > Steve, if you're done conferencing.. any objections?
> >
>
> Taking a quick look, my only objection so far is that the change log is
> rather flimsy. What are the two states and what are their dependencies?
>
> In other words, what does these flags in the trace actually mean?
> Probably need to add comments in the code and/or update the
> Documentation section
Ah, you missed the preemption series?
1a338ac32ca6 sched, x86: Optimize the preempt_schedule() call
c2daa3bed53a sched, x86: Provide a per-cpu preempt_count implementation
a233f1120c37 sched: Prepare for per-cpu preempt_count
bdb438065890 sched: Extract the basic add/sub preempt_count modifiers
01028747559a sched: Create more preempt_count accessors
a787870924db sched, arch: Create asm/preempt.h
f27dde8deef3 sched: Add NEED_RESCHED to the preempt_count
4a2b4b222743 sched: Introduce preempt_count accessor functions
ea8117478918 sched, idle: Fix the idle polling state logic
315039862646 sched: Remove {set,clear}_need_resched
b021fe3e2509 sched, rcu: Make RCU use resched_cpu()
0c44c2d0f459 x86: Use asm goto to implement better modify_and_test() functions
preempt_count now includes a NEED_RESCHED bit and this patch shows which
is set: 'N' for both PREEMPT_NEED_RESCHED and TIF_NEED_RESCHED, 'n' for TIF only
and 'p' for preempt only.
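To make that concrete, a hypothetical latency-format flags field (assuming
the usual irqs-off / need-resched / hardirq-softirq / preempt-depth ordering
of that column; the neighbouring characters are illustrative only) would
then read something like:

  dnh.  - TIF_NEED_RESCHED set, not yet folded into the preempt_count
  dph.  - PREEMPT_NEED_RESCHED folded, TIF_NEED_RESCHED already cleared
  dNh.  - both set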
On Fri, 4 Oct 2013 17:16:18 +0200
Peter Zijlstra <[email protected]> wrote:
> Documentation section
>
> Ah, you missed the preemption series?
Yeah, I've seen them, just haven't looked at them too deeply yet.
>
> 1a338ac32ca6 sched, x86: Optimize the preempt_schedule() call
> c2daa3bed53a sched, x86: Provide a per-cpu preempt_count implementation
> a233f1120c37 sched: Prepare for per-cpu preempt_count
> bdb438065890 sched: Extract the basic add/sub preempt_count modifiers
> 01028747559a sched: Create more preempt_count accessors
> a787870924db sched, arch: Create asm/preempt.h
> f27dde8deef3 sched: Add NEED_RESCHED to the preempt_count
> 4a2b4b222743 sched: Introduce preempt_count accessor functions
> ea8117478918 sched, idle: Fix the idle polling state logic
> 315039862646 sched: Remove {set,clear}_need_resched
> b021fe3e2509 sched, rcu: Make RCU use resched_cpu()
> 0c44c2d0f459 x86: Use asm goto to implement better modify_and_test() functions
>
> preempt_count now includes a NEED_RESCHED and this patch shows which
> is set: N both PREEMPT_NEED_RESCHED and TIF_NEED_RESCHED, n for TIF only
> and p for preempt only.
You still need to update Documentation/trace/ftrace.txt. Search for
"need-resched"; that part will need updating.
Thanks,
-- Steve
On Fri, Oct 04, 2013 at 10:53:42AM -0400, Steven Rostedt wrote:
> In other words, what does these flags in the trace actually mean?
> Probably need to add comments in the code and/or update the
> Documentation section
If "task need resched" is supposed to explain things; the below too will
suffice... muwhahaha!
---
Subject: ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
From: Peter Zijlstra <[email protected]>
Date: Fri Sep 27 17:11:00 CEST 2013
Since the introduction of PREEMPT_NEED_RESCHED; see commit:
f27dde8deef3 ("sched: Add NEED_RESCHED to the preempt_count") we need
to be able to look at both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED
to understand the full preemption behaviour. Add it to the trace
output.
Cc: Steven Rostedt <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
---
Documentation/trace/ftrace.txt | 6 +++++-
kernel/trace/trace.c | 3 ++-
kernel/trace/trace.h | 1 +
kernel/trace/trace_output.c | 13 +++++++++++--
4 files changed, 19 insertions(+), 4 deletions(-)
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -655,7 +655,11 @@ explains which is which.
read the irq flags variable, an 'X' will always
be printed here.
- need-resched: 'N' task need_resched is set, '.' otherwise.
+ need-resched:
+ 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED are set,
+ 'n' only TIF_NEED_RESCHED is set,
+ 'p' only PREEMPT_NEED_RESCHED is set,
+ '.' otherwise.
hardirq/softirq:
'H' - hard irq occurred inside a softirq.
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trac
#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
- (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+ (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_PREEMPT_RESCHED = 0x20,
};
#define TRACE_BUF_SIZE 1024
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,17 @@ int trace_print_lat_fmt(struct trace_seq
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
- need_resched =
- (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
+
+ if ((entry->flags & TRACE_FLAG_NEED_RESCHED) &&
+ (entry->flags & TRACE_FLAG_PREEMPT_RESCHED))
+ need_resched = 'N';
+ else if (entry->flags & TRACE_FLAG_NEED_RESCHED)
+ need_resched = 'n';
+ else if (entry->flags & TRACE_FLAG_PREEMPT_RESCHED)
+ need_resched = 'p';
+ else
+ need_resched = '.';
+
hardsoft_irq =
(hardirq && softirq) ? 'H' :
hardirq ? 'h' :
On Fri, 4 Oct 2013 17:28:26 +0200
Peter Zijlstra <[email protected]> wrote:
> On Fri, Oct 04, 2013 at 10:53:42AM -0400, Steven Rostedt wrote:
> > In other words, what does these flags in the trace actually mean?
> > Probably need to add comments in the code and/or update the
> > Documentation section
>
> If "task need resched" is supposed to explain things; the below too will
> suffice... muwhahaha!
Damn! You caught on. (/me fails to get Peter to explain my
documentation better)
>
> ---
> Subject: ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
> From: Peter Zijlstra <[email protected]>
> Date: Fri Sep 27 17:11:00 CEST 2013
>
> Since the introduction of PREEMPT_NEED_RESCHED; see commit:
> f27dde8deef3 ("sched: Add NEED_RESCHED to the preempt_count") we need
> to be able to look at both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED
> to understand the full preemption behaviour. Add it to the trace
> output.
This is much better than your previous change log. At least now we have
a pointer to what to read to understand this change.
>
> Cc: Steven Rostedt <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Link: http://lkml.kernel.org/n/[email protected]
> ---
> Documentation/trace/ftrace.txt | 6 +++++-
> kernel/trace/trace.c | 3 ++-
> kernel/trace/trace.h | 1 +
> kernel/trace/trace_output.c | 13 +++++++++++--
> 4 files changed, 19 insertions(+), 4 deletions(-)
>
> --- a/Documentation/trace/ftrace.txt
> +++ b/Documentation/trace/ftrace.txt
> @@ -655,7 +655,11 @@ explains which is which.
> read the irq flags variable, an 'X' will always
> be printed here.
>
> - need-resched: 'N' task need_resched is set, '.' otherwise.
> + need-resched:
> + 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set,
> + 'n' only TIF_NEED_RESCHED is set,
> + 'p' only PREEMPT_NEED_RESCHED is set,
> + '.' otherwise.
Yes this is actually good enough. I'll have to spend time to explain
this better. But I'll let you off the hook from doing it for me ;-)
>
> hardirq/softirq:
> 'H' - hard irq occurred inside a softirq.
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trac
> #endif
> ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
> ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
> - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
> + (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
> + (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
> }
> EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
>
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -124,6 +124,7 @@ enum trace_flag_type {
> TRACE_FLAG_NEED_RESCHED = 0x04,
> TRACE_FLAG_HARDIRQ = 0x08,
> TRACE_FLAG_SOFTIRQ = 0x10,
> + TRACE_FLAG_PREEMPT_RESCHED = 0x20,
We'll have to update libtraceevent to handle this. I'm hoping it
doesn't barf on the new flag. I don't think it would, but I need to
look at that code to make sure.
> };
>
> #define TRACE_BUF_SIZE 1024
> --- a/kernel/trace/trace_output.c
> +++ b/kernel/trace/trace_output.c
> @@ -618,8 +618,17 @@ int trace_print_lat_fmt(struct trace_seq
> (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
> (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
> '.';
> - need_resched =
> - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
> +
> + if ((entry->flags & TRACE_FLAG_NEED_RESCHED) &&
> + (entry->flags & TRACE_FLAG_PREEMPT_RESCHED))
> + need_resched = 'N';
> + else if (entry->flags & TRACE_FLAG_NEED_RESCHED)
> + need_resched = 'n';
> + else if (entry->flags & TRACE_FLAG_PREEMPT_RESCHED)
> + need_resched = 'p';
> + else
> + need_resched = '.';
> +
We could optimize the above with:
int ns;
ns = entry->flags & (TRACE_FLAG_NEED_RESCHED |
TRACE_FLAG_PREEMPT_RESCHED);
switch (ns) {
case 0:
need_resched = '.';
break;
case TRACE_FLAG_NEED_RESCHED:
need_resched = 'n';
break;
case TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'p';
break;
case TRACE_FLAG_NEED_RESCHED |
TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'N';
break;
}
Not sure if the above is more readable or not, or if it is worth it.
-- Steve
> hardsoft_irq =
> (hardirq && softirq) ? 'H' :
> hardirq ? 'h' :
On Fri, Oct 04, 2013 at 11:57:12AM -0400, Steven Rostedt wrote:
> > + need-resched:
> > + 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set,
> > + 'n' only TIF_NEED_RESCHED is set,
> > + 'p' only PREEMPT_NEED_RESCHED is set,
> > + '.' otherwise.
>
> Yes this is actually good enough. I'll have to spend time to explain
> this better. But I'll let you off the hook from doing it for me ;-)
beware; such endeavours might lead to books and those are typically out
of date by the time they hit the street -- ie. they're a fair waste of
time ;-)