Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752196AbdIVLym (ORCPT ); Fri, 22 Sep 2017 07:54:42 -0400 Received: from bombadil.infradead.org ([65.50.211.133]:57788 "EHLO bombadil.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752039AbdIVLyl (ORCPT ); Fri, 22 Sep 2017 07:54:41 -0400 Date: Fri, 22 Sep 2017 13:54:30 +0200 From: Peter Zijlstra To: Markus Trippelsdorf Cc: Tejun Heo , linux-kernel@vger.kernel.org, "Luis R. Rodriguez" , "Eric W. Biederman" , "Paul E. McKenney" , Linus Torvalds , Steven Rostedt , Thomas Gleixner , Ingo Molnar Subject: [RFC][PATCH] sched: Cleanup task->state printing Message-ID: <20170922115430.moipv7sts6v4t7sw@hirez.programming.kicks-ass.net> References: <20170910073653.GA284@x4> <20170911131128.GD1774378@devbig577.frc2.facebook.com> <20170911142133.GA2265@x4> <20170921110842.GA4020@x4> <20170921123000.bip2whks53bwn7de@hirez.programming.kicks-ass.net> <20170921144127.GA236@x4> <20170922093533.GA235@x4> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20170922093533.GA235@x4> User-Agent: NeoMutt/20170609 (1.8.3) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 13027 Lines: 375 On Fri, Sep 22, 2017 at 11:35:33AM +0200, Markus Trippelsdorf wrote: > > It seems to work. Simply returning "I (idle)" from get_task_state() in > > fs/proc/array.c when the state is TASK_IDLE does the trick. > > I've tested top, htop and ps. I ended up with the below; there was quite a lot of inconsistent state printing around it seems. I should probably split this thing into a bunch of patches :/ Alongside an explicit idle state, this also exposes TASK_PARKED, although arguably we could map that to idle too. Opinions? --- fs/proc/array.c | 35 ++++++++++++----------- include/linux/sched.h | 58 +++++++++++++++++++++++---------------- include/trace/events/sched.h | 24 +++++++++++----- kernel/sched/core.c | 22 ++++++++++++++- kernel/sched/debug.c | 2 -- kernel/trace/trace_output.c | 21 ++++---------- kernel/trace/trace_sched_wakeup.c | 12 ++++---- 7 files changed, 103 insertions(+), 71 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 88c355574aa0..5a076854857f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -118,28 +118,31 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. */ static const char * const task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "X (dead)", /* 16 */ - "Z (zombie)", /* 32 */ + /* states inside TASK_REPORT */ + + "R (running)", /* 0x00 */ + "S (sleeping)", /* 0x01 */ + "D (disk sleep)", /* 0x02 */ + "T (stopped)", /* 0x04 */ + "t (tracing stop)", /* 0x08 */ + "X (dead)", /* 0x10 */ + "Z (zombie)", /* 0x20 */ + "P (parked)", /* 0x40 */ + + /* extra states, beyond TASK_REPORT */ + + "I (idle)", /* 0x80 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; - /* - * Parked tasks do not run; they sit in __kthread_parkme(). - * Without this check, we would report them as running, which is - * clearly wrong, so we report them as sleeping instead. - */ - if (tsk->state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_MAX; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-2); return task_state_array[fls(state)]; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 68b38335d33c..7ae81efb17bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,25 +65,27 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING 0x0000 +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 +#define __TASK_STOPPED 0x0004 +#define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 16 -#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 0x0010 +#define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_PARKED 512 -#define TASK_NOLOAD 1024 -#define TASK_NEW 2048 -#define TASK_STATE_MAX 4096 +#define TASK_PARKED 0x0040 +#define TASK_REPORT_MAX 0x0080 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" +/* Not in TASK_REPORT: */ +#define TASK_DEAD 0x0080 +#define TASK_WAKEKILL 0x0100 +#define TASK_WAKING 0x0200 +#define TASK_NOLOAD 0x0400 +#define TASK_NEW 0x0800 + +#define TASK_STATE_MAX 0x1000 /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) @@ -96,10 +98,11 @@ struct task_group; #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) -/* get_task_state(): */ +/* task_state_to_char(), get_task_state(), trace_sched_switch() */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) @@ -1244,17 +1247,24 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } -static inline char task_state_to_char(struct task_struct *task) +static inline char __task_state_to_char(unsigned int state) { - const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - unsigned long state = task->state; + static const char state_char[] = "RSDTtXZPI"; - state = state ? __ffs(state) + 1 : 0; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 3); + + return state_char[fls(state)]; +} + +static inline char task_state_to_char(struct task_struct *task) +{ + unsigned int tsk_state = READ_ONCE(task->state); + unsigned int state = (tsk_state | task->exit_state) & TASK_REPORT; - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_MAX; - return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; + return __task_state_to_char(state); } /** diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ae1409ffe99a..af1858a01335 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -106,6 +106,8 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, #ifdef CREATE_TRACE_POINTS static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) { + unsigned int state = READ_ONCE(p->state); + #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ @@ -114,10 +116,18 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ - return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; + if (preempt) + return TASK_REPORT_MAX << 1; + + if (state == TASK_IDLE) + return TASK_REPORT_MAX; + + return (state | p->exit_state) & TASK_REPORT; } #endif /* CREATE_TRACE_POINTS */ +#define TRACE_REPORT_MASK ((TASK_REPORT_MAX << 1) - 1) + /* * Tracepoint for task switches, performed by the scheduler: */ @@ -152,13 +162,13 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->prev_state & (TASK_STATE_MAX-1) ? - __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", + + __entry->prev_state & TRACE_REPORT_MASK ? + __print_flags(__entry->prev_state & TRACE_REPORT_MASK, "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, - { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "K" }, { 256, "W" }, { 512, "P" }, - { 1024, "N" }) : "R", - __entry->prev_state & TASK_STATE_MAX ? "+" : "", + { 16, "X" }, { 32, "Z" }, { 64, "P" }, + { 128, "I" }) : "R", + __entry->prev_state & (TASK_REPORT_MAX << 1) ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 703f5831738e..431e2d6c709e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5164,6 +5164,26 @@ void sched_show_task(struct task_struct *p) put_task_stack(p); } +static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) +{ + /* no filter, everything matches */ + if (!state_filter) + return true; + + /* filter, but doesn't match */ + if (!(p->state & state_filter)) + return false; + + /* + * When looking for TASK_UNINTERRUPTIBLE, skip TASK_IDLE, but allow + * TASK_KILLABLE. + */ + if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + return false; + + return true; +} + void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; @@ -5186,7 +5206,7 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + if (state_filter_match(state_filter, p)) sched_show_task(p); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..244619e402cc 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -461,8 +461,6 @@ static char *task_group_path(struct task_group *tg) } #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..b14caa0afd35 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -380,7 +380,7 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) } static void -tracing_sched_switch_trace(struct trace_array *tr, +tracing_sched_switch_trace(bool preempt, struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc) @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __trace_sched_switch_state(preempt, prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __trace_sched_switch_state(false, next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __trace_sched_switch_state(false, curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __trace_sched_switch_state(false, wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -482,7 +482,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); - tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + tracing_sched_switch_trace(preempt, wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; T1 = ftrace_now(cpu);