2013-09-17 06:17:52

by Vineet Gupta

Subject: [RFC PATCH REBASED 0/3] Move ARCH specific fpu_counter out of task_struct

Hi,

This is a resend of the patches I sent last week, which failed to draw any
attention.

This came up when an ARC SMP 3.11 build failed because an ST insn accessing
task_struct.thread ended up with an out-of-range offset.

While staring at the code I spotted @fpu_counter in task_struct, which only SH/x86
happen to use and which can easily be moved out into the corresponding ARCH-specific
thread_struct. This saves 4 bytes per task_struct instance for all the other 18
arches.

One objection to the patch(es) could be that task_struct fields were at some
point arranged to reduce the number of cache line misses. However, there is a lot
of variance there due to the many #ifdefs, the variable size of cpumask (>32 cores),
etc., so that argument alone might not be sufficient vs. saving 4 bytes per
instance.

Comments please!

Thx,
-Vineet

Vineet Gupta (3):
sh: Move fpu_counter into ARCH specific thread_struct
x86: Move fpu_counter into ARCH specific thread_struct
sched: Remove ARCH specific fpu_counter from task_struct

arch/sh/include/asm/fpu.h | 2 +-
arch/sh/include/asm/processor_32.h | 10 ++++++++++
arch/sh/include/asm/processor_64.h | 10 ++++++++++
arch/sh/kernel/cpu/fpu.c | 2 +-
arch/sh/kernel/process_32.c | 6 +++---
arch/x86/include/asm/fpu-internal.h | 10 +++++-----
arch/x86/include/asm/processor.h | 9 +++++++++
arch/x86/kernel/i387.c | 2 +-
arch/x86/kernel/process_32.c | 4 ++--
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/traps.c | 2 +-
include/linux/sched.h | 9 ---------
12 files changed, 44 insertions(+), 24 deletions(-)

--
1.8.1.2


2013-09-17 06:18:23

by Vineet Gupta

Subject: [RFC PATCH REBASED 2/3] x86: Move fpu_counter into ARCH specific thread_struct

Only a couple of arches (sh/x86) use fpu_counter in task_struct so it
can be moved out into ARCH specific thread_struct, reducing the size of
task_struct for other arches.

Compile tested i386_defconfig + gcc 4.7.3

Signed-off-by: Vineet Gupta <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: Suresh Siddha <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Vincent Palatin <[email protected]>
Cc: Len Brown <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Paul Gortmaker <[email protected]>
Cc: Pekka Riikonen <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Dave Jones <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: [email protected]
---
arch/x86/include/asm/fpu-internal.h | 10 +++++-----
arch/x86/include/asm/processor.h | 9 +++++++++
arch/x86/kernel/i387.c | 2 +-
arch/x86/kernel/process_32.c | 4 ++--
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/traps.c | 2 +-
6 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 4d0bda7..c49a613 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -365,7 +365,7 @@ static inline void drop_fpu(struct task_struct *tsk)
* Forget coprocessor state..
*/
preempt_disable();
- tsk->fpu_counter = 0;
+ tsk->thread.fpu_counter = 0;
__drop_fpu(tsk);
clear_used_math();
preempt_enable();
@@ -424,7 +424,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
* or if the past 5 consecutive context-switches used math.
*/
fpu.preload = tsk_used_math(new) && (use_eager_fpu() ||
- new->fpu_counter > 5);
+ new->thread.fpu_counter > 5);
if (__thread_has_fpu(old)) {
if (!__save_init_fpu(old))
cpu = ~0;
@@ -433,16 +433,16 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta

/* Don't change CR0.TS if we just switch! */
if (fpu.preload) {
- new->fpu_counter++;
+ new->thread.fpu_counter++;
__thread_set_has_fpu(new);
prefetch(new->thread.fpu.state);
} else if (!use_eager_fpu())
stts();
} else {
- old->fpu_counter = 0;
+ old->thread.fpu_counter = 0;
old->thread.fpu.last_cpu = ~0;
if (fpu.preload) {
- new->fpu_counter++;
+ new->thread.fpu_counter++;
if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
fpu.preload = 0;
else
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 987c75e..7b034a4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -488,6 +488,15 @@ struct thread_struct {
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
+ /*
+ * fpu_counter contains the number of consecutive context switches
+ * that the FPU is used. If this is over a threshold, the lazy fpu
+ * saving becomes unlazy to save the trap. This is an unsigned char
+ * so that after 256 times the counter wraps and the behavior turns
+ * lazy again; this to deal with bursty apps that only use FPU for
+ * a short time
+ */
+ unsigned char fpu_counter;
};

/*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 5d576ab..e8368c6 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -100,7 +100,7 @@ void unlazy_fpu(struct task_struct *tsk)
__save_init_fpu(tsk);
__thread_fpu_end(tsk);
} else
- tsk->fpu_counter = 0;
+ tsk->thread.fpu_counter = 0;
preempt_enable();
}
EXPORT_SYMBOL(unlazy_fpu);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 884f98f..6af43b0 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -153,7 +153,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
childregs->orig_ax = -1;
childregs->cs = __KERNEL_CS | get_kernel_rpl();
childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
- p->fpu_counter = 0;
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
return 0;
@@ -166,7 +166,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());

- p->fpu_counter = 0;
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bb1dc51..bbab295 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -163,7 +163,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.sp = (unsigned long) childregs;
p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
- p->fpu_counter = 0;
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;

savesegment(gs, p->thread.gsindex);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8c8093b..64b980f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -653,7 +653,7 @@ void math_state_restore(void)
return;
}

- tsk->fpu_counter++;
+ tsk->thread.fpu_counter++;
}
EXPORT_SYMBOL_GPL(math_state_restore);

--
1.8.1.2
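
The net effect of the counter, wherever it lives, is the preload heuristic visible
in the switch_fpu_prepare() hunk above. Roughly, as a simplified paraphrase of that
logic rather than the literal kernel code:

	/* context switch: decide whether to preload FPU state for @new */
	fpu.preload = tsk_used_math(new) &&
		      (use_eager_fpu() || new->thread.fpu_counter > 5);

	if (fpu.preload)
		new->thread.fpu_counter++;	/* another consecutive FPU switch */

	/* Elsewhere the counter is bumped in math_state_restore() when the task
	 * traps into the FPU, and reset to 0 in drop_fpu()/unlazy_fpu() and in
	 * copy_thread(), so only tasks that keep using the FPU across
	 * consecutive switches cross the ">5" threshold and get preloaded. */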

2013-09-17 06:18:30

by Vineet Gupta

Subject: [RFC PATCH REBASED 3/3] sched: Remove ARCH specific fpu_counter from task_struct

fpu_counter in task_struct was used only by sh/x86.
Both of these now carry it in ARCH specific thread_struct, hence this
can now be removed from generic task_struct, shrinking it slightly for
other arches.

Signed-off-by: Vineet Gupta <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/sched.h | 9 ---------
1 file changed, 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6682da3..78386c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,15 +1046,6 @@ struct task_struct {
struct hlist_head preempt_notifiers;
#endif

- /*
- * fpu_counter contains the number of consecutive context switches
- * that the FPU is used. If this is over a threshold, the lazy fpu
- * saving becomes unlazy to save the trap. This is an unsigned char
- * so that after 256 times the counter wraps and the behavior turns
- * lazy again; this to deal with bursty apps that only use FPU for
- * a short time
- */
- unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
--
1.8.1.2

2013-09-17 06:18:36

by Vineet Gupta

Subject: [RFC PATCH REBASED 1/3] sh: Move fpu_counter into ARCH specific thread_struct

Only a couple of arches (sh/x86) use fpu_counter in task_struct so it
can be moved out into ARCH specific thread_struct, reducing the size of
task_struct for other arches.

Compile tested sh defconfig + sh4-linux-gcc (4.6.3)

Signed-off-by: Vineet Gupta <[email protected]>
Cc: Paul Mundt <[email protected]>
Cc: Michel Lespinasse <[email protected]>
Cc: Kuninori Morimoto <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Jesper Nilsson <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: [email protected]
---
arch/sh/include/asm/fpu.h | 2 +-
arch/sh/include/asm/processor_32.h | 10 ++++++++++
arch/sh/include/asm/processor_64.h | 10 ++++++++++
arch/sh/kernel/cpu/fpu.c | 2 +-
arch/sh/kernel/process_32.c | 6 +++---
5 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h
index 06c4281..09fc2bc 100644
--- a/arch/sh/include/asm/fpu.h
+++ b/arch/sh/include/asm/fpu.h
@@ -46,7 +46,7 @@ static inline void __unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs)
save_fpu(tsk);
release_fpu(regs);
} else
- tsk->fpu_counter = 0;
+ tsk->thread.fpu_counter = 0;
}

static inline void unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs)
diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h
index e699a12..18e0377 100644
--- a/arch/sh/include/asm/processor_32.h
+++ b/arch/sh/include/asm/processor_32.h
@@ -111,6 +111,16 @@ struct thread_struct {

/* Extended processor state */
union thread_xstate *xstate;
+
+ /*
+ * fpu_counter contains the number of consecutive context switches
+ * that the FPU is used. If this is over a threshold, the lazy fpu
+ * saving becomes unlazy to save the trap. This is an unsigned char
+ * so that after 256 times the counter wraps and the behavior turns
+ * lazy again; this to deal with bursty apps that only use FPU for
+ * a short time
+ */
+ unsigned char fpu_counter;
};

#define INIT_THREAD { \
diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h
index 1cc7d31..eedd4f6 100644
--- a/arch/sh/include/asm/processor_64.h
+++ b/arch/sh/include/asm/processor_64.h
@@ -126,6 +126,16 @@ struct thread_struct {

/* floating point info */
union thread_xstate *xstate;
+
+ /*
+ * fpu_counter contains the number of consecutive context switches
+ * that the FPU is used. If this is over a threshold, the lazy fpu
+ * saving becomes unlazy to save the trap. This is an unsigned char
+ * so that after 256 times the counter wraps and the behavior turns
+ * lazy again; this to deal with bursty apps that only use FPU for
+ * a short time
+ */
+ unsigned char fpu_counter;
};

#define INIT_MMAP \
diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c
index f8f7af5..4e33224 100644
--- a/arch/sh/kernel/cpu/fpu.c
+++ b/arch/sh/kernel/cpu/fpu.c
@@ -44,7 +44,7 @@ void __fpu_state_restore(void)
restore_fpu(tsk);

task_thread_info(tsk)->status |= TS_USEDFPU;
- tsk->fpu_counter++;
+ tsk->thread.fpu_counter++;
}

void fpu_state_restore(struct pt_regs *regs)
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index ebd3933..2885fc9 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
#endif
ti->addr_limit = KERNEL_DS;
ti->status &= ~TS_USEDFPU;
- p->fpu_counter = 0;
+ p->thread.fpu_counter = 0;
return 0;
}
*childregs = *current_pt_regs();
@@ -189,7 +189,7 @@ __switch_to(struct task_struct *prev, struct task_struct *next)
unlazy_fpu(prev, task_pt_regs(prev));

/* we're going to use this soon, after a few expensive things */
- if (next->fpu_counter > 5)
+ if (next->thread.fpu_counter > 5)
prefetch(next_t->xstate);

#ifdef CONFIG_MMU
@@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev, struct task_struct *next)
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
*/
- if (next->fpu_counter > 5)
+ if (next->thread.fpu_counter > 5)
__fpu_state_restore();

return prev;
--
1.8.1.2

2013-10-01 10:33:52

by Vineet Gupta

Subject: Re: [RFC PATCH REBASED 1/3] sh: Move fpu_counter into ARCH specific thread_struct

Hi Paul/SH folks.

Would appreciate your ACK/NAK on this.

Thx,
-Vineet

On 09/17/2013 11:47 AM, Vineet Gupta wrote:
> Only a couple of arches (sh/x86) use fpu_counter in task_struct so it
> can be moved out into ARCH specific thread_struct, reducing the size of
> task_struct for other arches.
>
> Compile tested sh defconfig + sh4-linux-gcc (4.6.3)
>
> Signed-off-by: Vineet Gupta <[email protected]>
> Cc: Paul Mundt <[email protected]>
> Cc: Michel Lespinasse <[email protected]>
> Cc: Kuninori Morimoto <[email protected]>
> Cc: Al Viro <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Jesper Nilsson <[email protected]>
> Cc: Chris Metcalf <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: Ingo Molnar <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: [email protected]
> ---
> arch/sh/include/asm/fpu.h | 2 +-
> arch/sh/include/asm/processor_32.h | 10 ++++++++++
> arch/sh/include/asm/processor_64.h | 10 ++++++++++
> arch/sh/kernel/cpu/fpu.c | 2 +-
> arch/sh/kernel/process_32.c | 6 +++---
> 5 files changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h
> index 06c4281..09fc2bc 100644
> --- a/arch/sh/include/asm/fpu.h
> +++ b/arch/sh/include/asm/fpu.h
> @@ -46,7 +46,7 @@ static inline void __unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs)
> save_fpu(tsk);
> release_fpu(regs);
> } else
> - tsk->fpu_counter = 0;
> + tsk->thread.fpu_counter = 0;
> }
>
> static inline void unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs)
> diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h
> index e699a12..18e0377 100644
> --- a/arch/sh/include/asm/processor_32.h
> +++ b/arch/sh/include/asm/processor_32.h
> @@ -111,6 +111,16 @@ struct thread_struct {
>
> /* Extended processor state */
> union thread_xstate *xstate;
> +
> + /*
> + * fpu_counter contains the number of consecutive context switches
> + * that the FPU is used. If this is over a threshold, the lazy fpu
> + * saving becomes unlazy to save the trap. This is an unsigned char
> + * so that after 256 times the counter wraps and the behavior turns
> + * lazy again; this to deal with bursty apps that only use FPU for
> + * a short time
> + */
> + unsigned char fpu_counter;
> };
>
> #define INIT_THREAD { \
> diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h
> index 1cc7d31..eedd4f6 100644
> --- a/arch/sh/include/asm/processor_64.h
> +++ b/arch/sh/include/asm/processor_64.h
> @@ -126,6 +126,16 @@ struct thread_struct {
>
> /* floating point info */
> union thread_xstate *xstate;
> +
> + /*
> + * fpu_counter contains the number of consecutive context switches
> + * that the FPU is used. If this is over a threshold, the lazy fpu
> + * saving becomes unlazy to save the trap. This is an unsigned char
> + * so that after 256 times the counter wraps and the behavior turns
> + * lazy again; this to deal with bursty apps that only use FPU for
> + * a short time
> + */
> + unsigned char fpu_counter;
> };
>
> #define INIT_MMAP \
> diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c
> index f8f7af5..4e33224 100644
> --- a/arch/sh/kernel/cpu/fpu.c
> +++ b/arch/sh/kernel/cpu/fpu.c
> @@ -44,7 +44,7 @@ void __fpu_state_restore(void)
> restore_fpu(tsk);
>
> task_thread_info(tsk)->status |= TS_USEDFPU;
> - tsk->fpu_counter++;
> + tsk->thread.fpu_counter++;
> }
>
> void fpu_state_restore(struct pt_regs *regs)
> diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
> index ebd3933..2885fc9 100644
> --- a/arch/sh/kernel/process_32.c
> +++ b/arch/sh/kernel/process_32.c
> @@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
> #endif
> ti->addr_limit = KERNEL_DS;
> ti->status &= ~TS_USEDFPU;
> - p->fpu_counter = 0;
> + p->thread.fpu_counter = 0;
> return 0;
> }
> *childregs = *current_pt_regs();
> @@ -189,7 +189,7 @@ __switch_to(struct task_struct *prev, struct task_struct *next)
> unlazy_fpu(prev, task_pt_regs(prev));
>
> /* we're going to use this soon, after a few expensive things */
> - if (next->fpu_counter > 5)
> + if (next->thread.fpu_counter > 5)
> prefetch(next_t->xstate);
>
> #ifdef CONFIG_MMU
> @@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev, struct task_struct *next)
> * restore of the math state immediately to avoid the trap; the
> * chances of needing FPU soon are obviously high now
> */
> - if (next->fpu_counter > 5)
> + if (next->thread.fpu_counter > 5)
> __fpu_state_restore();
>
> return prev;
>