2009-01-03 04:22:22

by Brian Gerst

Subject: PDA changes (take 3)

On Fri, Jan 2, 2009 at 3:11 PM, H. Peter Anvin <[email protected]> wrote:
> Brian Gerst wrote:
>> Here are the PDA patches again. I changed the first patch so that the PDA is cacheline
>> aligned. I dropped the cpunumber patch for now, and added conversions of the TLB state
>> and IRQ stats to match the 32-bit code. Stats for defconfig:
>>
>>    text    data    bss     dec    hex filename
>> 7033649 1754476 758508 9546633 91ab89 vmlinux.before
>> 7029643 1754716 758508 9542867 919cd3 vmlinux.after
>>
>> Patches are against 2.6.28.
>
> Hi there,
>
> I just tried to apply your patchset, but it fails rather dramatically on
> patch 3. Could you refresh it against tip:master or current Linus git?

The breakage was a conflict with commit 915b0d0104b72fd36af088ba4b11b5690bc96a6c
(x86: hardirq: introduce inc_irq_stat()). I'm resending the series
rebased against current Linus git.

--
Brian Gerst


2009-01-03 04:23:45

by Brian Gerst

Subject: [PATCH 1/4] x86-64: Convert the PDA to percpu.

This patch makes the PDA a normal per-cpu variable, allowing the
removal of the special allocator code. %gs still points to the
base of the PDA.
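
As the hunks below illustrate, the access pattern changes from the old
pointer table to the standard per-cpu accessors. A minimal sketch, using
the names from this patch:

/* before: dynamically allocated, reached via the pointer table */
struct x8664_pda *pda = cpu_pda(cpu);	/* i.e. _cpu_pda[cpu] */

/* after: an ordinary per-cpu variable, no special allocator */
DEFINE_PER_CPU_SHARED_ALIGNED(struct x8664_pda, pda);
struct x8664_pda *pda = &per_cpu(pda, cpu);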

Tested on a dual-core AMD64 system.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 4 --
arch/x86/include/asm/percpu.h | 3 --
arch/x86/include/asm/setup.h | 1 -
arch/x86/kernel/cpu/common.c | 6 ++--
arch/x86/kernel/dumpstack_64.c | 8 ++--
arch/x86/kernel/head64.c | 23 +------------
arch/x86/kernel/irq.c | 2 +-
arch/x86/kernel/nmi.c | 2 +-
arch/x86/kernel/setup_percpu.c | 70 ++++++++--------------------------------
arch/x86/kernel/smpboot.c | 58 +--------------------------------
arch/x86/xen/enlighten.c | 2 +-
arch/x86/xen/smp.c | 12 +------
12 files changed, 27 insertions(+), 164 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff8..60e8d91 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -23,7 +23,6 @@ struct x8664_pda {
#endif
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
- short in_bootmem; /* pda lives in bootmem */
unsigned int __softirq_pending;
unsigned int __nmi_count; /* number of NMI on this CPUs */
short mmu_state;
@@ -39,11 +38,8 @@ struct x8664_pda {
unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;

-extern struct x8664_pda **_cpu_pda;
extern void pda_init(int);

-#define cpu_pda(i) (_cpu_pda[i])
-
/*
* There is no fast way to get the base address of the PDA, all the accesses
* have to mention %fs/%gs. So it needs to be done this Torvaldian way.
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece7205..6f866fd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -12,11 +12,8 @@
#ifdef CONFIG_SMP
#include <asm/pda.h>

-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
#define __my_cpu_offset read_pda(data_offset)

-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
#endif
#include <asm-generic/percpu.h>

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 4fcd53f..2f3e50e 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,7 +100,6 @@ extern unsigned long init_pg_tables_start;
extern unsigned long init_pg_tables_end;

#else
-void __init x86_64_init_pda(void);
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 42e0853..d039178 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -859,8 +859,8 @@ __setup("clearcpuid=", setup_disablecpuid);
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);

struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };

@@ -868,7 +868,7 @@ static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;

void __cpuinit pda_init(int cpu)
{
- struct x8664_pda *pda = cpu_pda(cpu);
+ struct x8664_pda *pda = &per_cpu(pda, cpu);

/* Setup up data that may be needed in __get_free_pages early */
loadsegment(fs, 0);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d07..23e13e7 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)per_cpu(pda, cpu).irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
int graph = 0;
@@ -200,9 +200,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
int i;
const int cpu = smp_processor_id();
unsigned long *irqstack_end =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ (unsigned long *) (per_cpu(pda, cpu).irqstackptr);
unsigned long *irqstack =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ (unsigned long *) (per_cpu(pda, cpu).irqstackptr - IRQSTACKSIZE);

/*
* debugging aid: "show_stack(NULL, NULL);" prints the
@@ -241,7 +241,7 @@ void show_registers(struct pt_regs *regs)
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = per_cpu(pda, cpu).pcurrent;

sp = regs->sp;
printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 388e05a..af67d32 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>

-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-}
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,7 +91,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");

- x86_64_init_pda();
+ pda_init(0);

x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index bce53e1..90f87fd 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -39,7 +39,7 @@ void ack_bad_irq(unsigned int irq)
#ifdef CONFIG_X86_32
# define irq_stats(x) (&per_cpu(irq_stat, x))
#else
-# define irq_stats(x) cpu_pda(x)
+# define irq_stats(x) (&per_cpu(pda, x))
#endif
/*
* /proc/interrupts printing:
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 8bd1bf9..235672f 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -64,7 +64,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
#ifdef CONFIG_X86_64
- return cpu_pda(cpu)->__nmi_count;
+ return per_cpu(pda, cpu).__nmi_count;
#else
return nmi_count(cpu);
#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0b63b08..f27e7e7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,58 +80,8 @@ static void __init setup_per_cpu_maps(void)
#endif
}

-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
- }
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
-}
-#endif

/*
* Great future plan:
@@ -145,9 +95,6 @@ void __init setup_per_cpu_areas(void)
int cpu;
unsigned long align = 1;

- /* Setup cpu_pda map */
- setup_cpu_pda_map();
-
/* Copy section for each CPU (we discard the original) */
old_size = PERCPU_ENOUGH_ROOM;
align = max_t(unsigned long, PAGE_SIZE, align);
@@ -187,10 +134,21 @@ void __init setup_per_cpu_areas(void)
cpu, node, __pa(ptr));
}
#endif
- per_cpu_offset(cpu) = ptr - __per_cpu_start;
+ __per_cpu_offset[cpu] = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+#ifdef CONFIG_X86_64
+ if (cpu)
+ memset(&per_cpu(pda, cpu), 0, sizeof(struct x8664_pda));
+ per_cpu(pda, cpu).data_offset = __per_cpu_offset[cpu];
+#endif
}

+#ifdef CONFIG_X86_64
+ mb();
+ wrmsrl(MSR_GS_BASE, &per_cpu(pda, 0));
+ mb();
+#endif
+
/* Setup percpu data maps */
setup_per_cpu_maps();

@@ -234,8 +192,8 @@ void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
+ if (node != NUMA_NO_NODE)
+ per_cpu(pda, cpu).nodenumber = node;

if (cpu_to_node_map)
cpu_to_node_map[cpu] = node;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 31869bf..e50fea9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -744,52 +744,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}

-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- free_bootmem_pda(oldpda);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -807,16 +761,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
};
INIT_WORK(&c_idle.work, do_fork_idle);

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
-
alternatives_smp_switch(1);

c_idle.idle = get_idle_for_cpu(cpu);
@@ -852,7 +796,7 @@ do_rest:
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
+ per_cpu(pda, cpu).pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bea2152..76e092d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1652,7 +1652,7 @@ asmlinkage void __init xen_start_kernel(void)
#ifdef CONFIG_X86_64
/* Disable until direct per-cpu data access. */
have_vcpu_info_placement = 0;
- x86_64_init_pda();
+ pda_init(0);
#endif

xen_smp_init();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e206..0d8d19e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -283,22 +283,12 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- WARN_ON(cpu == 0);
- if (cpu > 0) {
- rc = get_local_pda(cpu);
- if (rc)
- return rc;
- }
-#endif
-
#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = idle;
+ per_cpu(pda, cpu).pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
#endif
xen_setup_timer(cpu);
--
1.6.1.rc1

2009-01-03 04:23:59

by Brian Gerst

Subject: [PATCH 2/4] x86-64: Unify x86_*_percpu() functions.

Merge the 32-bit and 64-bit versions of these functions. Unlike on 32-bit,
where the segment base is the cpu's offset from the original per-cpu area,
on 64-bit the segment base is the address of the current cpu's PDA. This
is because GCC hardcodes the stack-protector canary at %gs:40. Since the
assembler cannot emit a relocation against the difference of two symbols,
the code ends up looking like:

movq $per_cpu__var, reg
subq $per_cpu__pda, reg
movq %gs:(reg), reg

This is still atomic, since the offset is a constant (merely calculated at
runtime instead of by the linker) and does not depend on the cpu number.
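
For a concrete picture, a rough sketch of what x86_read_percpu() expands
to for a 4-byte variable under this scheme (assembled from the macros in
the diff below; per_cpu__var stands for an arbitrary per-cpu variable):

unsigned int ret__;
/* RELOC_HIDE() keeps the compiler from folding the subtraction,
   so the offset is computed at runtime as shown above */
unsigned int *var__ = RELOC_HIDE(&per_cpu__var,
				 -(unsigned long)&per_cpu__pda);
/* one %gs-relative load: atomic wrt preemption and interrupts */
asm("movl %%gs:%1,%0" : "=r" (ret__) : "m" (*var__));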

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/percpu.h | 92 +++++++++++++++++-----------------------
1 files changed, 39 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 6f866fd..f704243 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,54 +1,9 @@
#ifndef _ASM_X86_PERCPU_H
#define _ASM_X86_PERCPU_H

-#ifdef CONFIG_X86_64
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
-#include <asm/pda.h>
-
-#define __my_cpu_offset read_pda(data_offset)
-
-#endif
-#include <asm-generic/percpu.h>
-
-DECLARE_PER_CPU(struct x8664_pda, pda);
-
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment. x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime. The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect. However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var) \
- ({ \
- typeof(per_cpu_var(var)) __tmp; \
- preempt_disable(); \
- __tmp = __get_cpu_var(var); \
- preempt_enable(); \
- __tmp; \
- })
-
-#define x86_write_percpu(var, val) \
- do { \
- preempt_disable(); \
- __get_cpu_var(var) = (val); \
- preempt_enable(); \
- } while(0)
-
-#else /* CONFIG_X86_64 */
-
#ifdef __ASSEMBLY__

+#ifdef CONFIG_X86_32
/*
* PER_CPU finds an address of a per-cpu variable.
*
@@ -72,6 +27,8 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
#define PER_CPU_VAR(var) per_cpu__##var
#endif /* SMP */

+#endif /* X86_32 */
+
#else /* ...!ASSEMBLY */

/*
@@ -88,19 +45,37 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
*/
#ifdef CONFIG_SMP

+#ifdef CONFIG_X86_32
+
#define __my_cpu_offset x86_read_percpu(this_cpu_off)

/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
#define __percpu_seg "%%fs:"
+#define __percpu_seg_off(x) (x)
+
+#else
+
+#define __my_cpu_offset read_pda(data_offset)
+
+#define __percpu_seg "%%gs:"
+#define __percpu_seg_off(x) RELOC_HIDE((x), -(unsigned long)&per_cpu__pda)
+
+#endif

#else /* !SMP */

#define __percpu_seg ""
+#define __percpu_seg_off(x) (x)

#endif /* SMP */

#include <asm-generic/percpu.h>

+#ifdef CONFIG_X86_64
+#include <asm/pda.h>
+DECLARE_PER_CPU(struct x8664_pda, pda);
+#endif
+
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);

@@ -111,6 +86,7 @@ extern void __bad_percpu_size(void);
#define percpu_to_op(op, var, val) \
do { \
typedef typeof(var) T__; \
+ typeof(var) *var__ = __percpu_seg_off(&var); \
if (0) { \
T__ tmp__; \
tmp__ = (val); \
@@ -118,17 +94,22 @@ do { \
switch (sizeof(var)) { \
case 1: \
asm(op "b %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
case 2: \
asm(op "w %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
case 4: \
asm(op "l %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
+ : "ri" ((T__)val)); \
+ break; \
+ case 8: \
+ asm(op "q %1,"__percpu_seg"%0" \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
default: __bad_percpu_size(); \
@@ -138,21 +119,27 @@ do { \
#define percpu_from_op(op, var) \
({ \
typeof(var) ret__; \
+ typeof(var) *var__ = __percpu_seg_off(&var); \
switch (sizeof(var)) { \
case 1: \
asm(op "b "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
break; \
case 2: \
asm(op "w "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
break; \
case 4: \
asm(op "l "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (*var__)); \
break; \
default: __bad_percpu_size(); \
} \
@@ -165,7 +152,6 @@ do { \
#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */

#ifdef CONFIG_SMP

--
1.6.1.rc1

2009-01-03 04:24:24

by Brian Gerst

Subject: [PATCH 3/4] x86-64: Move irq stats from PDA to per-cpu and consolidate with 32-bit.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/hardirq_64.h | 24 +++++++++++++++++++-----
arch/x86/include/asm/pda.h | 10 ----------
arch/x86/kernel/irq.c | 6 +-----
arch/x86/kernel/irq_64.c | 3 +++
arch/x86/kernel/nmi.c | 10 +---------
arch/x86/xen/smp.c | 18 +++---------------
6 files changed, 27 insertions(+), 44 deletions(-)

diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
index b5a6b5d..213df9a 100644
--- a/arch/x86/include/asm/hardirq_64.h
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -3,22 +3,36 @@

#include <linux/threads.h>
#include <linux/irq.h>
-#include <asm/pda.h>
#include <asm/apic.h>

+typedef struct {
+ unsigned int __softirq_pending;
+ unsigned int __nmi_count; /* arch dependent */
+ unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int irq0_irqs;
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+ unsigned int irq_tlb_count;
+ unsigned int irq_thermal_count;
+ unsigned int irq_spurious_count;
+ unsigned int irq_threshold_count;
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+
/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
#define MAX_HARDIRQS_PER_CPU NR_VECTORS

#define __ARCH_IRQ_STAT 1

-#define inc_irq_stat(member) add_pda(member, 1)
+#define inc_irq_stat(member) x86_add_percpu(irq_stat.member, 1)

-#define local_softirq_pending() read_pda(__softirq_pending)
+#define local_softirq_pending() x86_read_percpu(irq_stat.__softirq_pending)

#define __ARCH_SET_SOFTIRQ_PENDING 1

-#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
-#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
+#define set_softirq_pending(x) x86_write_percpu(irq_stat.__softirq_pending, (x))
+#define or_softirq_pending(x) x86_or_percpu(irq_stat.__softirq_pending, (x))

extern void ack_bad_irq(unsigned int irq);

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 60e8d91..97a95fa 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -23,19 +23,9 @@ struct x8664_pda {
#endif
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
- unsigned int __softirq_pending;
- unsigned int __nmi_count; /* number of NMI on this CPUs */
short mmu_state;
short isidle;
struct mm_struct *active_mm;
- unsigned apic_timer_irqs;
- unsigned irq0_irqs;
- unsigned irq_resched_count;
- unsigned irq_call_count;
- unsigned irq_tlb_count;
- unsigned irq_thermal_count;
- unsigned irq_threshold_count;
- unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;

extern void pda_init(int);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 90f87fd..4be7ebf 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -36,11 +36,7 @@ void ack_bad_irq(unsigned int irq)
#endif
}

-#ifdef CONFIG_X86_32
-# define irq_stats(x) (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x) (&per_cpu(pda, x))
-#endif
+#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
* /proc/interrupts printing:
*/
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6383d50..b98fd64 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -19,6 +19,9 @@
#include <asm/idle.h>
#include <asm/smp.h>

+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
/*
* Probabilistic stack overflow check:
*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 235672f..1872967 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -63,11 +63,7 @@ static int endflag __initdata;

static inline unsigned int get_nmi_count(int cpu)
{
-#ifdef CONFIG_X86_64
- return per_cpu(pda, cpu).__nmi_count;
-#else
- return nmi_count(cpu);
-#endif
+ return per_cpu(irq_stat, cpu).__nmi_count;
}

static inline int mce_in_progress(void)
@@ -84,12 +80,8 @@ static inline int mce_in_progress(void)
*/
static inline unsigned int get_timer_irqs(int cpu)
{
-#ifdef CONFIG_X86_64
- return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
-#else
return per_cpu(irq_stat, cpu).apic_timer_irqs +
per_cpu(irq_stat, cpu).irq0_irqs;
-#endif
}

#ifdef CONFIG_SMP
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 0d8d19e..9d3865d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);

return IRQ_HANDLED;
}
@@ -435,11 +431,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();

return IRQ_HANDLED;
@@ -449,11 +441,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();

return IRQ_HANDLED;
--
1.6.1.rc1

2009-01-03 04:24:41

by Brian Gerst

Subject: [PATCH 4/4] x86-64: Move TLB state from PDA to per-cpu and consolidate with 32-bit.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/mmu_context_64.h | 14 +++++++-------
arch/x86/include/asm/pda.h | 2 --
arch/x86/include/asm/tlbflush.h | 7 ++-----
arch/x86/kernel/cpu/common.c | 2 --
arch/x86/kernel/tlb_32.c | 12 ++----------
arch/x86/kernel/tlb_64.c | 13 ++++++++-----
arch/x86/xen/mmu.c | 6 +-----
7 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
index 677d36e..8fb6060 100644
--- a/arch/x86/include/asm/mmu_context_64.h
+++ b/arch/x86/include/asm/mmu_context_64.h
@@ -6,8 +6,8 @@
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
- if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}

@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- write_pda(mmu_state, TLBSTATE_OK);
- write_pda(active_mm, next);
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ x86_write_percpu(cpu_tlbstate.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);
load_cr3(next->pgd);
@@ -30,9 +30,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
}
#ifdef CONFIG_SMP
else {
- write_pda(mmu_state, TLBSTATE_OK);
- if (read_pda(active_mm) != next)
- BUG();
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+
if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 97a95fa..bc3b719 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -23,9 +23,7 @@ struct x8664_pda {
#endif
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
- short mmu_state;
short isidle;
- struct mm_struct *active_mm;
} ____cacheline_aligned_in_smp;

extern void pda_init(int);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0e7bbb5..b344098 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,20 +148,17 @@ void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2

-#ifdef CONFIG_X86_32
struct tlb_state {
struct mm_struct *active_mm;
int state;
- char __cacheline_padding[L1_CACHE_BYTES-8];
};
DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);

-void reset_lazy_tlbstate(void);
-#else
static inline void reset_lazy_tlbstate(void)
{
+ x86_write_percpu(cpu_tlbstate.state, 0);
+ x86_write_percpu(cpu_tlbstate.active_mm, &init_mm);
}
-#endif

#endif /* SMP */

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d039178..2a696d1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -882,8 +882,6 @@ void __cpuinit pda_init(int cpu)
pda->irqcount = -1;
pda->kernelstack = (unsigned long)stack_thread_info() -
PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;

if (cpu == 0) {
/* others are initialized in smpboot.c */
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index ce50546..15833ae 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -4,8 +4,8 @@

#include <asm/tlbflush.h>

-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
- ____cacheline_aligned = { &init_mm, 0, };
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };

/* must come after the send_IPI functions above for inlining */
#include <mach_ipi.h>
@@ -246,11 +246,3 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}

-void reset_lazy_tlbstate(void)
-{
- int cpu = raw_smp_processor_id();
-
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index f8be6f1..3bcb78d 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -18,6 +18,9 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>

+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
+
#include <mach_ipi.h>
/*
* Smarter SMP flushing macros.
@@ -62,9 +65,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
*/
void leave_mm(int cpu)
{
- if (read_pda(mmu_state) == TLBSTATE_OK)
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+ cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -142,8 +145,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
* BUG();
*/

- if (f->flush_mm == read_pda(active_mm)) {
- if (read_pda(mmu_state) == TLBSTATE_OK) {
+ if (f->flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -274,7 +277,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();

__flush_tlb_all();
- if (read_pda(mmu_state) == TLBSTATE_LAZY)
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240..0d9ed77 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1063,11 +1063,7 @@ static void drop_other_mm_ref(void *info)
struct mm_struct *mm = info;
struct mm_struct *active_mm;

-#ifdef CONFIG_X86_64
- active_mm = read_pda(active_mm);
-#else
- active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
+ active_mm = x86_read_percpu(cpu_tlbstate.active_mm);

if (active_mm == mm)
leave_mm(smp_processor_id());
--
1.6.1.rc1