Here are the PDA patches again. I changed the first patch so that the PDA is cacheline
aligned. I dropped the cpunumber patch for now, and added conversions of the TLB state
and IRQ stats to match the 32-bit code. Stats for defconfig:
   text    data    bss     dec    hex filename
7033649 1754476 758508 9546633 91ab89 vmlinux.before
7029643 1754716 758508 9542867 919cd3 vmlinux.after
Patches are against 2.6.28.
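All the conversions follow the same pattern. As a quick sketch (lifted from
patch 3 below), a 64-bit PDA field access such as

    if (read_pda(mmu_state) == TLBSTATE_OK)

becomes the common per-cpu form already used by the 32-bit code:

    if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)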
This patch makes the PDA a normal per-cpu variable, allowing the
removal of the special allocator code. %gs still points to the
base of the PDA.
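In short, the change looks like this (a sketch using the symbols from the
diff below): the boot-time allocated pointer table

    extern struct x8664_pda **_cpu_pda;
    #define cpu_pda(i) (_cpu_pda[i])

is replaced by an ordinary per-cpu variable

    DEFINE_PER_CPU_SHARED_ALIGNED(struct x8664_pda, pda);

so remote-cpu accesses become per_cpu(pda, cpu).field instead of
cpu_pda(cpu)->field, and the special bootmem/kmalloc_node allocator goes
away.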
Tested on a dual-core AMD64 system.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 4 --
arch/x86/include/asm/percpu.h | 3 --
arch/x86/include/asm/setup.h | 1 -
arch/x86/kernel/cpu/common.c | 6 ++--
arch/x86/kernel/dumpstack_64.c | 8 ++--
arch/x86/kernel/head64.c | 23 +------------
arch/x86/kernel/irq.c | 2 +-
arch/x86/kernel/nmi.c | 2 +-
arch/x86/kernel/setup_percpu.c | 70 ++++++++--------------------------------
arch/x86/kernel/smpboot.c | 58 +--------------------------------
arch/x86/xen/enlighten.c | 2 +-
arch/x86/xen/smp.c | 12 +------
12 files changed, 27 insertions(+), 164 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff8..60e8d91 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -23,7 +23,6 @@ struct x8664_pda {
#endif
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
- short in_bootmem; /* pda lives in bootmem */
unsigned int __softirq_pending;
unsigned int __nmi_count; /* number of NMI on this CPUs */
short mmu_state;
@@ -39,11 +38,8 @@ struct x8664_pda {
unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;
-extern struct x8664_pda **_cpu_pda;
extern void pda_init(int);
-#define cpu_pda(i) (_cpu_pda[i])
-
/*
* There is no fast way to get the base address of the PDA, all the accesses
* have to mention %fs/%gs. So it needs to be done this Torvaldian way.
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece7205..6f866fd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -12,11 +12,8 @@
#ifdef CONFIG_SMP
#include <asm/pda.h>
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
#define __my_cpu_offset read_pda(data_offset)
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
#endif
#include <asm-generic/percpu.h>
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f12d372..b751aaf 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,7 +93,6 @@ extern unsigned long init_pg_tables_start;
extern unsigned long init_pg_tables_end;
#else
-void __init x86_64_init_pda(void);
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0..eaf404f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -857,8 +857,8 @@ __setup("clearcpuid=", setup_disablecpuid);
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
@@ -866,7 +866,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
void __cpuinit pda_init(int cpu)
{
- struct x8664_pda *pda = cpu_pda(cpu);
+ struct x8664_pda *pda = &per_cpu(pda, cpu);
/* Setup up data that may be needed in __get_free_pages early */
loadsegment(fs, 0);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7..1098b21 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -163,7 +163,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)per_cpu(pda, cpu).irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
@@ -306,9 +306,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
int i;
const int cpu = smp_processor_id();
unsigned long *irqstack_end =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ (unsigned long *) (per_cpu(pda, cpu).irqstackptr);
unsigned long *irqstack =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ (unsigned long *) (per_cpu(pda, cpu).irqstackptr - IRQSTACKSIZE);
/*
* debugging aid: "show_stack(NULL, NULL);" prints the
@@ -374,7 +374,7 @@ void show_registers(struct pt_regs *regs)
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = per_cpu(pda, cpu).pcurrent;
sp = regs->sp;
printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f..274e2a9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,27 +25,6 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-}
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -111,7 +90,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");
- x86_64_init_pda();
+ pda_init(0);
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc5..066e680 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -38,7 +38,7 @@ void ack_bad_irq(unsigned int irq)
#ifdef CONFIG_X86_32
# define irq_stats(x) (&per_cpu(irq_stat, x))
#else
-# define irq_stats(x) cpu_pda(x)
+# define irq_stats(x) (&per_cpu(pda, x))
#endif
/*
* /proc/interrupts printing:
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2c97f07..4a5bb40 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -64,7 +64,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
#ifdef CONFIG_X86_64
- return cpu_pda(cpu)->__nmi_count;
+ return per_cpu(pda, cpu).__nmi_count;
#else
return nmi_count(cpu);
#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3..fb0ccdc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,58 +80,8 @@ static void __init setup_per_cpu_maps(void)
#endif
}
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
- }
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
-}
-#endif
/*
* Great future plan:
@@ -145,9 +95,6 @@ void __init setup_per_cpu_areas(void)
int cpu;
unsigned long align = 1;
- /* Setup cpu_pda map */
- setup_cpu_pda_map();
-
/* Copy section for each CPU (we discard the original) */
old_size = PERCPU_ENOUGH_ROOM;
align = max_t(unsigned long, PAGE_SIZE, align);
@@ -179,10 +126,21 @@ void __init setup_per_cpu_areas(void)
cpu, node, __pa(ptr));
}
#endif
- per_cpu_offset(cpu) = ptr - __per_cpu_start;
+ __per_cpu_offset[cpu] = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+#ifdef CONFIG_X86_64
+ if (cpu)
+ memset(&per_cpu(pda, cpu), 0, sizeof(struct x8664_pda));
+ per_cpu(pda, cpu).data_offset = __per_cpu_offset[cpu];
+#endif
}
+#ifdef CONFIG_X86_64
+ mb();
+ wrmsrl(MSR_GS_BASE, &per_cpu(pda, 0));
+ mb();
+#endif
+
printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
NR_CPUS, nr_cpu_ids, nr_node_ids);
@@ -229,8 +187,8 @@ void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
+ if (node != NUMA_NO_NODE)
+ per_cpu(pda, cpu).nodenumber = node;
if (cpu_to_node_map)
cpu_to_node_map[cpu] = node;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f71f96f..d25b989 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -753,52 +753,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}
-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- free_bootmem_pda(oldpda);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -816,16 +770,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
};
INIT_WORK(&c_idle.work, do_fork_idle);
-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
-
alternatives_smp_switch(1);
c_idle.idle = get_idle_for_cpu(cpu);
@@ -861,7 +805,7 @@ do_rest:
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
+ per_cpu(pda, cpu).pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5e4686d..0160bb6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1649,7 +1649,7 @@ asmlinkage void __init xen_start_kernel(void)
#ifdef CONFIG_X86_64
/* Disable until direct per-cpu data access. */
have_vcpu_info_placement = 0;
- x86_64_init_pda();
+ pda_init(0);
#endif
xen_smp_init();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index acd9b67..17823cb 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -280,22 +280,12 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;
-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- WARN_ON(cpu == 0);
- if (cpu > 0) {
- rc = get_local_pda(cpu);
- if (rc)
- return rc;
- }
-#endif
-
#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = idle;
+ per_cpu(pda, cpu).pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
#endif
xen_setup_timer(cpu);
--
1.6.1.rc1
Merge the 32-bit and 64-bit versions of the x86_*_percpu() accessors. Unlike
32-bit, the segment base is the current cpu's PDA instead of the offset from
the original per-cpu area. This is because GCC hardcodes the stackprotector
canary at %gs:40. Since the assembler cannot emit a relocation for the
difference of two symbols, the code ends up looking like:
movq $per_cpu__var, reg
subq $per_cpu__pda, reg
movq %gs:(reg), reg
This is still atomic, since the offset is a constant (merely computed at
runtime) and not dependent on the cpu number.
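For illustration, on 64-bit SMP x86_read_percpu(var) now expands to roughly
the following (a sketch based on the __percpu_seg_off() definition in the
diff below):

    typeof(per_cpu__var) ret__;
    typeof(per_cpu__var) *var__ =
            RELOC_HIDE(&per_cpu__var, -(unsigned long)&per_cpu__pda);
    asm("movq %%gs:%1,%0" : "=r" (ret__) : "m" (*var__));

which compiles to the movq/subq/movq sequence above, the final %gs-relative
load being the single atomic step.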
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/percpu.h | 92 +++++++++++++++++-----------------------
1 files changed, 39 insertions(+), 53 deletions(-)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 6f866fd..f704243 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,54 +1,9 @@
#ifndef _ASM_X86_PERCPU_H
#define _ASM_X86_PERCPU_H
-#ifdef CONFIG_X86_64
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
-#include <asm/pda.h>
-
-#define __my_cpu_offset read_pda(data_offset)
-
-#endif
-#include <asm-generic/percpu.h>
-
-DECLARE_PER_CPU(struct x8664_pda, pda);
-
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment. x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime. The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect. However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var) \
- ({ \
- typeof(per_cpu_var(var)) __tmp; \
- preempt_disable(); \
- __tmp = __get_cpu_var(var); \
- preempt_enable(); \
- __tmp; \
- })
-
-#define x86_write_percpu(var, val) \
- do { \
- preempt_disable(); \
- __get_cpu_var(var) = (val); \
- preempt_enable(); \
- } while(0)
-
-#else /* CONFIG_X86_64 */
-
#ifdef __ASSEMBLY__
+#ifdef CONFIG_X86_32
/*
* PER_CPU finds an address of a per-cpu variable.
*
@@ -72,6 +27,8 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
#define PER_CPU_VAR(var) per_cpu__##var
#endif /* SMP */
+#endif /* X86_32 */
+
#else /* ...!ASSEMBLY */
/*
@@ -88,19 +45,37 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
*/
#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_32
+
#define __my_cpu_offset x86_read_percpu(this_cpu_off)
/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
#define __percpu_seg "%%fs:"
+#define __percpu_seg_off(x) (x)
+
+#else
+
+#define __my_cpu_offset read_pda(data_offset)
+
+#define __percpu_seg "%%gs:"
+#define __percpu_seg_off(x) RELOC_HIDE((x), -(unsigned long)&per_cpu__pda)
+
+#endif
#else /* !SMP */
#define __percpu_seg ""
+#define __percpu_seg_off(x) (x)
#endif /* SMP */
#include <asm-generic/percpu.h>
+#ifdef CONFIG_X86_64
+#include <asm/pda.h>
+DECLARE_PER_CPU(struct x8664_pda, pda);
+#endif
+
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
@@ -111,6 +86,7 @@ extern void __bad_percpu_size(void);
#define percpu_to_op(op, var, val) \
do { \
typedef typeof(var) T__; \
+ typeof(var) *var__ = __percpu_seg_off(&var); \
if (0) { \
T__ tmp__; \
tmp__ = (val); \
@@ -118,17 +94,22 @@ do { \
switch (sizeof(var)) { \
case 1: \
asm(op "b %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
case 2: \
asm(op "w %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
case 4: \
asm(op "l %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
+ : "ri" ((T__)val)); \
+ break; \
+ case 8: \
+ asm(op "q %1,"__percpu_seg"%0" \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
default: __bad_percpu_size(); \
@@ -138,21 +119,27 @@ do { \
#define percpu_from_op(op, var) \
({ \
typeof(var) ret__; \
+ typeof(var) *var__ = __percpu_seg_off(&var); \
switch (sizeof(var)) { \
case 1: \
asm(op "b "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
break; \
case 2: \
asm(op "w "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
break; \
case 4: \
asm(op "l "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (*var__)); \
break; \
default: __bad_percpu_size(); \
} \
@@ -165,7 +152,6 @@ do { \
#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */
#ifdef CONFIG_SMP
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/mmu_context_64.h | 14 +++++++-------
arch/x86/include/asm/pda.h | 2 --
arch/x86/include/asm/tlbflush.h | 7 ++-----
arch/x86/kernel/cpu/common.c | 2 --
arch/x86/kernel/tlb_32.c | 12 ++----------
arch/x86/kernel/tlb_64.c | 13 ++++++++-----
arch/x86/xen/mmu.c | 6 +-----
7 files changed, 20 insertions(+), 36 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
index 677d36e..8fb6060 100644
--- a/arch/x86/include/asm/mmu_context_64.h
+++ b/arch/x86/include/asm/mmu_context_64.h
@@ -6,8 +6,8 @@
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
- if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- write_pda(mmu_state, TLBSTATE_OK);
- write_pda(active_mm, next);
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ x86_write_percpu(cpu_tlbstate.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);
load_cr3(next->pgd);
@@ -30,9 +30,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
}
#ifdef CONFIG_SMP
else {
- write_pda(mmu_state, TLBSTATE_OK);
- if (read_pda(active_mm) != next)
- BUG();
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+
if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 97a95fa..bc3b719 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -23,9 +23,7 @@ struct x8664_pda {
#endif
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
- short mmu_state;
short isidle;
- struct mm_struct *active_mm;
} ____cacheline_aligned_in_smp;
extern void pda_init(int);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0e7bbb5..b344098 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,20 +148,17 @@ void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-#ifdef CONFIG_X86_32
struct tlb_state {
struct mm_struct *active_mm;
int state;
- char __cacheline_padding[L1_CACHE_BYTES-8];
};
DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
-void reset_lazy_tlbstate(void);
-#else
static inline void reset_lazy_tlbstate(void)
{
+ x86_write_percpu(cpu_tlbstate.state, 0);
+ x86_write_percpu(cpu_tlbstate.active_mm, &init_mm);
}
-#endif
#endif /* SMP */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index eaf404f..7bdf87f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -880,8 +880,6 @@ void __cpuinit pda_init(int cpu)
pda->irqcount = -1;
pda->kernelstack = (unsigned long)stack_thread_info() -
PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
if (cpu == 0) {
/* others are initialized in smpboot.c */
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index f4049f3..4c151a9 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -4,8 +4,8 @@
#include <asm/tlbflush.h>
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
- ____cacheline_aligned = { &init_mm, 0, };
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
/* must come after the send_IPI functions above for inlining */
#include <mach_ipi.h>
@@ -247,11 +247,3 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
-void reset_lazy_tlbstate(void)
-{
- int cpu = raw_smp_processor_id();
-
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index d4c9a00..938549c 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -18,6 +18,9 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
+
#include <mach_ipi.h>
/*
* Smarter SMP flushing macros.
@@ -62,9 +65,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
*/
void leave_mm(int cpu)
{
- if (read_pda(mmu_state) == TLBSTATE_OK)
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+ cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -142,8 +145,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
* BUG();
*/
- if (f->flush_mm == read_pda(active_mm)) {
- if (read_pda(mmu_state) == TLBSTATE_OK) {
+ if (f->flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -274,7 +277,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
- if (read_pda(mmu_state) == TLBSTATE_LAZY)
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 636ef4c..49bdd72 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1060,11 +1060,7 @@ static void drop_other_mm_ref(void *info)
struct mm_struct *mm = info;
struct mm_struct *active_mm;
-#ifdef CONFIG_X86_64
- active_mm = read_pda(active_mm);
-#else
- active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
+ active_mm = x86_read_percpu(cpu_tlbstate.active_mm);
if (active_mm == mm)
leave_mm(smp_processor_id());
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/hardirq_64.h | 22 ++++++++++++++++++----
arch/x86/include/asm/pda.h | 10 ----------
arch/x86/kernel/apic.c | 8 ++------
arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +-
arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +-
arch/x86/kernel/irq.c | 6 +-----
arch/x86/kernel/irq_64.c | 3 +++
arch/x86/kernel/nmi.c | 10 +---------
arch/x86/kernel/smp.c | 18 +++---------------
arch/x86/kernel/time_64.c | 2 +-
arch/x86/kernel/tlb_64.c | 2 +-
arch/x86/kernel/traps.c | 6 +-----
arch/x86/xen/smp.c | 18 +++---------------
13 files changed, 36 insertions(+), 73 deletions(-)
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
index 1ba381f..4e003b5 100644
--- a/arch/x86/include/asm/hardirq_64.h
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -3,20 +3,34 @@
#include <linux/threads.h>
#include <linux/irq.h>
-#include <asm/pda.h>
#include <asm/apic.h>
+typedef struct {
+ unsigned int __softirq_pending;
+ unsigned int __nmi_count; /* arch dependent */
+ unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int irq0_irqs;
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+ unsigned int irq_tlb_count;
+ unsigned int irq_thermal_count;
+ unsigned int irq_spurious_count;
+ unsigned int irq_threshold_count;
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+
/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
#define MAX_HARDIRQS_PER_CPU NR_VECTORS
#define __ARCH_IRQ_STAT 1
-#define local_softirq_pending() read_pda(__softirq_pending)
+#define local_softirq_pending() x86_read_percpu(irq_stat.__softirq_pending)
#define __ARCH_SET_SOFTIRQ_PENDING 1
-#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
-#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
+#define set_softirq_pending(x) x86_write_percpu(irq_stat.__softirq_pending, (x))
+#define or_softirq_pending(x) x86_or_percpu(irq_stat.__softirq_pending, (x))
extern void ack_bad_irq(unsigned int irq);
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 60e8d91..97a95fa 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -23,19 +23,9 @@ struct x8664_pda {
#endif
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
- unsigned int __softirq_pending;
- unsigned int __nmi_count; /* number of NMI on this CPUs */
short mmu_state;
short isidle;
struct mm_struct *active_mm;
- unsigned apic_timer_irqs;
- unsigned irq0_irqs;
- unsigned irq_resched_count;
- unsigned irq_call_count;
- unsigned irq_tlb_count;
- unsigned irq_thermal_count;
- unsigned irq_threshold_count;
- unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;
extern void pda_init(int);
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f9487..088ecd6 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -783,11 +783,7 @@ static void local_apic_timer_interrupt(void)
/*
* the NMI deadlock-detector uses this.
*/
-#ifdef CONFIG_X86_64
- add_pda(apic_timer_irqs, 1);
-#else
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-#endif
+ x86_add_percpu(irq_stat.apic_timer_irqs, 1);
evt->event_handler(evt);
}
@@ -1696,7 +1692,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
ack_APIC_irq();
#ifdef CONFIG_X86_64
- add_pda(irq_spurious_count, 1);
+ x86_add_percpu(irq_stat.irq_spurious_count, 1);
#else
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 5eb390a..e79e939 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -237,7 +237,7 @@ asmlinkage void mce_threshold_interrupt(void)
}
}
out:
- add_pda(irq_threshold_count, 1);
+ x86_add_percpu(irq_stat.irq_threshold_count, 1);
irq_exit();
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index c17eaf5..96d6ba0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void)
if (therm_throt_process(msr_val & 1))
mce_log_therm_throt_event(smp_processor_id(), msr_val);
- add_pda(irq_thermal_count, 1);
+ x86_add_percpu(irq_stat.irq_thermal_count, 1);
irq_exit();
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 066e680..efeca70 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -35,11 +35,7 @@ void ack_bad_irq(unsigned int irq)
#endif
}
-#ifdef CONFIG_X86_32
-# define irq_stats(x) (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x) (&per_cpu(pda, x))
-#endif
+#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
* /proc/interrupts printing:
*/
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84e..ac54951 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,9 @@
#include <asm/idle.h>
#include <asm/smp.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
* Probabilistic stack overflow check:
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4a5bb40..bbbaffd 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -63,11 +63,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
-#ifdef CONFIG_X86_64
- return per_cpu(pda, cpu).__nmi_count;
-#else
- return nmi_count(cpu);
-#endif
+ return per_cpu(irq_stat, cpu).__nmi_count;
}
static inline int mce_in_progress(void)
@@ -84,12 +80,8 @@ static inline int mce_in_progress(void)
*/
static inline unsigned int get_timer_irqs(int cpu)
{
-#ifdef CONFIG_X86_64
- return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
-#else
return per_cpu(irq_stat, cpu).apic_timer_irqs +
per_cpu(irq_stat, cpu).irq0_irqs;
-#endif
}
#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19..863ef11 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -178,11 +178,7 @@ static void native_smp_send_stop(void)
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ x86_add_percpu(irq_stat.irq_resched_count, 1);
}
void smp_call_function_interrupt(struct pt_regs *regs)
@@ -190,11 +186,7 @@ void smp_call_function_interrupt(struct pt_regs *regs)
ack_APIC_irq();
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ x86_add_percpu(irq_stat.irq_call_count, 1);
irq_exit();
}
@@ -203,11 +195,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
ack_APIC_irq();
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ x86_add_percpu(irq_stat.irq_call_count, 1);
irq_exit();
}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index cb19d65..48391cf 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(profile_pc);
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
- add_pda(irq0_irqs, 1);
+ x86_add_percpu(irq_stat.irq0_irqs, 1);
global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 8f919ca..d4c9a00 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
out:
ack_APIC_irq();
cpu_clear(cpu, f->flush_cpumask);
- add_pda(irq_tlb_count, 1);
+ x86_add_percpu(irq_stat.irq_tlb_count, 1);
}
void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 04d242a..281a2ab 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -481,11 +481,7 @@ do_nmi(struct pt_regs *regs, long error_code)
{
nmi_enter();
-#ifdef CONFIG_X86_32
- { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
-#else
- add_pda(__nmi_count, 1);
-#endif
+ x86_add_percpu(irq_stat.__nmi_count, 1);
if (!ignore_nmis)
default_do_nmi(regs);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 17823cb..123f621 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ x86_add_percpu(irq_stat.irq_resched_count, 1);
return IRQ_HANDLED;
}
@@ -432,11 +428,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ x86_add_percpu(irq_stat.irq_call_count, 1);
irq_exit();
return IRQ_HANDLED;
@@ -446,11 +438,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ x86_add_percpu(irq_stat.irq_call_count, 1);
irq_exit();
return IRQ_HANDLED;
--
1.6.1.rc1
* Brian Gerst <[email protected]> wrote:
> Here are the PDA patches again. I changed the first patch so that the
> PDA is cacheline aligned. I dropped the cpunumber patch for now, and
> added conversions of the TLB state and IRQ stats to match the 32-bit
> code. Stats for defconfig:
>
>    text    data    bss     dec    hex filename
> 7033649 1754476 758508 9546633 91ab89 vmlinux.before
> 7029643 1754716 758508 9542867 919cd3 vmlinux.after
>
> Patches are against 2.6.28.
looks pretty good!
Ingo
Brian Gerst wrote:
> Here are the PDA patches again. I changed the first patch so that the PDA is cacheline
> aligned. I dropped the cpunumber patch for now, and added conversions of the TLB state
> and IRQ stats to match the 32-bit code. Stats for defconfig:
>
>    text    data    bss     dec    hex filename
> 7033649 1754476 758508 9546633 91ab89 vmlinux.before
> 7029643 1754716 758508 9542867 919cd3 vmlinux.after
>
> Patches are against 2.6.28.
Hi there,
I just tried to apply your patchset, but it fails rather dramatically on
patch 3. Could you refresh it against tip:master or current Linus git?
Thanks,
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.