Subject: [gcv v3 27/35] arm: Replace __get_cpu_var uses

__get_cpu_var() is used for multiple purposes in the kernel source. One of them is
address calculation via the form &__get_cpu_var(x). This calculates the address for
the instance of the percpu variable of the current processor based on an offset.

Other use cases are for storing and retrieving data from the current processor's percpu area.
__get_cpu_var() can be used as an lvalue when writing data or on the right side of an assignment.

__get_cpu_var() is defined as :


#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))



__get_cpu_var() always only does an address determination. However, store and retrieve operations
could use a segment prefix (or global register on other platforms) to avoid the address calculation.

this_cpu_write() and this_cpu_read() can directly take an offset into a percpu area and use
optimized assembly code to read and write per cpu variables.


This patch converts __get_cpu_var into either an explicit address calculation using this_cpu_ptr()
or into a use of this_cpu operations that use the offset. Thereby address calculations are avoided
and fewer registers are used when code is generated.

At the end of the patchset all uses of __get_cpu_var have been removed so the macro is removed too.

The patchset includes passes over all arches as well. Once these operations are used throughout then
specialized macros can be defined in non -x86 arches as well in order to optimize per cpu access by
f.e. using a global register that may be set to the per cpu base.




Transformations done to __get_cpu_var()


1. Determine the address of the percpu instance of the current processor.

DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);

Converts to

int *x = this_cpu_ptr(&y);


2. Same as #1 but this time an array structure is involved.

DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);

Converts to

int *x = this_cpu_ptr(y);


3. Retrieve the content of the current processors instance of a per cpu variable.

DEFINE_PER_CPU(int, u);
int x = __get_cpu_var(y)

Converts to

int x = __this_cpu_read(y);


4. Retrieve the content of a percpu struct

DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);

Converts to

memcpy(this_cpu_ptr(&x), y, sizeof(x));


5. Assignment to a per cpu variable

DEFINE_PER_CPU(int, y);
__get_cpu_var(y) = x;

Converts to

this_cpu_write(y, x);


6. Increment/Decrement etc of a per cpu variable

DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++

Converts to

this_cpu_inc(y)



Acked-by: Catalin Marinas <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

Index: linux/arch/arm/kernel/hw_breakpoint.c
===================================================================
--- linux.orig/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.952795024 -0500
@@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct pe
/* Breakpoint */
ctrl_base = ARM_BASE_BCR;
val_base = ARM_BASE_BVR;
- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
ctrl_base = ARM_BASE_WCR;
val_base = ARM_BASE_WVR;
- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
/* Breakpoint */
base = ARM_BASE_BCR;
- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
base = ARM_BASE_WCR;
- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);

for (i = 0; i < core_num_wrps; ++i) {
rcu_read_lock();
@@ -768,7 +768,7 @@ static void watchpoint_single_step_handl
struct perf_event *wp, **slots;
struct arch_hw_breakpoint *info;

- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);

for (i = 0; i < core_num_wrps; ++i) {
rcu_read_lock();
@@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);

/* The exception entry code places the amended lr in the PC. */
addr = regs->ARM_pc;
Index: linux/arch/arm/kernel/kprobes.c
===================================================================
--- linux.orig/arch/arm/kernel/kprobes.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kernel/kprobes.c 2013-08-26 13:48:40.952795024 -0500
@@ -171,13 +171,13 @@ static void __kprobes save_previous_kpro

static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
}

static void __kprobes set_current_kprobe(struct kprobe *p)
{
- __get_cpu_var(current_kprobe) = p;
+ __this_cpu_write(current_kprobe, p);
}

static void __kprobes
@@ -421,10 +421,10 @@ static __used __kprobes void *trampoline
continue;

if (ri->rp && ri->rp->handler) {
- __get_cpu_var(current_kprobe) = &ri->rp->kp;
+ __this_cpu_write(current_kprobe, &ri->rp->kp);
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
ri->rp->handler(ri, regs);
- __get_cpu_var(current_kprobe) = NULL;
+ __this_cpu_write(current_kprobe, NULL);
}

orig_ret_address = (unsigned long)ri->ret_addr;
Index: linux/arch/arm/kernel/perf_event_cpu.c
===================================================================
--- linux.orig/arch/arm/kernel/perf_event_cpu.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kernel/perf_event_cpu.c 2013-08-26 13:48:40.952795024 -0500
@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(perf_num_counters);

static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
{
- return &__get_cpu_var(cpu_hw_events);
+ return this_cpu_ptr(&cpu_hw_events);
}

static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
Index: linux/arch/arm/kvm/arm.c
===================================================================
--- linux.orig/arch/arm/kvm/arm.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kvm/arm.c 2013-08-26 13:48:40.952795024 -0500
@@ -65,7 +65,7 @@ static bool vgic_present;
static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
{
BUG_ON(preemptible());
- __get_cpu_var(kvm_arm_running_vcpu) = vcpu;
+ __this_cpu_write(kvm_arm_running_vcpu, vcpu);
}

/**
@@ -75,7 +75,7 @@ static void kvm_arm_set_running_vcpu(str
struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
{
BUG_ON(preemptible());
- return __get_cpu_var(kvm_arm_running_vcpu);
+ return __this_cpu_read(kvm_arm_running_vcpu);
}

/**
@@ -811,7 +811,7 @@ static void cpu_init_hyp_mode(void *dumm

boot_pgd_ptr = kvm_mmu_get_boot_httbr();
pgd_ptr = kvm_mmu_get_httbr();
- stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
+ stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
hyp_stack_ptr = stack_page + PAGE_SIZE;
vector_ptr = (unsigned long)__kvm_hyp_vector;

Index: linux/arch/arm64/kernel/debug-monitors.c
===================================================================
--- linux.orig/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.952795024 -0500
@@ -98,11 +98,11 @@ void enable_debug_monitors(enum debug_el

WARN_ON(preemptible());

- if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
+ if (this_cpu_inc_return(mde_ref_count) == 1)
enable = DBG_MDSCR_MDE;

if (el == DBG_ACTIVE_EL1 &&
- local_inc_return(&__get_cpu_var(kde_ref_count)) == 1)
+ this_cpu_inc_return(kde_ref_count) == 1)
enable |= DBG_MDSCR_KDE;

if (enable && debug_enabled) {
@@ -118,11 +118,11 @@ void disable_debug_monitors(enum debug_e

WARN_ON(preemptible());

- if (local_dec_and_test(&__get_cpu_var(mde_ref_count)))
+ if (local_dec_and_test(this_cpu_ptr(&mde_ref_count)))
disable = ~DBG_MDSCR_MDE;

if (el == DBG_ACTIVE_EL1 &&
- local_dec_and_test(&__get_cpu_var(kde_ref_count)))
+ local_dec_and_test(this_cpu_ptr(&kde_ref_count)))
disable &= ~DBG_MDSCR_KDE;

if (disable) {
Index: linux/arch/arm64/kernel/hw_breakpoint.c
===================================================================
--- linux.orig/arch/arm64/kernel/hw_breakpoint.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm64/kernel/hw_breakpoint.c 2013-08-26 13:48:40.952795024 -0500
@@ -184,14 +184,14 @@ int arch_install_hw_breakpoint(struct pe
/* Breakpoint */
ctrl_reg = AARCH64_DBG_REG_BCR;
val_reg = AARCH64_DBG_REG_BVR;
- slots = __get_cpu_var(bp_on_reg);
+ slots = __this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
reg_enable = !debug_info->bps_disabled;
} else {
/* Watchpoint */
ctrl_reg = AARCH64_DBG_REG_WCR;
val_reg = AARCH64_DBG_REG_WVR;
- slots = __get_cpu_var(wp_on_reg);
+ slots = __this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
reg_enable = !debug_info->wps_disabled;
}
@@ -230,12 +230,12 @@ void arch_uninstall_hw_breakpoint(struct
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
/* Breakpoint */
base = AARCH64_DBG_REG_BCR;
- slots = __get_cpu_var(bp_on_reg);
+ slots = __this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
base = AARCH64_DBG_REG_WCR;
- slots = __get_cpu_var(wp_on_reg);
+ slots = __this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -505,11 +505,11 @@ static void toggle_bp_registers(int reg,

switch (reg) {
case AARCH64_DBG_REG_BCR:
- slots = __get_cpu_var(bp_on_reg);
+ slots = __this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
break;
case AARCH64_DBG_REG_WCR:
- slots = __get_cpu_var(wp_on_reg);
+ slots = __this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
break;
default:
@@ -546,7 +546,7 @@ static int breakpoint_handler(unsigned l
struct debug_info *debug_info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
addr = instruction_pointer(regs);
debug_info = &current->thread.debug;

@@ -596,7 +596,7 @@ unlock:
user_enable_single_step(current);
} else {
toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0);
- kernel_step = &__get_cpu_var(stepping_kernel_bp);
+ kernel_step = this_cpu_ptr(&stepping_kernel_bp);

if (*kernel_step != ARM_KERNEL_STEP_NONE)
return 0;
@@ -623,7 +623,7 @@ static int watchpoint_handler(unsigned l
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
debug_info = &current->thread.debug;

for (i = 0; i < core_num_wrps; ++i) {
@@ -698,7 +698,7 @@ unlock:
user_enable_single_step(current);
} else {
toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0);
- kernel_step = &__get_cpu_var(stepping_kernel_bp);
+ kernel_step = this_cpu_ptr(&stepping_kernel_bp);

if (*kernel_step != ARM_KERNEL_STEP_NONE)
return 0;
@@ -722,7 +722,7 @@ int reinstall_suspended_bps(struct pt_re
struct debug_info *debug_info = &current->thread.debug;
int handled_exception = 0, *kernel_step;

- kernel_step = &__get_cpu_var(stepping_kernel_bp);
+ kernel_step = this_cpu_ptr(&stepping_kernel_bp);

/*
* Called from single-step exception handler.
Index: linux/arch/arm64/kernel/perf_event.c
===================================================================
--- linux.orig/arch/arm64/kernel/perf_event.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm64/kernel/perf_event.c 2013-08-26 13:48:40.952795024 -0500
@@ -1041,7 +1041,7 @@ static irqreturn_t armv8pmu_handle_irq(i
*/
regs = get_irq_regs();

- cpuc = &__get_cpu_var(cpu_hw_events);
+ cpuc = this_cpu_ptr(&cpu_hw_events);
for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
struct perf_event *event = cpuc->events[idx];
struct hw_perf_event *hwc;
@@ -1254,7 +1254,7 @@ device_initcall(register_pmu_driver);

static struct pmu_hw_events *armpmu_get_cpu_events(void)
{
- return &__get_cpu_var(cpu_hw_events);
+ return this_cpu_ptr(&cpu_hw_events);
}

static void __init cpu_pmu_init(struct arm_pmu *armpmu)


2013-08-28 19:54:44

by Russell King - ARM Linux

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, Aug 28, 2013 at 07:48:23PM +0000, Christoph Lameter wrote:
> 3. Retrieve the content of the current processors instance of a per cpu variable.
>
> DEFINE_PER_CPU(int, u);

Shouldn't this be 'y' ?

> int x = __get_cpu_var(y)
>
> Converts to
>
> int x = __this_cpu_read(y);
>
>
> 4. Retrieve the content of a percpu struct
>
> DEFINE_PER_CPU(struct mystruct, y);
> struct mystruct x = __get_cpu_var(y);
>
> Converts to
>
> memcpy(this_cpu_ptr(&x), y, sizeof(x));

Are you sure this one's correct? Isn't 'y' the per-cpu variable?
Even though I don't see any in this patch, it's probably a good thing to
get the patch description correct.

I think you need Will Deacon's ack for this, but I think he's away for
a while.

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, 28 Aug 2013, Russell King - ARM Linux wrote:

> On Wed, Aug 28, 2013 at 07:48:23PM +0000, Christoph Lameter wrote:
> > 3. Retrieve the content of the current processors instance of a per cpu variable.
> >
> > DEFINE_PER_CPU(int, u);
>
> Shouldn't this be 'y' ?

Right.

> > memcpy(this_cpu_ptr(&x), y, sizeof(x));
>
> Are you sure this one's correct? Isn't 'y' the per-cpu variable?

Also true. Already fixed this once. Sigh.

Description patch:


--- this_x86 2013-08-28 15:35:48.933416126 -0500
+++ patches/this_x86 2013-08-28 15:41:14.386260894 -0500
@@ -26,12 +26,12 @@


This patch converts __get_cpu_var into either an explicit address calculation using this_cpu_ptr()
-or into a use of this_cpu operations that use the offset. Thereby address calcualtions are avoided
+or into a use of this_cpu operations that use the offset. Thereby address calculations are avoided
and less registers are used when code is generated.

-At the end of the patchset all uses of __get_cpu_var have been removed so the macro is removed too.
+At the end of the patch set all uses of __get_cpu_var have been removed so the macro is removed too.

-The patchset includes passes over all arches as well. Once these operations are used throughout then
+The patch set includes passes over all arches as well. Once these operations are used throughout then
specialized macros can be defined in non -x86 arches as well in order to optimize per cpu access by
f.e. using a global register that may be set to the per cpu base.

@@ -63,7 +63,7 @@

3. Retrieve the content of the current processors instance of a per cpu variable.

- DEFINE_PER_CPU(int, u);
+ DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)

Converts to
@@ -78,7 +78,7 @@

Converts to

- memcpy(this_cpu_ptr(&x), y, sizeof(x));
+ memcpy(&x, this_cpu_ptr(&y), sizeof(x));


5. Assignment to a per cpu variable

2013-08-30 10:01:29

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

Hi Christoph,

Sorry for the delay in looking at this, I've been on holiday for a week.
Comments inline.

On Wed, Aug 28, 2013 at 08:48:23PM +0100, Christoph Lameter wrote:
> Transformations done to __get_cpu_var()
>
>
> 1. Determine the address of the percpu instance of the current processor.
>
> DEFINE_PER_CPU(int, y);
> int *x = &__get_cpu_var(y);
>
> Converts to
>
> int *x = this_cpu_ptr(&y);
>
>
> 2. Same as #1 but this time an array structure is involved.
>
> DEFINE_PER_CPU(int, y[20]);
> int *x = __get_cpu_var(y);
>
> Converts to
>
> int *x = this_cpu_ptr(y);

This is the flavour we have for ARM's hw_breakpoint code, where we have an
array of perf_event * instead of int...

> Index: linux/arch/arm/kernel/hw_breakpoint.c
> ===================================================================
> --- linux.orig/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.956794980 -0500
> +++ linux/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.952795024 -0500
> @@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct pe
> /* Breakpoint */
> ctrl_base = ARM_BASE_BCR;
> val_base = ARM_BASE_BVR;
> - slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
> + slots = (struct perf_event **)__this_cpu_read(bp_on_reg);

...so I don't think this is quite right, and indeed, we get a bunch of errors
from GCC:

arch/arm/kernel/hw_breakpoint.c: In function ‘arch_install_hw_breakpoint’:
arch/arm/kernel/hw_breakpoint.c:347:33: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’

changing to match your recipe still doesn't work, however:

arch/arm/kernel/hw_breakpoint.c: In function ‘arch_install_hw_breakpoint’:
arch/arm/kernel/hw_breakpoint.c:347:33: error: cast specifies array type

> Index: linux/arch/arm64/kernel/debug-monitors.c
> ===================================================================
> --- linux.orig/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.956794980 -0500
> +++ linux/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.952795024 -0500
> @@ -98,11 +98,11 @@ void enable_debug_monitors(enum debug_el
>
> WARN_ON(preemptible());
>
> - if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
> + if (this_cpu_inc_return(mde_ref_count) == 1)
> enable = DBG_MDSCR_MDE;

I'm not sure that this is safe. We rely on local_inc_return to be atomic
with respect to the current CPU, which will end up being a wrapper around
atomic64_inc_return. However, this_cpu_inc_return simply uses a lock, so
other people accessing the count in a different manner (local_dec_and_test
below) may break local atomicity unless we start disabling interrupts or
something horrible like that.

> if (el == DBG_ACTIVE_EL1 &&
> - local_inc_return(&__get_cpu_var(kde_ref_count)) == 1)
> + this_cpu_inc_return(kde_ref_count) == 1)
> enable |= DBG_MDSCR_KDE;
>
> if (enable && debug_enabled) {
> @@ -118,11 +118,11 @@ void disable_debug_monitors(enum debug_e
>
> WARN_ON(preemptible());
>
> - if (local_dec_and_test(&__get_cpu_var(mde_ref_count)))
> + if (local_dec_and_test(this_cpu_ptr(&mde_ref_count)))
> disable = ~DBG_MDSCR_MDE;
>
> if (el == DBG_ACTIVE_EL1 &&
> - local_dec_and_test(&__get_cpu_var(kde_ref_count)))
> + local_dec_and_test(this_cpu_ptr(&kde_ref_count)))
> disable &= ~DBG_MDSCR_KDE;
>
> if (disable) {

Will

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Fri, 30 Aug 2013, Will Deacon wrote:

> This is the flavour we have for ARM's hw_breakpoint code, where we have an
> array of perf_event * instead of int...
>
> > Index: linux/arch/arm/kernel/hw_breakpoint.c
> > ===================================================================
> > --- linux.orig/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.956794980 -0500
> > +++ linux/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.952795024 -0500
> > @@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct pe
> > /* Breakpoint */
> > ctrl_base = ARM_BASE_BCR;
> > val_base = ARM_BASE_BVR;
> > - slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
> > + slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
>
> ...so I don't think this is quite right, and indeed, we get a bunch of errors
> from GCC:
>
> arch/arm/kernel/hw_breakpoint.c: In function ‘arch_install_hw_breakpoint’:
> arch/arm/kernel/hw_breakpoint.c:347:33: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
> arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
> arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
> arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’

Did you apply the first patch of this series which is a bug fix?

> changing to match your recipe still doesn't work, however:
>
> arch/arm/kernel/hw_breakpoint.c: In function ‘arch_install_hw_breakpoint’:
> arch/arm/kernel/hw_breakpoint.c:347:33: error: cast specifies array type

Yep that is the macro bug that was fixed in the first patch.

> >
> > WARN_ON(preemptible());
> >
> > - if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
> > + if (this_cpu_inc_return(mde_ref_count) == 1)
> > enable = DBG_MDSCR_MDE;
>
> I'm not sure that this is safe. We rely on local_inc_return to be atomic
> with respect to the current CPU, which will end up being a wrapper around
> atomic64_inc_return. However, this_cpu_inc_return simply uses a lock, so
> other people accessing the count in a different manner (local_dec_and_test
> below) may break local atomicity unless we start disabling interrupts or
> something horrible like that.

I do not see any special code for ARM for this_cpu_inc_return. The
fallback solution in the core code is to disable interrupts for the
inc_return and arch/arm/include/asm/percpu.h includes
asm-generic/percpu.h.

Where did you see it using a lock?

2013-09-04 09:34:12

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

Hi Christoph,

On Tue, Sep 03, 2013 at 03:39:57PM +0100, Christoph Lameter wrote:
> On Fri, 30 Aug 2013, Will Deacon wrote:
> > ...so I don't think this is quite right, and indeed, we get a bunch of errors
> > from GCC:
> >
> > arch/arm/kernel/hw_breakpoint.c: In function ‘arch_install_hw_breakpoint’:
> > arch/arm/kernel/hw_breakpoint.c:347:33: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
> > arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
> > arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
> > arch/arm/kernel/hw_breakpoint.c:347:1: error: incompatible types when assigning to type ‘struct perf_event *[16]’ from type ‘struct perf_event **’
>
> Did you apply the first patch of this series which is a bug fix?

No, sorry, I didn't see that. Do you have a branch anywhere that I can play
with?

> > changing to match your recipe still doesn't work, however:
> >
> > arch/arm/kernel/hw_breakpoint.c: In function ‘arch_install_hw_breakpoint’:
> > arch/arm/kernel/hw_breakpoint.c:347:33: error: cast specifies array type
>
> Yep that is the macro bug that was fixed in the first patch.

Ok. Sorry for the noise.

> > >
> > > WARN_ON(preemptible());
> > >
> > > - if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
> > > + if (this_cpu_inc_return(mde_ref_count) == 1)
> > > enable = DBG_MDSCR_MDE;
> >
> > I'm not sure that this is safe. We rely on local_inc_return to be atomic
> > with respect to the current CPU, which will end up being a wrapper around
> > atomic64_inc_return. However, this_cpu_inc_return simply uses a lock, so
> > other people accessing the count in a different manner (local_dec_and_test
> > below) may break local atomicity unless we start disabling interrupts or
> > something horrible like that.
>
> I do not see any special code for ARM for this_cpu_inc_return. The
> fallback solution in the core code is to disable interrupts for the
> inc_return and arch/arm/include/asm/percpu.h includes
> asm-generic/percpu.h.
>
> Where did you see it using a lock?

God knows! You're completely right, and we simply disable interrupts which I
somehow misread as taking a lock. However, is it guaranteed that mixing
an atomic64_* access with a this_cpu_inc_return will retain atomicity
between the two? E.g. if you get interrupted during an atomic64_xchg
operation, the interrupt handler issues this_cpu_inc_return, then on return
to the xchg operation it must reissue any reads that had been executed
prior to the interrupt. This should work on ARM/ARM64 (returning from the
interrupt will clear the exclusive monitor) but I don't know about other
architectures.

Will

2013-09-04 14:24:12

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, Sep 04, 2013 at 03:17:09PM +0100, Christoph Lameter wrote:
> On Wed, 4 Sep 2013, Will Deacon wrote:
> > God knows! You're completely right, and we simply disable interrupts which I
> > somehow misread as taking a lock. However, is it guaranteed that mixing
> > an atomic64_* access with a this_cpu_inc_return will retain atomicity
> > between the two? E.g. if you get interrupted during an atomic64_xchg
> > operation, the interrupt handler issues this_cpu_inc_return, then on return
> > to the xchg operation it must reissue any reads that had been executed
> > prior to the interrupt. This should work on ARM/ARM64 (returning from the
> > interrupt will clear the exclusive monitor) but I don't know about other
> > architectures.
>
> You cannot get interrupted during an atomic64_xchg operation. atomic and
> this_cpu operations are strictly serialized since both should be behaving
> like single instructions. __this_cpu ops relax that requirement in case
> the arch code incurs significant overhead to make that happen. In cases
> where we know that preemption/interrupt disable etc takes care of things
> __this_cpu ops come into play.

Hmm, why can't you get interrupted during atomic64_xchg? On ARM, we have the
following sequence:

static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
{
u64 result;
unsigned long tmp;

smp_mb();

__asm__ __volatile__("@ atomic64_xchg\n"
"1: ldrexd %0, %H0, [%3]\n"
" strexd %1, %4, %H4, [%3]\n"
" teq %1, #0\n"
" bne 1b"
: "=&r" (result), "=&r" (tmp), "+Qo" (ptr->counter)
: "r" (&ptr->counter), "r" (new)
: "cc");

smp_mb();

return result;
}

which relies on interrupts clearing the exclusive monitor to force us back
around the loop in the inline asm. I could imagine other architectures doing
similar, but only detecting the other writer if it used the same
instructions.

Will

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, 4 Sep 2013, Will Deacon wrote:

> > Did you apply the first patch of this series which is a bug fix?
>
> No, sorry, I didn't see that. Do you have a branch anywhere that I can play
> with?

It was merged in Linus tree yesterday.

> > I do not see any special code for ARM for this_cpu_inc_return. The
> > fallback solution in the core code is to disable interrupts for the
> > inc_return and arch/arm/include/asm/percpu.h includes
> > asm-generic/percpu.h.
> >
> > Where did you see it using a lock?
>
> God knows! You're completely right, and we simply disable interrupts which I
> somehow misread as taking a lock. However, is it guaranteed that mixing
> an atomic64_* access with a this_cpu_inc_return will retain atomicity
> between the two? E.g. if you get interrupted during an atomic64_xchg
> operation, the interrupt handler issues this_cpu_inc_return, then on return
> to the xchg operation it must reissue any reads that had been executed
> prior to the interrupt. This should work on ARM/ARM64 (returning from the
> interrupt will clear the exclusive monitor) but I don't know about other
> architectures.

You cannot get interrupted during an atomic64_xchg operation. atomic and
this_cpu operations are strictly serialized since both should be behaving
like single instructions. __this_cpu ops relax that requirement in case
the arch code incurs significant overhead to make that happen. In cases
where we know that preemption/interrupt disable etc takes care of things
__this_cpu ops come into play.

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, 4 Sep 2013, Will Deacon wrote:

> Hmm, why can't you get interrupted during atomic64_xchg? On ARM, we have the
> following sequence:

AFAICT atomic means one uninterruptible action.

> static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
> {
> u64 result;
> unsigned long tmp;
>
> smp_mb();
>
> __asm__ __volatile__("@ atomic64_xchg\n"
> "1: ldrexd %0, %H0, [%3]\n"
> " strexd %1, %4, %H4, [%3]\n"
> " teq %1, #0\n"
> " bne 1b"
> : "=&r" (result), "=&r" (tmp), "+Qo" (ptr->counter)
> : "r" (&ptr->counter), "r" (new)
> : "cc");
>
> smp_mb();
>
> return result;
> }
>
> which relies on interrupts clearing the exclusive monitor to force us back
> around the loop in the inline asm. I could imagine other architectures doing
> similar, but only detecting the other writer if it used the same
> instructions.

Well I have never done ARM asm but this looks vaguely like a cmpxchg loop?
That would either perform an atomic change or fail and retry?
If so it still fits the definition of atomic. The change or fail operation
is atomic.

2013-09-04 17:47:09

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, Sep 04, 2013 at 03:54:04PM +0100, Christoph Lameter wrote:
> On Wed, 4 Sep 2013, Will Deacon wrote:
>
> > Hmm, why can't you get interrupted during atomic64_xchg? On ARM, we have the
> > following sequence:
>
> AFAICT atomic means one uninterruptible action.

I think it's more subtle than that, but this is all moot for ARM.

> > static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
> > {
> > u64 result;
> > unsigned long tmp;
> >
> > smp_mb();
> >
> > __asm__ __volatile__("@ atomic64_xchg\n"
> > "1: ldrexd %0, %H0, [%3]\n"
> > " strexd %1, %4, %H4, [%3]\n"
> > " teq %1, #0\n"
> > " bne 1b"
> > : "=&r" (result), "=&r" (tmp), "+Qo" (ptr->counter)
> > : "r" (&ptr->counter), "r" (new)
> > : "cc");
> >
> > smp_mb();
> >
> > return result;
> > }
> >
> > which relies on interrupts clearing the exclusive monitor to force us back
> > around the loop in the inline asm. I could imagine other architectures doing
> > similar, but only detecting the other writer if it used the same
> > instructions.
>
> Well I have never done ARM asm but this looks vaguely like a cmpxchg loop?
> That would either perform an atomic change or fail and retry?

Correct! The strexd instruction can fail if another access clears the
exclusive monitor.

> If so it still fits the definition of atomic. The change or fail operation
> is atomic.

On ARM, yes. I'm worried that there may be an architecture where the change-
or-fail operation would only fail if the access from the interrupt handler
*also* used that change-or-fail instruction, which isn't the case with
this_cpu_inc.

I have no idea if such an architecture exists :)

Will

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, 4 Sep 2013, Will Deacon wrote:

> On ARM, yes. I'm worried that there may be an architecture where the change-
> or-fail operation would only fail if the access from the interrupt handler
> *also* used that change-or-fail instruction, which isn't the case with
> this_cpu_inc.
>
> I have no idea if such an architecture exists :)

Atomic operations use atomic_t. this_cpu operations can only use regular
scalars. So the set of variables that are updated by each should be
distinct.


2013-09-04 18:21:46

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, Sep 04, 2013 at 07:09:04PM +0100, Christoph Lameter wrote:
> On Wed, 4 Sep 2013, Will Deacon wrote:
>
> > On ARM, yes. I'm worried that there may be an architecture where the change-
> > or-fail operation would only fail if the access from the interrupt handler
> > *also* used that change-or-fail instruction, which isn't the case with
> > this_cpu_inc.
> >
> > I have no idea if such an architecture exists :)
>
> Atomic operations use atomic_t. this_cpu operations can only use regular
> scalars. So the set of variables that are updated by each should be
> distinct.

Right, except that your patch contained the following hunk:

Index: linux/arch/arm64/kernel/debug-monitors.c
===================================================================
--- linux.orig/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.952795024 -0500
@@ -98,11 +98,11 @@ void enable_debug_monitors(enum debug_el

WARN_ON(preemptible());

- if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
+ if (this_cpu_inc_return(mde_ref_count) == 1)
enable = DBG_MDSCR_MDE;

if (el == DBG_ACTIVE_EL1 &&
- local_inc_return(&__get_cpu_var(kde_ref_count)) == 1)
+ this_cpu_inc_return(kde_ref_count) == 1)
enable |= DBG_MDSCR_KDE;

if (enable && debug_enabled) {


Then we have:

#define local_inc_return(l) atomic_long_inc_return(&(l)->a)

static inline long atomic_long_inc_return(atomic_long_t *l)
{
atomic_t *v = (atomic_t *)l;

return (long)atomic_inc_return(v);
}


So that casting lets the two interfaces overlap (and indeed they do after
your patch, since local_dec_and_test is still used on the same variable).

Will

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Wed, 4 Sep 2013, Will Deacon wrote:

> > Atomic operations use atomic_t. this_cpu operations can only use regular
> > scalars. So the set of variables that are updated by each should be
> > distinct.
>
> Right, except that your patch contained the following hunk:
>
> Index: linux/arch/arm64/kernel/debug-monitors.c
> ===================================================================
> --- linux.orig/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.956794980 -0500
> +++ linux/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.952795024 -0500
> @@ -98,11 +98,11 @@ void enable_debug_monitors(enum debug_el
>
> WARN_ON(preemptible());
>
> - if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
> + if (this_cpu_inc_return(mde_ref_count) == 1)
> enable = DBG_MDSCR_MDE;
>
> if (el == DBG_ACTIVE_EL1 &&
> - local_inc_return(&__get_cpu_var(kde_ref_count)) == 1)
> + this_cpu_inc_return(kde_ref_count) == 1)
> enable |= DBG_MDSCR_KDE;
>
> if (enable && debug_enabled) {
>
>
> Then we have:
>
> #define local_inc_return(l) atomic_long_inc_return(&(l)->a)
>
> static inline long atomic_long_inc_return(atomic_long_t *l)
> {
> atomic_t *v = (atomic_t *)l;
>
> return (long)atomic_inc_return(v);
> }
>
>
> So that casting lets the two interfaces overlap (and indeed they do after
> your patch, since local_dec_and_test is still used on the same variable).


Ok that is indeed wrong. You would have to switch out the whole treatment
of the variable to consistently use this_cpu ops.

Let's convert the &__get_cpu_var() uses to the this_cpu_ptr(&xxx)
form. This is almost a this_cpu_xx op but not quite ;-). Looks strange.


Fixed up patch:

Subject: arm: Replace __get_cpu_var uses
Cc: Russell King <[email protected]>
Cc: Catalin Marinas <[email protected]>
CC: Will Deacon <[email protected]>

__get_cpu_var() is used for multiple purposes in the kernel source. One of them is
address calculation via the form &__get_cpu_var(x). This calculates the address for
the instance of the percpu variable of the current processor based on an offset.

Other use cases are for storing and retrieving data from the current processor's percpu area.
__get_cpu_var() can be used as an lvalue when writing data or on the right side of an assignment.

__get_cpu_var() is defined as :


#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))



__get_cpu_var() only ever performs an address calculation. However, store and retrieve operations
could use a segment prefix (or global register on other platforms) to avoid the address calculation.

this_cpu_write() and this_cpu_read() can directly take an offset into a percpu area and use
optimized assembly code to read and write per cpu variables.


This patch converts __get_cpu_var into either an explicit address calculation using this_cpu_ptr()
or into a use of this_cpu operations that use the offset. Thereby address calculations are avoided
and fewer registers are used when code is generated.

At the end of the patch set all uses of __get_cpu_var have been removed so the macro is removed too.

The patch set includes passes over all arches as well. Once these operations are used throughout,
specialized macros can be defined in non-x86 arches as well in order to optimize per cpu access,
e.g. by using a global register that may be set to the per cpu base.




Transformations done to __get_cpu_var()


1. Determine the address of the percpu instance of the current processor.

DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);

Converts to

int *x = this_cpu_ptr(&y);


2. Same as #1 but this time an array structure is involved.

DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);

Converts to

int *x = this_cpu_ptr(y);


3. Retrieve the content of the current processor's instance of a per cpu variable.

DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y);

Converts to

int x = __this_cpu_read(y);


4. Retrieve the content of a percpu struct

DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);

Converts to

memcpy(&x, this_cpu_ptr(&y), sizeof(x));


5. Assignment to a per cpu variable

DEFINE_PER_CPU(int, y);
__get_cpu_var(y) = x;

Converts to

this_cpu_write(y, x);


6. Increment/Decrement etc of a per cpu variable

DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++;

Converts to

this_cpu_inc(y);



Acked-by: Catalin Marinas <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

Index: linux/arch/arm/kernel/hw_breakpoint.c
===================================================================
--- linux.orig/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kernel/hw_breakpoint.c 2013-08-26 13:48:40.952795024 -0500
@@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct pe
/* Breakpoint */
ctrl_base = ARM_BASE_BCR;
val_base = ARM_BASE_BVR;
- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
ctrl_base = ARM_BASE_WCR;
val_base = ARM_BASE_WVR;
- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
/* Breakpoint */
base = ARM_BASE_BCR;
- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
base = ARM_BASE_WCR;
- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);

for (i = 0; i < core_num_wrps; ++i) {
rcu_read_lock();
@@ -768,7 +768,7 @@ static void watchpoint_single_step_handl
struct perf_event *wp, **slots;
struct arch_hw_breakpoint *info;

- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);

for (i = 0; i < core_num_wrps; ++i) {
rcu_read_lock();
@@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);

/* The exception entry code places the amended lr in the PC. */
addr = regs->ARM_pc;
Index: linux/arch/arm/kernel/kprobes.c
===================================================================
--- linux.orig/arch/arm/kernel/kprobes.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kernel/kprobes.c 2013-08-26 13:48:40.952795024 -0500
@@ -171,13 +171,13 @@ static void __kprobes save_previous_kpro

static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
}

static void __kprobes set_current_kprobe(struct kprobe *p)
{
- __get_cpu_var(current_kprobe) = p;
+ __this_cpu_write(current_kprobe, p);
}

static void __kprobes
@@ -421,10 +421,10 @@ static __used __kprobes void *trampoline
continue;

if (ri->rp && ri->rp->handler) {
- __get_cpu_var(current_kprobe) = &ri->rp->kp;
+ __this_cpu_write(current_kprobe, &ri->rp->kp);
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
ri->rp->handler(ri, regs);
- __get_cpu_var(current_kprobe) = NULL;
+ __this_cpu_write(current_kprobe, NULL);
}

orig_ret_address = (unsigned long)ri->ret_addr;
Index: linux/arch/arm/kernel/perf_event_cpu.c
===================================================================
--- linux.orig/arch/arm/kernel/perf_event_cpu.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kernel/perf_event_cpu.c 2013-08-26 13:48:40.952795024 -0500
@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(perf_num_counters);

static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
{
- return &__get_cpu_var(cpu_hw_events);
+ return this_cpu_ptr(&cpu_hw_events);
}

static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
Index: linux/arch/arm/kvm/arm.c
===================================================================
--- linux.orig/arch/arm/kvm/arm.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm/kvm/arm.c 2013-08-26 13:48:40.952795024 -0500
@@ -65,7 +65,7 @@ static bool vgic_present;
static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
{
BUG_ON(preemptible());
- __get_cpu_var(kvm_arm_running_vcpu) = vcpu;
+ __this_cpu_write(kvm_arm_running_vcpu, vcpu);
}

/**
@@ -75,7 +75,7 @@ static void kvm_arm_set_running_vcpu(str
struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
{
BUG_ON(preemptible());
- return __get_cpu_var(kvm_arm_running_vcpu);
+ return __this_cpu_read(kvm_arm_running_vcpu);
}

/**
@@ -811,7 +811,7 @@ static void cpu_init_hyp_mode(void *dumm

boot_pgd_ptr = kvm_mmu_get_boot_httbr();
pgd_ptr = kvm_mmu_get_httbr();
- stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
+ stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
hyp_stack_ptr = stack_page + PAGE_SIZE;
vector_ptr = (unsigned long)__kvm_hyp_vector;

Index: linux/arch/arm64/kernel/debug-monitors.c
===================================================================
--- linux.orig/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm64/kernel/debug-monitors.c 2013-08-26 13:48:40.952795024 -0500
@@ -98,11 +98,11 @@ void enable_debug_monitors(enum debug_el

WARN_ON(preemptible());

- if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
+ if (local_inc_return(this_cpu_ptr(&mde_ref_count)) == 1)
enable = DBG_MDSCR_MDE;

if (el == DBG_ACTIVE_EL1 &&
- local_inc_return(&__get_cpu_var(kde_ref_count)) == 1)
+ local_inc_return(this_cpu_ptr(&kde_ref_count)) == 1)
enable |= DBG_MDSCR_KDE;

if (enable && debug_enabled) {
@@ -118,11 +118,11 @@ void disable_debug_monitors(enum debug_e

WARN_ON(preemptible());

- if (local_dec_and_test(&__get_cpu_var(mde_ref_count)))
+ if (local_dec_and_test(this_cpu_ptr(&mde_ref_count)))
disable = ~DBG_MDSCR_MDE;

if (el == DBG_ACTIVE_EL1 &&
- local_dec_and_test(&__get_cpu_var(kde_ref_count)))
+ local_dec_and_test(this_cpu_ptr(&kde_ref_count)))
disable &= ~DBG_MDSCR_KDE;

if (disable) {
Index: linux/arch/arm64/kernel/hw_breakpoint.c
===================================================================
--- linux.orig/arch/arm64/kernel/hw_breakpoint.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm64/kernel/hw_breakpoint.c 2013-08-26 13:48:40.952795024 -0500
@@ -184,14 +184,14 @@ int arch_install_hw_breakpoint(struct pe
/* Breakpoint */
ctrl_reg = AARCH64_DBG_REG_BCR;
val_reg = AARCH64_DBG_REG_BVR;
- slots = __get_cpu_var(bp_on_reg);
+ slots = __this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
reg_enable = !debug_info->bps_disabled;
} else {
/* Watchpoint */
ctrl_reg = AARCH64_DBG_REG_WCR;
val_reg = AARCH64_DBG_REG_WVR;
- slots = __get_cpu_var(wp_on_reg);
+ slots = __this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
reg_enable = !debug_info->wps_disabled;
}
@@ -230,12 +230,12 @@ void arch_uninstall_hw_breakpoint(struct
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
/* Breakpoint */
base = AARCH64_DBG_REG_BCR;
- slots = __get_cpu_var(bp_on_reg);
+ slots = __this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
base = AARCH64_DBG_REG_WCR;
- slots = __get_cpu_var(wp_on_reg);
+ slots = __this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -505,11 +505,11 @@ static void toggle_bp_registers(int reg,

switch (reg) {
case AARCH64_DBG_REG_BCR:
- slots = __get_cpu_var(bp_on_reg);
+ slots = __this_cpu_read(bp_on_reg);
max_slots = core_num_brps;
break;
case AARCH64_DBG_REG_WCR:
- slots = __get_cpu_var(wp_on_reg);
+ slots = __this_cpu_read(wp_on_reg);
max_slots = core_num_wrps;
break;
default:
@@ -546,7 +546,7 @@ static int breakpoint_handler(unsigned l
struct debug_info *debug_info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
addr = instruction_pointer(regs);
debug_info = &current->thread.debug;

@@ -596,7 +596,7 @@ unlock:
user_enable_single_step(current);
} else {
toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0);
- kernel_step = &__get_cpu_var(stepping_kernel_bp);
+ kernel_step = this_cpu_ptr(&stepping_kernel_bp);

if (*kernel_step != ARM_KERNEL_STEP_NONE)
return 0;
@@ -623,7 +623,7 @@ static int watchpoint_handler(unsigned l
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+ slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
debug_info = &current->thread.debug;

for (i = 0; i < core_num_wrps; ++i) {
@@ -698,7 +698,7 @@ unlock:
user_enable_single_step(current);
} else {
toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0);
- kernel_step = &__get_cpu_var(stepping_kernel_bp);
+ kernel_step = this_cpu_ptr(&stepping_kernel_bp);

if (*kernel_step != ARM_KERNEL_STEP_NONE)
return 0;
@@ -722,7 +722,7 @@ int reinstall_suspended_bps(struct pt_re
struct debug_info *debug_info = &current->thread.debug;
int handled_exception = 0, *kernel_step;

- kernel_step = &__get_cpu_var(stepping_kernel_bp);
+ kernel_step = this_cpu_ptr(&stepping_kernel_bp);

/*
* Called from single-step exception handler.
Index: linux/arch/arm64/kernel/perf_event.c
===================================================================
--- linux.orig/arch/arm64/kernel/perf_event.c 2013-08-26 13:48:40.956794980 -0500
+++ linux/arch/arm64/kernel/perf_event.c 2013-08-26 13:48:40.952795024 -0500
@@ -1041,7 +1041,7 @@ static irqreturn_t armv8pmu_handle_irq(i
*/
regs = get_irq_regs();

- cpuc = &__get_cpu_var(cpu_hw_events);
+ cpuc = this_cpu_ptr(&cpu_hw_events);
for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
struct perf_event *event = cpuc->events[idx];
struct hw_perf_event *hwc;
@@ -1254,7 +1254,7 @@ device_initcall(register_pmu_driver);

static struct pmu_hw_events *armpmu_get_cpu_events(void)
{
- return &__get_cpu_var(cpu_hw_events);
+ return this_cpu_ptr(&cpu_hw_events);
}

static void __init cpu_pmu_init(struct arm_pmu *armpmu)

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

Here is a patch to be applied after the earlier one to convert the local_t
use to this_cpu. Not sure if I got the local_dec_and_test conversion
right.


Index: linux/arch/arm64/kernel/debug-monitors.c
===================================================================
--- linux.orig/arch/arm64/kernel/debug-monitors.c 2013-09-04 15:53:53.374943378 -0500
+++ linux/arch/arm64/kernel/debug-monitors.c 2013-09-04 15:57:19.564792739 -0500
@@ -27,7 +27,6 @@
#include <linux/uaccess.h>

#include <asm/debug-monitors.h>
-#include <asm/local.h>
#include <asm/cputype.h>
#include <asm/system_misc.h>

@@ -89,8 +88,8 @@ early_param("nodebugmon", early_debug_di
* Keep track of debug users on each core.
* The ref counts are per-cpu so we use a local_t type.
*/
-static DEFINE_PER_CPU(local_t, mde_ref_count);
-static DEFINE_PER_CPU(local_t, kde_ref_count);
+static DEFINE_PER_CPU(int, mde_ref_count);
+static DEFINE_PER_CPU(int, kde_ref_count);

void enable_debug_monitors(enum debug_el el)
{
@@ -98,11 +97,11 @@ void enable_debug_monitors(enum debug_el

WARN_ON(preemptible());

- if (local_inc_return(this_cpu_ptr(&mde_ref_count)) == 1)
+ if (this_cpu_inc_return(mde_ref_count) == 1)
enable = DBG_MDSCR_MDE;

if (el == DBG_ACTIVE_EL1 &&
- local_inc_return(this_cpu_ptr(&kde_ref_count)) == 1)
+ this_cpu_inc_return(kde_ref_count) == 1)
enable |= DBG_MDSCR_KDE;

if (enable && debug_enabled) {
@@ -118,11 +117,11 @@ void disable_debug_monitors(enum debug_e

WARN_ON(preemptible());

- if (local_dec_and_test(this_cpu_ptr(&mde_ref_count)))
+ if (this_cpu_dec_return(mde_ref_count))
disable = ~DBG_MDSCR_MDE;

if (el == DBG_ACTIVE_EL1 &&
- local_dec_and_test(this_cpu_ptr(&kde_ref_count)))
+ this_cpu_dec_return(kde_ref_count))
disable &= ~DBG_MDSCR_KDE;

if (disable) {

2013-09-05 13:04:28

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

Hi again Christoph,

On Wed, Sep 04, 2013 at 09:58:31PM +0100, Christoph Lameter wrote:
> Here is a patch to be applied after the earlier one to convert the local_t
> use to this_cpu. Not sure if I got the local_dec_and_test conversion
> right.

[...]

> @@ -118,11 +117,11 @@ void disable_debug_monitors(enum debug_e
>
> WARN_ON(preemptible());
>
> - if (local_dec_and_test(this_cpu_ptr(&mde_ref_count)))
> + if (this_cpu_dec_return(mde_ref_count))
> disable = ~DBG_MDSCR_MDE;
>
> if (el == DBG_ACTIVE_EL1 &&
> - local_dec_and_test(this_cpu_ptr(&kde_ref_count)))
> + this_cpu_dec_return(kde_ref_count))
> disable &= ~DBG_MDSCR_KDE;

Almost! I think we just need an '== 0' check on the result from the
decrement, since local_dec_and_test is simply a way to check that we've
decremented to zero, so this patch would otherwise invert the meaning.

Will

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Thu, 5 Sep 2013, Will Deacon wrote:

> >
> > if (el == DBG_ACTIVE_EL1 &&
> > - local_dec_and_test(this_cpu_ptr(&kde_ref_count)))
> > + this_cpu_dec_return(kde_ref_count))
> > disable &= ~DBG_MDSCR_KDE;
>
> Almost! I think we just need an '== 0' check on the result from the
> decrement, since local_dec_and_test is simply a way to check that we've
> decremented to zero, so this patch would otherwise invert the meaning.

Ok can you take it from here and modify it? I have no build and test
environment set up for ARM.

2013-09-05 17:28:37

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Thu, Sep 05, 2013 at 03:24:25PM +0100, Christoph Lameter wrote:
> On Thu, 5 Sep 2013, Will Deacon wrote:
>
> > >
> > > if (el == DBG_ACTIVE_EL1 &&
> > > - local_dec_and_test(this_cpu_ptr(&kde_ref_count)))
> > > + this_cpu_dec_return(kde_ref_count))
> > > disable &= ~DBG_MDSCR_KDE;
> >
> > Almost! I think we just need an '== 0' check on the result from the
> > decrement, since local_dec_and_test is simply a way to check that we've
> > decremented to zero, so this patch would otherwise invert the meaning.
>
> Ok can you take it from here and modify it? I have no build and test
> environment set up for ARM.

Sure. Does that include the original arm/arm64 patches from your v3 series
as well as these two fixups?

Will

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Thu, 5 Sep 2013, Will Deacon wrote:

> > Ok can you take it from here and modify it? I have no build and test
> > environment set up for ARM.
>
> Sure. Does that include the original arm/arm64 patches from your v3 series
> as well as these two fixups?

I think so. Take whatever you can and I will make another pass after the
merge and pick up anything that was missed.

2013-09-06 11:05:20

by Will Deacon

[permalink] [raw]
Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Thu, Sep 05, 2013 at 06:52:19PM +0100, Christoph Lameter wrote:
> On Thu, 5 Sep 2013, Will Deacon wrote:
>
> > > Ok can you take it from here and modify it? I have no build and test
> > > environment set up for ARM.
> >
> > Sure. Does that include the original arm/arm64 patches from your v3 series
> > as well as these two fixups?
>
> I think so. Take whatever you can and I will make another pass after the
> merge and pick up anything that was missed.

Ok, I had a crack at putting something together on top of HEAD, but I still
see some errors due to pcpu array types (even with your patch in mainline).
I think some of your conversions to __this_cpu_read should use this_cpu_ptr
instead, but could you have a quick look at my fixup below please?

Will

--->8

diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index e7e6eca..3d44660 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
/* Breakpoint */
ctrl_base = ARM_BASE_BCR;
val_base = ARM_BASE_BVR;
- slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
+ slots = this_cpu_ptr(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
ctrl_base = ARM_BASE_WCR;
val_base = ARM_BASE_WVR;
- slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
/* Breakpoint */
base = ARM_BASE_BCR;
- slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
+ slots = this_cpu_ptr(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
base = ARM_BASE_WCR;
- slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);

for (i = 0; i < core_num_wrps; ++i) {
rcu_read_lock();
@@ -768,7 +768,7 @@ static void watchpoint_single_step_handler(unsigned long pc)
struct perf_event *wp, **slots;
struct arch_hw_breakpoint *info;

- slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);

for (i = 0; i < core_num_wrps; ++i) {
rcu_read_lock();
@@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
+ slots = this_cpu_ptr(bp_on_reg);

/* The exception entry code places the amended lr in the PC. */
addr = regs->ARM_pc;
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 53dc018..ff516f6 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -184,14 +184,14 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
/* Breakpoint */
ctrl_reg = AARCH64_DBG_REG_BCR;
val_reg = AARCH64_DBG_REG_BVR;
- slots = __this_cpu_read(bp_on_reg);
+ slots = this_cpu_ptr(bp_on_reg);
max_slots = core_num_brps;
reg_enable = !debug_info->bps_disabled;
} else {
/* Watchpoint */
ctrl_reg = AARCH64_DBG_REG_WCR;
val_reg = AARCH64_DBG_REG_WVR;
- slots = __this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);
max_slots = core_num_wrps;
reg_enable = !debug_info->wps_disabled;
}
@@ -230,12 +230,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
/* Breakpoint */
base = AARCH64_DBG_REG_BCR;
- slots = __this_cpu_read(bp_on_reg);
+ slots = this_cpu_ptr(bp_on_reg);
max_slots = core_num_brps;
} else {
/* Watchpoint */
base = AARCH64_DBG_REG_WCR;
- slots = __this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);
max_slots = core_num_wrps;
}

@@ -505,11 +505,11 @@ static void toggle_bp_registers(int reg, enum debug_el el, int enable)

switch (reg) {
case AARCH64_DBG_REG_BCR:
- slots = __this_cpu_read(bp_on_reg);
+ slots = this_cpu_ptr(bp_on_reg);
max_slots = core_num_brps;
break;
case AARCH64_DBG_REG_WCR:
- slots = __this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);
max_slots = core_num_wrps;
break;
default:
@@ -546,7 +546,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
struct debug_info *debug_info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__this_cpu_read(bp_on_reg);
+ slots = this_cpu_ptr(bp_on_reg);
addr = instruction_pointer(regs);
debug_info = &current->thread.debug;

@@ -623,7 +623,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;

- slots = (struct perf_event **)__this_cpu_read(wp_on_reg);
+ slots = this_cpu_ptr(wp_on_reg);
debug_info = &current->thread.debug;

for (i = 0; i < core_num_wrps; ++i) {

Subject: Re: [gcv v3 27/35] arm: Replace __get_cpu_var uses

On Fri, 6 Sep 2013, Will Deacon wrote:

> Ok, I had a crack at putting something together on top of HEAD, but I still
> see some errors due to pcpu array types (even with your patch in mainline).
> I think some of your conversions to __this_cpu_read should use this_cpu_ptr
> instead, but could you have a quick look at my fixup below please?

That could be the case since the cocci script does assume a read operation
is needed if there was no & in front of the __get_cpu_var. This is wrong
for pointers and arrays.

Patch looks fine to me.