2018-02-17 00:59:15

by Shanker Donthineni

[permalink] [raw]
Subject: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.DIC

Two point of unification cache maintenance operations 'DC CVAU' and
'IC IVAU' are optional for implementors as per ARMv8 specification.
This patch parses the updated CTR_EL0 register definition and adds
the required changes to skip POU operations if the hardware reports
CTR_EL0.IDC and/or CTR_EL0.DIC.

CTR_EL0.DIC: Instruction cache invalidation requirements for
instruction to data coherence. The meaning of this bit[29].
0: Instruction cache invalidation to the point of unification
is required for instruction to data coherence.
1: Instruction cache invalidation to the point of unification is
not required for instruction to data coherence.

CTR_EL0.IDC: Data cache clean requirements for instruction to data
coherence. The meaning of this bit[28].
0: Data cache clean to the point of unification is required for
instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
1: Data cache clean to the point of unification is not required
for instruction to data coherence.

Signed-off-by: Philip Elcan <[email protected]>
Signed-off-by: Shanker Donthineni <[email protected]>
---
arch/arm64/include/asm/assembler.h | 48 ++++++++++++++++++++++++--------------
arch/arm64/include/asm/cache.h | 2 ++
arch/arm64/kernel/cpufeature.c | 2 ++
arch/arm64/mm/cache.S | 26 ++++++++++++++-------
4 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 3c78835..9eaa948 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -30,6 +30,7 @@
#include <asm/pgtable-hwdef.h>
#include <asm/ptrace.h>
#include <asm/thread_info.h>
+#include <asm/cache.h>

.macro save_and_disable_daif, flags
mrs \flags, daif
@@ -334,9 +335,9 @@
* raw_dcache_line_size - get the minimum D-cache line size on this CPU
* from the CTR register.
*/
- .macro raw_dcache_line_size, reg, tmp
- mrs \tmp, ctr_el0 // read CTR
- ubfm \tmp, \tmp, #16, #19 // cache line size encoding
+ .macro raw_dcache_line_size, reg, tmp, ctr
+ mrs \ctr, ctr_el0 // read CTR
+ ubfm \tmp, \ctr, #16, #19 // cache line size encoding
mov \reg, #4 // bytes per word
lsl \reg, \reg, \tmp // actual cache line size
.endm
@@ -344,9 +345,9 @@
/*
* dcache_line_size - get the safe D-cache line size across all CPUs
*/
- .macro dcache_line_size, reg, tmp
- read_ctr \tmp
- ubfm \tmp, \tmp, #16, #19 // cache line size encoding
+ .macro dcache_line_size, reg, tmp, ctr
+ read_ctr \ctr
+ ubfm \tmp, \ctr, #16, #19 // cache line size encoding
mov \reg, #4 // bytes per word
lsl \reg, \reg, \tmp // actual cache line size
.endm
@@ -355,9 +356,9 @@
* raw_icache_line_size - get the minimum I-cache line size on this CPU
* from the CTR register.
*/
- .macro raw_icache_line_size, reg, tmp
- mrs \tmp, ctr_el0 // read CTR
- and \tmp, \tmp, #0xf // cache line size encoding
+ .macro raw_icache_line_size, reg, tmp, ctr
+ mrs \ctr, ctr_el0 // read CTR
+ and \tmp, \ctr, #0xf // cache line size encoding
mov \reg, #4 // bytes per word
lsl \reg, \reg, \tmp // actual cache line size
.endm
@@ -365,9 +366,9 @@
/*
* icache_line_size - get the safe I-cache line size across all CPUs
*/
- .macro icache_line_size, reg, tmp
- read_ctr \tmp
- and \tmp, \tmp, #0xf // cache line size encoding
+ .macro icache_line_size, reg, tmp, ctr
+ read_ctr \ctr
+ and \tmp, \ctr, #0xf // cache line size encoding
mov \reg, #4 // bytes per word
lsl \reg, \reg, \tmp // actual cache line size
.endm
@@ -408,13 +409,21 @@
* size: size of the region
* Corrupts: kaddr, size, tmp1, tmp2
*/
- .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
- dcache_line_size \tmp1, \tmp2
+ .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2, tmp3
+ dcache_line_size \tmp1, \tmp2, \tmp3
add \size, \kaddr, \size
sub \tmp2, \tmp1, #1
bic \kaddr, \kaddr, \tmp2
9998:
- .if (\op == cvau || \op == cvac)
+ .if (\op == cvau)
+alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
+ tbnz \tmp3, #CTR_IDC_SHIFT, 9997f
+ dc cvau, \kaddr
+alternative_else
+ dc civac, \kaddr
+ nop
+alternative_endif
+ .elseif (\op == cvac)
alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
dc \op, \kaddr
alternative_else
@@ -433,6 +442,7 @@
cmp \kaddr, \size
b.lo 9998b
dsb \domain
+9997:
.endm

/*
@@ -441,10 +451,11 @@
*
* start, end: virtual addresses describing the region
* label: A label to branch to on user fault.
- * Corrupts: tmp1, tmp2
+ * Corrupts: tmp1, tmp2, tmp3
*/
- .macro invalidate_icache_by_line start, end, tmp1, tmp2, label
- icache_line_size \tmp1, \tmp2
+ .macro invalidate_icache_by_line start, end, tmp1, tmp2, tmp3, label
+ icache_line_size \tmp1, \tmp2, \tmp3
+ tbnz \tmp3, #CTR_DIC_SHIFT, 9996f
sub \tmp2, \tmp1, #1
bic \tmp2, \start, \tmp2
9997:
@@ -454,6 +465,7 @@
b.lo 9997b
dsb ish
isb
+9996:
.endm

/*
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index ea9bb4e..aea533b 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -22,6 +22,8 @@
#define CTR_L1IP_MASK 3
#define CTR_CWG_SHIFT 24
#define CTR_CWG_MASK 15
+#define CTR_IDC_SHIFT 28
+#define CTR_DIC_SHIFT 29

#define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 29b1f87..f42bb5a 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -200,6 +200,8 @@ static int __init register_cpu_hwcaps_dumper(void)

static const struct arm64_ftr_bits ftr_ctr[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 0), /* DIC */
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 0), /* IDC */
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0), /* CWG */
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), /* ERG */
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1), /* DminLine */
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 758bde7..5764af8 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -24,6 +24,7 @@
#include <asm/cpufeature.h>
#include <asm/alternative.h>
#include <asm/asm-uaccess.h>
+#include <asm/cache.h>

/*
* flush_icache_range(start,end)
@@ -50,7 +51,12 @@ ENTRY(flush_icache_range)
*/
ENTRY(__flush_cache_user_range)
uaccess_ttbr0_enable x2, x3, x4
- dcache_line_size x2, x3
+ dcache_line_size x2, x3, x4
+alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
+ tbnz x4, #CTR_IDC_SHIFT, 8f
+alternative_else
+ nop
+alternative_endif
sub x3, x2, #1
bic x4, x0, x3
1:
@@ -60,7 +66,9 @@ user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE
b.lo 1b
dsb ish

- invalidate_icache_by_line x0, x1, x2, x3, 9f
+8:
+ invalidate_icache_by_line x0, x1, x2, x3, x4, 9f
+
mov x0, #0
1:
uaccess_ttbr0_disable x1, x2
@@ -82,7 +90,7 @@ ENDPROC(__flush_cache_user_range)
ENTRY(invalidate_icache_range)
uaccess_ttbr0_enable x2, x3, x4

- invalidate_icache_by_line x0, x1, x2, x3, 2f
+ invalidate_icache_by_line x0, x1, x2, x3, x4, 2f
mov x0, xzr
1:
uaccess_ttbr0_disable x1, x2
@@ -102,7 +110,7 @@ ENDPROC(invalidate_icache_range)
* - size - size in question
*/
ENTRY(__flush_dcache_area)
- dcache_by_line_op civac, sy, x0, x1, x2, x3
+ dcache_by_line_op civac, sy, x0, x1, x2, x3, x4
ret
ENDPIPROC(__flush_dcache_area)

@@ -116,7 +124,7 @@ ENDPIPROC(__flush_dcache_area)
* - size - size in question
*/
ENTRY(__clean_dcache_area_pou)
- dcache_by_line_op cvau, ish, x0, x1, x2, x3
+ dcache_by_line_op cvau, ish, x0, x1, x2, x3, x4
ret
ENDPROC(__clean_dcache_area_pou)

@@ -140,7 +148,7 @@ ENTRY(__inval_dcache_area)
*/
__dma_inv_area:
add x1, x1, x0
- dcache_line_size x2, x3
+ dcache_line_size x2, x3, x4
sub x3, x2, #1
tst x1, x3 // end cache line aligned?
bic x1, x1, x3
@@ -178,7 +186,7 @@ ENTRY(__clean_dcache_area_poc)
* - size - size in question
*/
__dma_clean_area:
- dcache_by_line_op cvac, sy, x0, x1, x2, x3
+ dcache_by_line_op cvac, sy, x0, x1, x2, x3, x4
ret
ENDPIPROC(__clean_dcache_area_poc)
ENDPROC(__dma_clean_area)
@@ -193,7 +201,7 @@ ENDPROC(__dma_clean_area)
* - size - size in question
*/
ENTRY(__clean_dcache_area_pop)
- dcache_by_line_op cvap, sy, x0, x1, x2, x3
+ dcache_by_line_op cvap, sy, x0, x1, x2, x3, x4
ret
ENDPIPROC(__clean_dcache_area_pop)

@@ -206,7 +214,7 @@ ENDPIPROC(__clean_dcache_area_pop)
* - size - size in question
*/
ENTRY(__dma_flush_area)
- dcache_by_line_op civac, sy, x0, x1, x2, x3
+ dcache_by_line_op civac, sy, x0, x1, x2, x3, x4
ret
ENDPIPROC(__dma_flush_area)

--
Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.



2018-02-19 14:39:19

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.IDC

On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
> Two point of unification cache maintenance operations 'DC CVAU' and
> 'IC IVAU' are optional for implementors as per ARMv8 specification.
> This patch parses the updated CTR_EL0 register definition and adds
> the required changes to skip POU operations if the hardware reports
> CTR_EL0.IDC and/or CTR_EL0.IDC.
>
> CTR_EL0.DIC: Instruction cache invalidation requirements for
> instruction to data coherence. The meaning of this bit[29].
> 0: Instruction cache invalidation to the point of unification
> is required for instruction to data coherence.
> 1: Instruction cache cleaning to the point of unification is
> not required for instruction to data coherence.
>
> CTR_EL0.IDC: Data cache clean requirements for instruction to data
> coherence. The meaning of this bit[28].
> 0: Data cache clean to the point of unification is required for
> instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
> or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
> 1: Data cache clean to the point of unification is not required
> for instruction to data coherence.

There is a difference between cache maintenance to PoU "is not required"
and the actual instructions being optional (i.e. undef when executed).
If your caches are transparent and DC CVAU/IC IVAU is not required,
these instructions should behave as NOPs. So, are you trying to improve
the performance of the cache maintenance routines in the kernel? If yes,
please show some (relative) numbers and a better description in the
commit log.

On the patch, I'd rather have an alternative framework entry for no VAU
cache maint required and some ret instruction at the beginning of the
cache maint function rather than jumping out of the loop somewhere
inside the cache maintenance code, penalising the CPUs that do require
it.

--
Catalin

2018-02-19 14:44:33

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.IDC

Hi Shanker,

On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
> Two point of unification cache maintenance operations 'DC CVAU' and
> 'IC IVAU' are optional for implementors as per ARMv8 specification.
> This patch parses the updated CTR_EL0 register definition and adds
> the required changes to skip POU operations if the hardware reports
> CTR_EL0.IDC and/or CTR_EL0.IDC.
>
> CTR_EL0.DIC: Instruction cache invalidation requirements for
> instruction to data coherence. The meaning of this bit[29].
> 0: Instruction cache invalidation to the point of unification
> is required for instruction to data coherence.
> 1: Instruction cache cleaning to the point of unification is
> not required for instruction to data coherence.
>
> CTR_EL0.IDC: Data cache clean requirements for instruction to data
> coherence. The meaning of this bit[28].
> 0: Data cache clean to the point of unification is required for
> instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
> or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
> 1: Data cache clean to the point of unification is not required
> for instruction to data coherence.
>
> Signed-off-by: Philip Elcan <[email protected]>
> Signed-off-by: Shanker Donthineni <[email protected]>
> ---
> arch/arm64/include/asm/assembler.h | 48 ++++++++++++++++++++++++--------------
> arch/arm64/include/asm/cache.h | 2 ++
> arch/arm64/kernel/cpufeature.c | 2 ++
> arch/arm64/mm/cache.S | 26 ++++++++++++++-------
> 4 files changed, 51 insertions(+), 27 deletions(-)

I was looking at our CTR_EL0 code last week but forgot to post the patch I
wrote fixing up some of the fields. I have just sent it now, so please can
you rebase on top of:

http://lists.infradead.org/pipermail/linux-arm-kernel/2018-February/560488.html

Also:

> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> index ea9bb4e..aea533b 100644
> --- a/arch/arm64/include/asm/cache.h
> +++ b/arch/arm64/include/asm/cache.h
> @@ -22,6 +22,8 @@
> #define CTR_L1IP_MASK 3
> #define CTR_CWG_SHIFT 24
> #define CTR_CWG_MASK 15
> +#define CTR_IDC_SHIFT 28
> +#define CTR_DIC_SHIFT 29
>
> #define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
>
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 29b1f87..f42bb5a 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -200,6 +200,8 @@ static int __init register_cpu_hwcaps_dumper(void)
>
> static const struct arm64_ftr_bits ftr_ctr[] = {
> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */
> + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 0), /* DIC */
> + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 0), /* IDC */
> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0), /* CWG */
> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), /* ERG */
> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1), /* DminLine */

Could you update the other table entries here to use the CTR_*_SHIFT values
as well?

Thanks,

Will

2018-02-19 16:36:32

by Shanker Donthineni

[permalink] [raw]
Subject: Re: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.IDC

Hi Catalin,

On 02/19/2018 08:38 AM, Catalin Marinas wrote:
> On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
>> Two point of unification cache maintenance operations 'DC CVAU' and
>> 'IC IVAU' are optional for implementors as per ARMv8 specification.
>> This patch parses the updated CTR_EL0 register definition and adds
>> the required changes to skip POU operations if the hardware reports
>> CTR_EL0.IDC and/or CTR_EL0.IDC.
>>
>> CTR_EL0.DIC: Instruction cache invalidation requirements for
>> instruction to data coherence. The meaning of this bit[29].
>> 0: Instruction cache invalidation to the point of unification
>> is required for instruction to data coherence.
>> 1: Instruction cache cleaning to the point of unification is
>> not required for instruction to data coherence.
>>
>> CTR_EL0.IDC: Data cache clean requirements for instruction to data
>> coherence. The meaning of this bit[28].
>> 0: Data cache clean to the point of unification is required for
>> instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
>> or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
>> 1: Data cache clean to the point of unification is not required
>> for instruction to data coherence.
>
> There is a difference between cache maintenance to PoU "is not required"
> and the actual instructions being optional (i.e. undef when executed).
> If your caches are transparent and DC CVAU/IC IVAU is not required,
> these instructions should behave as NOPs. So, are you trying to improve
> the performance of the cache maintenance routines in the kernel? If yes,
> please show some (relative) numbers and a better description in the
> commit log.
>

Yes, I agree with you: PoU instructions are NOPs if the caches are transparent.
There is no issue from a correctness point of view, but they cause unnecessary
overhead in the ASM routines, where the code walks through the VA range in
cache-line-size increments. This overhead is noticeable with 64K pages, especially with
section mappings. I'll reword the commit text to reflect your comments in the v2 patch.

e.g. a 512M section with a 64K PAGE_SIZE kernel, assuming a 64-byte cache line size.
flush_icache_range() consumes around 256M cpu cycles

Icache loop overhead: 512Mbytes / 64Bytes * 4 instructions per loop
Dcache loop overhead: 512Mbytes / 64Bytes * 4 instructions per loop


With this patch it takes less than ~1K cycles.


> On the patch, I'd rather have an alternative framework entry for no VAU
> cache maint required and some ret instruction at the beginning of the
> cache maint function rather than jumping out of the loop somewhere
> inside the cache maintenance code, penalising the CPUs that do require
> it.
>

The alternatives framework might break things in the case of CPU hotplug. I would
like one more confirmation from you before incorporating the alternatives framework.

--
Shanker Donthineni
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.

2018-02-19 16:37:59

by Shanker Donthineni

[permalink] [raw]
Subject: Re: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.IDC

Hi Will,

On 02/19/2018 08:43 AM, Will Deacon wrote:
> Hi Shanker,
>
> On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
>> Two point of unification cache maintenance operations 'DC CVAU' and
>> 'IC IVAU' are optional for implementors as per ARMv8 specification.
>> This patch parses the updated CTR_EL0 register definition and adds
>> the required changes to skip POU operations if the hardware reports
>> CTR_EL0.IDC and/or CTR_EL0.IDC.
>>
>> CTR_EL0.DIC: Instruction cache invalidation requirements for
>> instruction to data coherence. The meaning of this bit[29].
>> 0: Instruction cache invalidation to the point of unification
>> is required for instruction to data coherence.
>> 1: Instruction cache cleaning to the point of unification is
>> not required for instruction to data coherence.
>>
>> CTR_EL0.IDC: Data cache clean requirements for instruction to data
>> coherence. The meaning of this bit[28].
>> 0: Data cache clean to the point of unification is required for
>> instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
>> or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
>> 1: Data cache clean to the point of unification is not required
>> for instruction to data coherence.
>>
>> Signed-off-by: Philip Elcan <[email protected]>
>> Signed-off-by: Shanker Donthineni <[email protected]>
>> ---
>> arch/arm64/include/asm/assembler.h | 48 ++++++++++++++++++++++++--------------
>> arch/arm64/include/asm/cache.h | 2 ++
>> arch/arm64/kernel/cpufeature.c | 2 ++
>> arch/arm64/mm/cache.S | 26 ++++++++++++++-------
>> 4 files changed, 51 insertions(+), 27 deletions(-)
>
> I was looking at our CTR_EL0 code last week but forgot to post the patch I
> wrote fixing up some of the fields. I just send it now, so please can
> you rebase on top of:
>
> http://lists.infradead.org/pipermail/linux-arm-kernel/2018-February/560488.html
>
> Also:
>
>> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
>> index ea9bb4e..aea533b 100644
>> --- a/arch/arm64/include/asm/cache.h
>> +++ b/arch/arm64/include/asm/cache.h
>> @@ -22,6 +22,8 @@
>> #define CTR_L1IP_MASK 3
>> #define CTR_CWG_SHIFT 24
>> #define CTR_CWG_MASK 15
>> +#define CTR_IDC_SHIFT 28
>> +#define CTR_DIC_SHIFT 29
>>
>> #define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
>>
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 29b1f87..f42bb5a 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -200,6 +200,8 @@ static int __init register_cpu_hwcaps_dumper(void)
>>
>> static const struct arm64_ftr_bits ftr_ctr[] = {
>> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */
>> + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 0), /* DIC */
>> + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 0), /* IDC */
>> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0), /* CWG */
>> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), /* ERG */
>> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1), /* DminLine */
>
> Could you update the other table entries here to use the CTR_*_SHIFT values
> as well?
>

I'll do.

> Thanks,
>
> Will
>
> _______________________________________________
> linux-arm-kernel mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>

--
Shanker Donthineni
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.

2018-02-19 17:20:28

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.IDC

On Mon, Feb 19, 2018 at 10:35:30AM -0600, Shanker Donthineni wrote:
> On 02/19/2018 08:38 AM, Catalin Marinas wrote:
> > On the patch, I'd rather have an alternative framework entry for no VAU
> > cache maint required and some ret instruction at the beginning of the
> > cache maint function rather than jumping out of the loop somewhere
> > inside the cache maintenance code, penalising the CPUs that do require
> > it.
>
> Alternative framework might break things in case of CPU hotplug. I need one
> more confirmation from you on incorporating alternative framework.

CPU hotplug can be an issue but it should be handled like other similar
cases: if a CPU comes online late and its features are incompatible, it
should not be brought online. The cpufeature code handles this.

With Will's patch for CTR_EL0, we handle different CPU features during
boot, defaulting to the lowest value for the IDC/DIC bits.

I suggest you add new ARM64_HAS_* feature bits and enable them based on
CTR_EL0.IDC and DIC. You could check for both being 1 with a single
feature bit but I guess an implementation is allowed to have these
different (e.g. DIC == 0 and IDC == 1).

--
Catalin

2018-02-19 18:33:16

by Shanker Donthineni

[permalink] [raw]
Subject: Re: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.IDC

Thanks Catalin for your comments.

On 02/19/2018 11:18 AM, Catalin Marinas wrote:
> On Mon, Feb 19, 2018 at 10:35:30AM -0600, Shanker Donthineni wrote:
>> On 02/19/2018 08:38 AM, Catalin Marinas wrote:
>>> On the patch, I'd rather have an alternative framework entry for no VAU
>>> cache maint required and some ret instruction at the beginning of the
>>> cache maint function rather than jumping out of the loop somewhere
>>> inside the cache maintenance code, penalising the CPUs that do require
>>> it.
>>
>> Alternative framework might break things in case of CPU hotplug. I need one
>> more confirmation from you on incorporating alternative framework.
>
> CPU hotplug can be an issue but it should be handled like other similar
> cases: if a CPU comes online late and its features are incompatible, it
> should not be brought online. The cpufeature code handles this.
>
> With Will's patch for CTR_EL0, we handle different CPU features during
> boot, defaulting to the lowest value for the IDC/DIC bits.
>
> I suggest you add new ARM64_HAS_* feature bits and enable them based on
> CTR_EL0.IDC and DIC. You could check for both being 1 with a single
> feature bit but I guess an implementation is allowed to have these
> different (e.g. DIC == 0 and IDC == 1).
>

I'll add two new features, ARM64_HAS_DIC and ARM64_HAS_IDC, to support
all implementations. Unfortunately, QCOM server chips support IDC but not DIC.


--
Shanker Donthineni
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.

2018-02-20 02:14:15

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH] arm64: Add support for new control bits CTR_EL0.IDC and CTR_EL0.IDC

Hi Shanker,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc2 next-20180219]
[cannot apply to arm64/for-next/core]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url: https://github.com/0day-ci/linux/commits/Shanker-Donthineni/arm64-Add-support-for-new-control-bits-CTR_EL0-IDC-and-CTR_EL0-IDC/20180219-031155
config: arm64-defconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm64

All errors (new ones prefixed by >>):

arch/arm64/kernel/hibernate-asm.S: Assembler messages:
>> arch/arm64/kernel/hibernate-asm.S:101: Error: unexpected comma after the mnemonic name `mrs' -- `mrs ,ctr_el0'
>> arch/arm64/kernel/hibernate-asm.S:101: Error: operand 2 must be an integer register -- `ubfm x3,,#16,#19'
--
arch/arm64/kernel/relocate_kernel.S: Assembler messages:
>> arch/arm64/kernel/relocate_kernel.S:37: Error: unexpected comma after the mnemonic name `mrs' -- `mrs ,ctr_el0'
>> arch/arm64/kernel/relocate_kernel.S:37: Error: operand 2 must be an integer register -- `ubfm x0,,#16,#19'

vim +101 arch/arm64/kernel/hibernate-asm.S

82869ac57 James Morse 2016-04-27 46
82869ac57 James Morse 2016-04-27 47
82869ac57 James Morse 2016-04-27 48 /*
82869ac57 James Morse 2016-04-27 49 * Resume from hibernate
82869ac57 James Morse 2016-04-27 50 *
82869ac57 James Morse 2016-04-27 51 * Loads temporary page tables then restores the memory image.
82869ac57 James Morse 2016-04-27 52 * Finally branches to cpu_resume() to restore the state saved by
82869ac57 James Morse 2016-04-27 53 * swsusp_arch_suspend().
82869ac57 James Morse 2016-04-27 54 *
82869ac57 James Morse 2016-04-27 55 * Because this code has to be copied to a 'safe' page, it can't call out to
82869ac57 James Morse 2016-04-27 56 * other functions by PC-relative address. Also remember that it may be
82869ac57 James Morse 2016-04-27 57 * mid-way through over-writing other functions. For this reason it contains
82869ac57 James Morse 2016-04-27 58 * code from flush_icache_range() and uses the copy_page() macro.
82869ac57 James Morse 2016-04-27 59 *
82869ac57 James Morse 2016-04-27 60 * This 'safe' page is mapped via ttbr0, and executed from there. This function
82869ac57 James Morse 2016-04-27 61 * switches to a copy of the linear map in ttbr1, performs the restore, then
82869ac57 James Morse 2016-04-27 62 * switches ttbr1 to the original kernel's swapper_pg_dir.
82869ac57 James Morse 2016-04-27 63 *
82869ac57 James Morse 2016-04-27 64 * All of memory gets written to, including code. We need to clean the kernel
82869ac57 James Morse 2016-04-27 65 * text to the Point of Coherence (PoC) before secondary cores can be booted.
82869ac57 James Morse 2016-04-27 66 * Because the kernel modules and executable pages mapped to user space are
82869ac57 James Morse 2016-04-27 67 * also written as data, we clean all pages we touch to the Point of
82869ac57 James Morse 2016-04-27 68 * Unification (PoU).
82869ac57 James Morse 2016-04-27 69 *
82869ac57 James Morse 2016-04-27 70 * x0: physical address of temporary page tables
82869ac57 James Morse 2016-04-27 71 * x1: physical address of swapper page tables
82869ac57 James Morse 2016-04-27 72 * x2: address of cpu_resume
82869ac57 James Morse 2016-04-27 73 * x3: linear map address of restore_pblist in the current kernel
82869ac57 James Morse 2016-04-27 74 * x4: physical address of __hyp_stub_vectors, or 0
82869ac57 James Morse 2016-04-27 75 * x5: physical address of a zero page that remains zero after resume
82869ac57 James Morse 2016-04-27 76 */
82869ac57 James Morse 2016-04-27 77 .pushsection ".hibernate_exit.text", "ax"
82869ac57 James Morse 2016-04-27 78 ENTRY(swsusp_arch_suspend_exit)
82869ac57 James Morse 2016-04-27 79 /*
82869ac57 James Morse 2016-04-27 80 * We execute from ttbr0, change ttbr1 to our copied linear map tables
82869ac57 James Morse 2016-04-27 81 * with a break-before-make via the zero page
82869ac57 James Morse 2016-04-27 82 */
529c4b05a Kristina Martsenko 2017-12-13 83 break_before_make_ttbr_switch x5, x0, x6
82869ac57 James Morse 2016-04-27 84
82869ac57 James Morse 2016-04-27 85 mov x21, x1
82869ac57 James Morse 2016-04-27 86 mov x30, x2
82869ac57 James Morse 2016-04-27 87 mov x24, x4
82869ac57 James Morse 2016-04-27 88 mov x25, x5
82869ac57 James Morse 2016-04-27 89
82869ac57 James Morse 2016-04-27 90 /* walk the restore_pblist and use copy_page() to over-write memory */
82869ac57 James Morse 2016-04-27 91 mov x19, x3
82869ac57 James Morse 2016-04-27 92
82869ac57 James Morse 2016-04-27 93 1: ldr x10, [x19, #HIBERN_PBE_ORIG]
82869ac57 James Morse 2016-04-27 94 mov x0, x10
82869ac57 James Morse 2016-04-27 95 ldr x1, [x19, #HIBERN_PBE_ADDR]
82869ac57 James Morse 2016-04-27 96
82869ac57 James Morse 2016-04-27 97 copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9
82869ac57 James Morse 2016-04-27 98
82869ac57 James Morse 2016-04-27 99 add x1, x10, #PAGE_SIZE
82869ac57 James Morse 2016-04-27 100 /* Clean the copied page to PoU - based on flush_icache_range() */
072f0a633 Suzuki K Poulose 2016-09-09 @101 raw_dcache_line_size x2, x3
82869ac57 James Morse 2016-04-27 102 sub x3, x2, #1
82869ac57 James Morse 2016-04-27 103 bic x4, x10, x3
82869ac57 James Morse 2016-04-27 104 2: dc cvau, x4 /* clean D line / unified line */
82869ac57 James Morse 2016-04-27 105 add x4, x4, x2
82869ac57 James Morse 2016-04-27 106 cmp x4, x1
82869ac57 James Morse 2016-04-27 107 b.lo 2b
82869ac57 James Morse 2016-04-27 108
82869ac57 James Morse 2016-04-27 109 ldr x19, [x19, #HIBERN_PBE_NEXT]
82869ac57 James Morse 2016-04-27 110 cbnz x19, 1b
82869ac57 James Morse 2016-04-27 111 dsb ish /* wait for PoU cleaning to finish */
82869ac57 James Morse 2016-04-27 112
82869ac57 James Morse 2016-04-27 113 /* switch to the restored kernels page tables */
529c4b05a Kristina Martsenko 2017-12-13 114 break_before_make_ttbr_switch x25, x21, x6
82869ac57 James Morse 2016-04-27 115
82869ac57 James Morse 2016-04-27 116 ic ialluis
82869ac57 James Morse 2016-04-27 117 dsb ish
82869ac57 James Morse 2016-04-27 118 isb
82869ac57 James Morse 2016-04-27 119
82869ac57 James Morse 2016-04-27 120 cbz x24, 3f /* Do we need to re-initialise EL2? */
82869ac57 James Morse 2016-04-27 121 hvc #0
82869ac57 James Morse 2016-04-27 122 3: ret
82869ac57 James Morse 2016-04-27 123
82869ac57 James Morse 2016-04-27 124 .ltorg
82869ac57 James Morse 2016-04-27 125 ENDPROC(swsusp_arch_suspend_exit)
82869ac57 James Morse 2016-04-27 126

:::::: The code at line 101 was first introduced by commit
:::::: 072f0a633838aca13b5a8b211eb64f5c445cfd7c arm64: Introduce raw_{d,i}cache_line_size

:::::: TO: Suzuki K Poulose <[email protected]>
:::::: CC: Will Deacon <[email protected]>

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation


Attachments:
(No filename) (7.95 kB)
.config.gz (37.27 kB)
Download all attachments