Note: I was working on these patches for quite some time
and realized that Toshi Kani has shared some patches
addressing the same issue with the subject
"[PATCH 0/2] fix memory leak / panic in ioremap huge pages".
I've taken a slightly different approach here, so I am
finally sending them to the list.
This patch series attempts to fix the issue described in
this discussion: https://lkml.org/lkml/2017/12/23/3
In summary, CONFIG_HAVE_ARCH_HUGE_VMAP has two issues
observed on ARM64:
1. Stale TLB entries
2. Page table leaks
Will Deacon has bypassed huge mappings for ARM64
until these issues are fixed properly.
I've tested these patches on an ARM64-based SDM845
with a 4.9 kernel. Hanjun Guo <[email protected]>
shared a test case to reproduce this issue in
https://patchwork.kernel.org/patch/10134581/
However, I had no luck reproducing this issue with
exactly that test. I added some more code to reproduce
it, and here is my test, which surfaces this issue within
about five hours of testing.
Test Code:
#define pr_fmt(fmt) "IOREMAP_TEST: " fmt
#include <linux/console.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/io.h>
#include <linux/kthread.h>
static int io_remap_test(void *arg)
{
phys_addr_t phy_addr_1 = 0x98000000;
unsigned long block_size = 0x200000;
unsigned long total_size = 0x400000;
unsigned long va_addr_3;
unsigned long va_addr_4;
struct vm_struct *area;
/*
* run this test for 10 hours
*/
u32 times_ms = 10 * 60 * 60 * 1000;
unsigned long timeout = jiffies + msecs_to_jiffies(times_ms);
int rand_type;
u32 rand_value;	/* unsigned so the '%' operations below cannot go negative */
u32 mem_size;
int i;
area = get_vm_area(total_size, VM_IOREMAP);
va_addr_3 = (unsigned long)area->addr;
va_addr_4 = va_addr_3 + block_size;
pr_err("Virtual area %lx -- %lx\n", va_addr_3, va_addr_3 + total_size);
ioremap_page_range(va_addr_3, va_addr_3 + block_size, phy_addr_1, __pgprot(PROT_DEVICE_nGnRE));
pr_err("My tests will run now 2\n");
do {
get_random_bytes(&rand_value, sizeof(u32));
rand_type = rand_value % 6;
switch (rand_type) {
case 0:
mem_size = 0x1000;
break;
case 1:
mem_size = 0x8000;
break;
case 2:
mem_size = 0x200000;
break;
case 3:
mem_size = 0x30000;
break;
case 4:
mem_size = 0x10000;
break;
case 5:
mem_size = 0x40000;
break;
default:
mem_size = 0x4000;
break;
}
/*
* Prompt TLB to speculatively pre-fetch
*/
readq(va_addr_3 + block_size - 0x20);
readq(va_addr_3 + block_size - 0x18);
readq(va_addr_3 + block_size - 0x10);
readq(va_addr_3 + block_size - 0x8);
ioremap_page_range(va_addr_4, va_addr_4 + mem_size, phy_addr_1, __pgprot(PROT_DEVICE_nGnRE));
if (mem_size >= 0x200000 && !(rand_value%10000)) {
schedule();
pr_err("possible error case\n");
}
/*
* Make CPU aware about our access pattern
* Also, these accesses should lead to crash
*/
for (i = 0; i < 5; i++) {
readq(va_addr_3 + block_size - 0x20);
readq(va_addr_3 + block_size - 0x18);
readq(va_addr_3 + block_size - 0x10);
readq(va_addr_3 + block_size - 0x8);
// Crash is expected here
readq(va_addr_4);
readq(va_addr_4 + 0x8);
readq(va_addr_4 + 0x10);
readq(va_addr_4 + 0x18);
readq(va_addr_4 + 0x20);
}
unmap_kernel_range(va_addr_4, mem_size);
} while (time_before(jiffies, timeout));
pr_err("my tests ended now\n");
return 0;
}
static int __init ioremap_testing_init(void)
{
struct task_struct *t;
pr_err("my tests will run now 1\n");
t = kthread_create(&io_remap_test, NULL, "ioremap-testing");
/*
* Do this so that we can run this thread on GOLD cores
*/
kthread_bind(t, 6);
wake_up_process(t);
return 0;
}
late_initcall(ioremap_testing_init);
Chintan Pandya (4):
asm/tlbflush: Add flush_tlb_pgtable() for ARM64
ioremap: Invalidate TLB after huge mappings
arm64: Fix the page leak in pud/pmd_set_huge
Revert "arm64: Enforce BBM for huge IO/VMAP mappings"
arch/arm64/include/asm/tlbflush.h | 5 +++++
arch/arm64/mm/mmu.c | 17 ++++++++---------
include/asm-generic/tlb.h | 6 ++++++
lib/ioremap.c | 9 +++++++--
4 files changed, 26 insertions(+), 11 deletions(-)
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
While setting a huge page, we need to take care of
any previously existing next-level mapping. Since
we are going to overwrite the previous mapping, the
only reference to the next-level page table will be
lost and that page table will become a zombie,
occupying space forever. So, free it before
overriding.
Signed-off-by: Chintan Pandya <[email protected]>
---
arch/arm64/mm/mmu.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8c704f1..c0df264 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -32,7 +32,7 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
-
+#include <linux/hugetlb.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
#include <asm/fixmap.h>
@@ -45,6 +45,7 @@
#include <asm/memblock.h>
#include <asm/mmu_context.h>
#include <asm/ptdump.h>
+#include <asm/page.h>
#define NO_BLOCK_MAPPINGS BIT(0)
#define NO_CONT_MAPPINGS BIT(1)
@@ -939,6 +940,9 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
return 0;
BUG_ON(phys & ~PUD_MASK);
+ if (pud_val(*pud) && !pud_huge(*pud))
+ free_page((unsigned long)__va(pud_val(*pud)));
+
set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
return 1;
}
@@ -953,6 +957,9 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
return 0;
BUG_ON(phys & ~PMD_MASK);
+ if (pmd_val(*pmd) && !pmd_huge(*pmd))
+ free_page((unsigned long)__va(pmd_val(*pmd)));
+
set_pmd(pmdp, pfn_pmd(__phys_to_pfn(phys), sect_prot));
return 1;
}
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
Commit 15122ee2c515a ("arm64: Enforce BBM for huge
IO/VMAP mappings") is a temporary workaround until the
issues with CONFIG_HAVE_ARCH_HUGE_VMAP get fixed.
Revert this change as we now have fixes for the issue.
Signed-off-by: Chintan Pandya <[email protected]>
---
arch/arm64/mm/mmu.c | 8 --------
1 file changed, 8 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index c0df264..19116c6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -935,10 +935,6 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
pgprot_val(mk_sect_prot(prot)));
- /* ioremap_page_range doesn't honour BBM */
- if (pud_present(READ_ONCE(*pudp)))
- return 0;
-
BUG_ON(phys & ~PUD_MASK);
if (pud_val(*pud) && !pud_huge(*pud))
free_page((unsigned long)__va(pud_val(*pud)));
@@ -952,10 +948,6 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
pgprot_val(mk_sect_prot(prot)));
- /* ioremap_page_range doesn't honour BBM */
- if (pmd_present(READ_ONCE(*pmdp)))
- return 0;
-
BUG_ON(phys & ~PMD_MASK);
if (pmd_val(*pmd) && !pmd_huge(*pmd))
free_page((unsigned long)__va(pmd_val(*pmd)));
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
If huge mappings are enabled, they can override
previously valid intermediate mappings. Some MMUs
can speculatively pre-fetch these intermediate
entries even after unmap, because unmap clears
only the last-level entries in the page table,
keeping the intermediate (pud/pmd) entries valid.
This can potentially lead to stale TLB entries
which need invalidation after map.
Some more info: https://lkml.org/lkml/2017/12/23/3
There is one noted case on ARM64 where such stale
TLB entries cause a 3rd-level translation fault even
after the correct (huge) mapping is available.
See the case below (reproduced locally with tests),
[17505.330123] Unable to handle kernel paging request at virtual address ffffff801ae00000
[17505.338100] pgd = ffffff800a761000
[17505.341566] [ffffff801ae00000] *pgd=000000017e1be003, *pud=000000017e1be003, *pmd=00e8000098000f05
[17505.350704] ------------[ cut here ]------------
[17505.355362] Kernel BUG at ffffff8008238c30 [verbose debug info unavailable]
[17505.362375] Internal error: Oops: 96000007 [#1] PREEMPT SMP
[17505.367996] Modules linked in:
[17505.371114] CPU: 6 PID: 488 Comm: chintan-ioremap Not tainted 4.9.81+ #160
[17505.378039] Hardware name: Qualcomm Technologies, Inc. SDM845 v1 MTP (DT)
[17505.384885] task: ffffffc0e3e61180 task.stack: ffffffc0e3e70000
[17505.390868] PC is at io_remap_test+0x2e0/0x444
[17505.395352] LR is at io_remap_test+0x2d0/0x444
[17505.399835] pc : [<ffffff8008238c30>] lr : [<ffffff8008238c20>] pstate: 60c00005
[17505.407282] sp : ffffffc0e3e73d70
[17505.410624] x29: ffffffc0e3e73d70 x28: ffffff801ae00008
[17505.416031] x27: ffffff801ae00010 x26: ffffff801ae00018
[17505.421436] x25: ffffff801ae00020 x24: ffffff801adfffe0
[17505.426840] x23: ffffff801adfffe8 x22: ffffff801adffff0
[17505.432244] x21: ffffff801adffff8 x20: ffffff801ae00000
[17505.437648] x19: 0000000000000005 x18: 0000000000000000
[17505.443052] x17: 00000000b3409452 x16: 00000000923da470
[17505.448456] x15: 0000000071c9763c x14: 00000000a15658fa
[17505.453860] x13: 000000005cae96bf x12: 00000000e6d5c44a
[17505.459264] x11: 0140000000000000 x10: ffffff80099a1000
[17505.464668] x9 : 0000000000000000 x8 : ffffffc0e3e73d68
[17505.470072] x7 : ffffff80099d3220 x6 : 0000000000000015
[17505.475476] x5 : 00000c00004ad32a x4 : 000000000000000a
[17505.480880] x3 : 000000000682aaab x2 : 0000001345c2ad2e
[17505.486284] x1 : 7d78d61de56639ba x0 : 0000000000000001
Hence, invalidate the TLB once we override a pmd/pud
with a huge mapping.
Signed-off-by: Chintan Pandya <[email protected]>
---
lib/ioremap.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/lib/ioremap.c b/lib/ioremap.c
index b808a39..c1e1341 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
+#include <asm-generic/tlb.h>
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static int __read_mostly ioremap_p4d_capable;
@@ -92,8 +93,10 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
if (ioremap_pmd_enabled() &&
((next - addr) == PMD_SIZE) &&
IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
- if (pmd_set_huge(pmd, phys_addr + addr, prot))
+ if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
+ flush_tlb_pgtable(&init_mm, addr);
continue;
+ }
}
if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
@@ -118,8 +121,10 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
if (ioremap_pud_enabled() &&
((next - addr) == PUD_SIZE) &&
IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
- if (pud_set_huge(pud, phys_addr + addr, prot))
+ if (pud_set_huge(pud, phys_addr + addr, prot)) {
+ flush_tlb_pgtable(&init_mm, addr);
continue;
+ }
}
if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
The ARM64 MMU implements TLB invalidation for
intermediate page tables for a particular VA. This
may or may not be available on other architectures.
So, provide this API hook only for ARM64, for now.
Signed-off-by: Chintan Pandya <[email protected]>
---
arch/arm64/include/asm/tlbflush.h | 5 +++++
include/asm-generic/tlb.h | 6 ++++++
2 files changed, 11 insertions(+)
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 9e82dd7..5f656f0 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -209,6 +209,11 @@ static inline void __flush_tlb_pgtable(struct mm_struct *mm,
dsb(ish);
}
+static inline void flush_tlb_pgtable(struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ __flush_tlb_pgtable(mm, uaddr);
+}
#endif
#endif
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index faddde4..7832c0a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -295,4 +295,10 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
#define tlb_migrate_finish(mm) do {} while (0)
+#ifndef CONFIG_ARM64
+static inline void flush_tlb_pgtable(struct mm_struct *mm,
+ unsigned long uaddr)
+{
+}
+#endif
#endif /* _ASM_GENERIC__TLB_H */
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
On 14/03/18 08:48, Chintan Pandya wrote:
> While setting a huge page, we need to take care of
> any previously existing next-level mapping. Since
> we are going to overwrite the previous mapping, the
> only reference to the next-level page table will be
> lost and that page table will become a zombie,
> occupying space forever. So, free it before
> overriding.
>
> Signed-off-by: Chintan Pandya <[email protected]>
> ---
> arch/arm64/mm/mmu.c | 9 ++++++++-
> 1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 8c704f1..c0df264 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -32,7 +32,7 @@
> #include <linux/io.h>
> #include <linux/mm.h>
> #include <linux/vmalloc.h>
> -
> +#include <linux/hugetlb.h>
> #include <asm/barrier.h>
> #include <asm/cputype.h>
> #include <asm/fixmap.h>
> @@ -45,6 +45,7 @@
> #include <asm/memblock.h>
> #include <asm/mmu_context.h>
> #include <asm/ptdump.h>
> +#include <asm/page.h>
>
> #define NO_BLOCK_MAPPINGS BIT(0)
> #define NO_CONT_MAPPINGS BIT(1)
> @@ -939,6 +940,9 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
> return 0;
>
> BUG_ON(phys & ~PUD_MASK);
> + if (pud_val(*pud) && !pud_huge(*pud))
> + free_page((unsigned long)__va(pud_val(*pud)));
> +
This is absolutely scary. Isn't this page still referenced in the page
tables (assuming patch 4 has been applied too)?
> set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
> return 1;
> }
> @@ -953,6 +957,9 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
> return 0;
>
> BUG_ON(phys & ~PMD_MASK);
> + if (pmd_val(*pmd) && !pmd_huge(*pmd))
> + free_page((unsigned long)__va(pmd_val(*pmd)));
> +
> set_pmd(pmdp, pfn_pmd(__phys_to_pfn(phys), sect_prot));
> return 1;
> }
>
Thanks,
M.
--
Jazz is not dead. It just smells funny...
On 14/03/18 08:48, Chintan Pandya wrote:
> Commit 15122ee2c515a ("arm64: Enforce BBM for huge
> IO/VMAP mappings") is a temporary workaround until the
> issues with CONFIG_HAVE_ARCH_HUGE_VMAP get fixed.
>
> Revert this change as we now have fixes for the issue.
>
> Signed-off-by: Chintan Pandya <[email protected]>
> ---
> arch/arm64/mm/mmu.c | 8 --------
> 1 file changed, 8 deletions(-)
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index c0df264..19116c6 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -935,10 +935,6 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
> pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
> pgprot_val(mk_sect_prot(prot)));
>
> - /* ioremap_page_range doesn't honour BBM */
> - if (pud_present(READ_ONCE(*pudp)))
> - return 0;
> -
> BUG_ON(phys & ~PUD_MASK);
> if (pud_val(*pud) && !pud_huge(*pud))
> free_page((unsigned long)__va(pud_val(*pud)));
> @@ -952,10 +948,6 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
> pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
> pgprot_val(mk_sect_prot(prot)));
>
> - /* ioremap_page_range doesn't honour BBM */
> - if (pmd_present(READ_ONCE(*pmdp)))
> - return 0;
> -
> BUG_ON(phys & ~PMD_MASK);
> if (pmd_val(*pmd) && !pmd_huge(*pmd))
> free_page((unsigned long)__va(pmd_val(*pmd)));
>
But you're still not doing a BBM, right? What prevents a speculative
access from using the (now freed) entry? The TLB invalidation you've
introduced just narrows the window where bad things can happen.
My gut feeling is that this series introduces more bugs than it fixes...
If you're going to fix it, please fix it by correctly implementing BBM
for huge mappings.
Or am I missing something terribly obvious?
M.
--
Jazz is not dead. It just smells funny...
On Wed, Mar 14, 2018 at 02:18:23PM +0530, Chintan Pandya wrote:
> If huge mappings are enabled, they can override
> previously valid intermediate mappings. Some MMUs
> can speculatively pre-fetch these intermediate
> entries even after unmap, because unmap clears
> only the last-level entries in the page table,
> keeping the intermediate (pud/pmd) entries valid.
>
> This can potentially lead to stale TLB entries
> which need invalidation after map.
>
> Some more info: https://lkml.org/lkml/2017/12/23/3
>
> There is one noted case on ARM64 where such stale
> TLB entries cause a 3rd-level translation fault even
> after the correct (huge) mapping is available.
> Hence, invalidate the TLB once we override a pmd/pud
> with a huge mapping.
> static int __read_mostly ioremap_p4d_capable;
> @@ -92,8 +93,10 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
> if (ioremap_pmd_enabled() &&
> ((next - addr) == PMD_SIZE) &&
> IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
> - if (pmd_set_huge(pmd, phys_addr + addr, prot))
> + if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
> + flush_tlb_pgtable(&init_mm, addr);
> continue;
> + }
> }
>
> if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
> @@ -118,8 +121,10 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
> if (ioremap_pud_enabled() &&
> ((next - addr) == PUD_SIZE) &&
> IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
> - if (pud_set_huge(pud, phys_addr + addr, prot))
> + if (pud_set_huge(pud, phys_addr + addr, prot)) {
> + flush_tlb_pgtable(&init_mm, addr);
> continue;
> + }
> }
As has been noted in previous threads, the ARM architecture requires a
Break-Before-Make sequence when changing an entry from a table to a
block, as is the case here.
That means the necessary sequence is:
1. Make the entry invalid
2. Invalidate relevant TLB entries
3. Write the new entry
Whereas above, the sequence is
1. Write the new entry
2. invalidate relevant TLB entries
Which is insufficient, and will lead to a number of problems.
Therefore, NAK to this patch.
Please read up on the Break-Before-Make requirements in the ARM ARM.
Thanks,
Mark.
On Wed, Mar 14, 2018 at 02:18:24PM +0530, Chintan Pandya wrote:
> While setting a huge page, we need to take care of
> any previously existing next-level mapping. Since
> we are going to overwrite the previous mapping, the
> only reference to the next-level page table will be
> lost and that page table will become a zombie,
> occupying space forever. So, free it before
> overriding.
> @@ -939,6 +940,9 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
> return 0;
>
> BUG_ON(phys & ~PUD_MASK);
> + if (pud_val(*pud) && !pud_huge(*pud))
> + free_page((unsigned long)__va(pud_val(*pud)));
> +
> set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
> return 1;
> }
> @@ -953,6 +957,9 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
> return 0;
>
> BUG_ON(phys & ~PMD_MASK);
> + if (pmd_val(*pmd) && !pmd_huge(*pmd))
> + free_page((unsigned long)__va(pmd_val(*pmd)));
> +
As Marc noted, (assuming the subsequent revert is applied) in both of
these cases, these tables are still live, and thus it is not safe to
free them.
Consider that immediately after freeing the pages, they may get
re-allocated elsewhere, where they may be modified. If this happens
before TLB invalidation, page table walks may allocate junk into TLBs,
resulting in a number of problems.
It is *never* safe to free a live page table, therefore NAK to this
patch.
Thanks,
Mark.
On 3/14/2018 4:18 PM, Mark Rutland wrote:
> On Wed, Mar 14, 2018 at 02:18:23PM +0530, Chintan Pandya wrote:
>> If huge mappings are enabled, they can override
>> previously valid intermediate mappings. Some MMUs
>> can speculatively pre-fetch these intermediate
>> entries even after unmap, because unmap clears
>> only the last-level entries in the page table,
>> keeping the intermediate (pud/pmd) entries valid.
>>
>> This can potentially lead to stale TLB entries
>> which need invalidation after map.
>>
>> Some more info: https://lkml.org/lkml/2017/12/23/3
>>
>> There is one noted case on ARM64 where such stale
>> TLB entries cause a 3rd-level translation fault even
>> after the correct (huge) mapping is available.
>
>> Hence, invalidate the TLB once we override a pmd/pud
>> with a huge mapping.
>
>> static int __read_mostly ioremap_p4d_capable;
>> @@ -92,8 +93,10 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>> if (ioremap_pmd_enabled() &&
>> ((next - addr) == PMD_SIZE) &&
>> IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
>> - if (pmd_set_huge(pmd, phys_addr + addr, prot))
>> + if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
>> + flush_tlb_pgtable(&init_mm, addr);
>> continue;
>> + }
>> }
>>
>> if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
>> @@ -118,8 +121,10 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
>> if (ioremap_pud_enabled() &&
>> ((next - addr) == PUD_SIZE) &&
>> IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
>> - if (pud_set_huge(pud, phys_addr + addr, prot))
>> + if (pud_set_huge(pud, phys_addr + addr, prot)) {
>> + flush_tlb_pgtable(&init_mm, addr);
>> continue;
>> + }
>> }
>
> As has been noted in previous threads, the ARM architecture requires a
> Break-Before-Make sequence when changing an entry from a table to a
> block, as is the case here.
>
> That means the necessary sequence is:
>
> 1. Make the entry invalid
> 2. Invalidate relevant TLB entries
> 3. Write the new entry
>
We do this for PTEs. I don't see this applicable to PMDs. Because,
1) To mark any PMD invalid, we need to be sure that next level page
table (I mean all the 512 PTEs) should be zero. That requires us
to scan entire last level page. A big perf hit !
2) We need to perform step 1 for every unmap as we never know which
unmap will make last level page table empty.
Moreover, problem comes only when 4K mapping was followed by 2M
mapping. In all other cases, retaining valid PMD has obvious perf
gain. That's what walk-cache is supposed to be introduced for.
So, I think we should touch only the problematic case and fix it
with a TLB invalidate.
> Whereas above, the sequence is
>
> 1. Write the new entry
> 2. invalidate relevant TLB entries
>
> Which is insufficient, and will lead to a number of problems.
I couldn't think of new problems with this approach. Could you share
any problematic scenarios ?
Also, my test-case runs fine with these patches for 10+ hours.
>
> Therefore, NAK to this patch.
>
> Please read up on the Break-Before-Make requirements in the ARM ARM.
Sure, will get more from here.
>
> Thanks,
> Mark.
>
Thanks for the review Mark.
Chintan
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc. is a member of the Code Aurora Forum, a Linux Foundation
Collaborative Project
On 3/14/2018 4:23 PM, Mark Rutland wrote:
> On Wed, Mar 14, 2018 at 02:18:24PM +0530, Chintan Pandya wrote:
>> While setting a huge page, we need to take care of
>> any previously existing next-level mapping. Since
>> we are going to overwrite the previous mapping, the
>> only reference to the next-level page table will be
>> lost and that page table will become a zombie,
>> occupying space forever. So, free it before
>> overriding.
>
>> @@ -939,6 +940,9 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
>> return 0;
>>
>> BUG_ON(phys & ~PUD_MASK);
>> + if (pud_val(*pud) && !pud_huge(*pud))
>> + free_page((unsigned long)__va(pud_val(*pud)));
>> +
>> set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
>> return 1;
>> }
>> @@ -953,6 +957,9 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
>> return 0;
>>
>> BUG_ON(phys & ~PMD_MASK);
>> + if (pmd_val(*pmd) && !pmd_huge(*pmd))
>> + free_page((unsigned long)__va(pmd_val(*pmd)));
>> +
>
> As Marc noted, (assuming the subsequent revert is applied) in both of
> these cases, these tables are still live, and thus it is not safe to
> free them.
>
> Consider that immediately after freeing the pages, they may get
> re-allocated elsewhere, where they may be modified. If this happens
> before TLB invalidation, page table walks may allocate junk into TLBs,
> resulting in a number of problems.
Ah okay. What about this sequence,
1) I store old PMD/PUD values
2) Update the PMD/PUD with section mapping
3) Invalidate TLB
4) Then free the *leaked* page
>
> It is *never* safe to free a live page table, therefore NAK to this
> patch.
>
> Thanks,
> Mark.
>
Chintan
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc. is a member of the Code Aurora Forum, a Linux Foundation
Collaborative Project
On 3/14/2018 4:16 PM, Marc Zyngier wrote:
> On 14/03/18 08:48, Chintan Pandya wrote:
>> Commit 15122ee2c515a ("arm64: Enforce BBM for huge
>> IO/VMAP mappings") is a temporary workaround until the
>> issues with CONFIG_HAVE_ARCH_HUGE_VMAP get fixed.
>>
>> Revert this change as we now have fixes for the issue.
>>
>> Signed-off-by: Chintan Pandya <[email protected]>
>> ---
>> arch/arm64/mm/mmu.c | 8 --------
>> 1 file changed, 8 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index c0df264..19116c6 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -935,10 +935,6 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
>> pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
>> pgprot_val(mk_sect_prot(prot)));
>>
>> - /* ioremap_page_range doesn't honour BBM */
>> - if (pud_present(READ_ONCE(*pudp)))
>> - return 0;
>> -
>> BUG_ON(phys & ~PUD_MASK);
>> if (pud_val(*pud) && !pud_huge(*pud))
>> free_page((unsigned long)__va(pud_val(*pud)));
>> @@ -952,10 +948,6 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
>> pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
>> pgprot_val(mk_sect_prot(prot)));
>>
>> - /* ioremap_page_range doesn't honour BBM */
>> - if (pmd_present(READ_ONCE(*pmdp)))
>> - return 0;
>> -
>> BUG_ON(phys & ~PMD_MASK);
>> if (pmd_val(*pmd) && !pmd_huge(*pmd))
>> free_page((unsigned long)__va(pmd_val(*pmd)));
>>
>
> But you're still not doing a BBM, right? What prevents a speculative
> access from using the (now freed) entry? The TLB invalidation you've
> introduced just narrows the window where bad things can happen.
Valid point. I will rework these patches.
Thanks Marc.
>
> My gut feeling is that this series introduces more bugs than it fixes...
> If you're going to fix it, please fix it by correctly implementing BBM
> for huge mappings.
>
> Or am I missing something terribly obvious?
>
> M.
>
Chintan
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc. is a member of the Code Aurora Forum, a Linux Foundation
Collaborative Project
On Wed, Mar 14, 2018 at 04:50:35PM +0530, Chintan Pandya wrote:
> On 3/14/2018 4:18 PM, Mark Rutland wrote:
> > On Wed, Mar 14, 2018 at 02:18:23PM +0530, Chintan Pandya wrote:
> > As has been noted in previous threads, the ARM architecture requires a
> > Break-Before-Make sequence when changing an entry from a table to a
> > block, as is the case here.
> >
> > That means the necessary sequence is:
> >
> > 1. Make the entry invalid
> > 2. Invalidate relevant TLB entries
> > 3. Write the new entry
> >
> We do this for PTEs. I don't see this applicable to PMDs.
The architecture requires this for *all* levels of page table, when
certain changes are made. Switching an entry from a table to block (or
vice versa) is one of those changes, and this definitely applies to
PMDs.
> Because,
>
> 1) To mark any PMD invalid, we need to be sure that next level page
> table (I mean all the 512 PTEs) should be zero. That requires us
> to scan entire last level page. A big perf hit !
This is in ioremap code. Under what workload does this constitute a perf
hit?
Regardless, so long as we mark the pmd entry invalid before the TLB
invalidation, we don't need to touch the next level table at all. We
just require a sequence like:
pmd_clear(*pmdp);
flush_tlb_kernel_range(pmd_start_addr, pmd_end_addr);
pmd_set_huge(*pmdp, phys, prot);
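To make that concrete, a minimal sketch of such a Break-Before-Make
update for a kernel mapping could look like the following; the helper
name is made up for illustration, and note that pmd_clear() and
pmd_set_huge() take the pmd_t pointer itself:

/*
 * Sketch only: replace whatever is currently at *pmdp with a block
 * (huge) entry using Break-Before-Make. Not code from this series.
 */
static int bbm_remap_pmd_huge(pmd_t *pmdp, unsigned long addr,
			      phys_addr_t phys, pgprot_t prot)
{
	if (!pmd_none(READ_ONCE(*pmdp))) {
		/* 1. Make the entry invalid */
		pmd_clear(pmdp);
		/* 2. Invalidate any TLB entries covering this range */
		flush_tlb_kernel_range(addr & PMD_MASK,
				       (addr & PMD_MASK) + PMD_SIZE);
	}
	/* 3. Only now write the new (huge) entry */
	return pmd_set_huge(pmdp, phys, prot);
}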
> 2) We need to perform step 1 for every unmap as we never know which
> unmap will make last level page table empty.
Sorry, I don't follow. Could you elaborate on the problem?
> Moreover, problem comes only when 4K mapping was followed by 2M
> mapping. In all other cases, retaining valid PMD has obvious perf
> gain. That's what walk-cache is supposed to be introduced for.
Retaining a valid PMD in the TLB that *differs* from a valid PMD in the
page tables is a big problem.
The architecture requires BBM, as this permits CPUs to allocate PMDs
into TLBs at *any* time, even if there's already PMD in the TLB for a
given address.
Thus, CPUs can allocate *both* valid PMDs into the TLBs. When this
happens, a TLB lookup can:
1) return either of the PMDs.
2) raise a TLB conflict abort.
3) return an amalgamation of the two entries (e.g. provide an erroneous
address).
Note that (3) is particularly scary:
* The CPU could raise an SError if the amalgamated entry is junk.
* If a memory access hits an amalgamated entry, it may use the wrong
physical address, attributes, or permissions, resulting in a number of
potential problems.
* If the amalgamated entry looks like a partial walk, the TLB might try
to perform a walk starting at the physical address in the amalgamated
entry. This would cause page table walks to access bogus addresses,
allocating junk into TLBs, and may result in SErrors or other aborts.
> > Whereas above, the sequence is
> >
> > 1. Write the new entry
> > 2. invalidate relevant TLB entries
> >
> > Which is insufficient, and will lead to a number of problems.
> I couldn't think of new problems with this approach. Could you share
> any problematic scenarios ?
Please see above.
> Also, my test-case runs fine with these patches for 10+ hours.
While this may happen to work on particular platforms, it is not
guaranteed per the architecture, and will fail on others.
Thanks,
Mark.
On Wed, Mar 14, 2018 at 04:57:29PM +0530, Chintan Pandya wrote:
>
>
> On 3/14/2018 4:23 PM, Mark Rutland wrote:
> > On Wed, Mar 14, 2018 at 02:18:24PM +0530, Chintan Pandya wrote:
> > > While setting a huge page, we need to take care of
> > > any previously existing next-level mapping. Since
> > > we are going to overwrite the previous mapping, the
> > > only reference to the next-level page table will be
> > > lost and that page table will become a zombie,
> > > occupying space forever. So, free it before
> > > overriding.
> >
> > > @@ -939,6 +940,9 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
> > > return 0;
> > > BUG_ON(phys & ~PUD_MASK);
> > > + if (pud_val(*pud) && !pud_huge(*pud))
> > > + free_page((unsigned long)__va(pud_val(*pud)));
> > > +
> > > set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
> > > return 1;
> > > }
> > > @@ -953,6 +957,9 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
> > > return 0;
> > > BUG_ON(phys & ~PMD_MASK);
> > > + if (pmd_val(*pmd) && !pmd_huge(*pmd))
> > > + free_page((unsigned long)__va(pmd_val(*pmd)));
> > > +
> >
> > As Marc noted, (assuming the subsequent revert is applied) in both of
> > these cases, these tables are still live, and thus it is not safe to
> > free them.
> >
> > Consider that immediately after freeing the pages, they may get
> > re-allocated elsewhere, where they may be modified. If this happens
> > before TLB invalidation, page table walks may allocate junk into TLBs,
> > resulting in a number of problems.
> Ah okay. What about this sequence,
> 1) I store old PMD/PUD values
> 2) Update the PMD/PUD with section mapping
> 3) Invalidate TLB
> 4) Then free the *leaked* page
You must invalidate the TLB *before* setting the new entry:
1) store the old entry value
2) clear the entry
3) invalidate the TLB
... then you can either:
4) update the entry
5) free the old table
... or:
4) free the old table
5) update the entry
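As a rough illustration, the first of those orderings could look like
this at the pmd level; the helper name is invented, the flush range is
an assumption, and this is a sketch rather than the fix that eventually
lands:

/*
 * Sketch of the first ordering above, at the pmd level. Not the patch
 * that gets merged; names and the flush primitive are assumptions.
 */
static void pmd_replace_with_huge(pmd_t *pmdp, unsigned long addr,
				  phys_addr_t phys, pgprot_t sect_prot)
{
	pmd_t old = READ_ONCE(*pmdp);		/* 1. store the old entry value */
	pte_t *old_table = NULL;

	if (pmd_present(old) && !pmd_huge(old))
		old_table = pte_offset_kernel(pmdp, addr);

	pmd_clear(pmdp);			/* 2. clear the entry */
	flush_tlb_kernel_range(addr & PMD_MASK,	/* 3. invalidate the TLB */
			       (addr & PMD_MASK) + PMD_SIZE);

	/* 4. update the entry */
	set_pmd(pmdp, pfn_pmd(__phys_to_pfn(phys), sect_prot));

	/* 5. only now is it safe to free the old table */
	if (old_table)
		pte_free_kernel(NULL, old_table);
}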
Thanks,
Mark.
On Wed, 2018-03-14 at 14:18 +0530, Chintan Pandya wrote:
> Note: I was working on these patches for quite some time
> and realized that Toshi Kani has shared some patches
> addressing the same issue with the subject
> "[PATCH 0/2] fix memory leak / panic in ioremap huge pages".
> I've taken a slightly different approach here, so I am
> finally sending them to the list.
Hi Chintan,
Do you have any issue in my patchset? If so, can you please comment on
them? It complicates things when you send a different approach
without explaining why a different approach is needed. Your approach
purges TLB after updating pmd/pud, which I think is broken. Can you
work on top of my patchset and properly implement pXd_free_pte_page()
for arm64? I will send out my v2 today.
Thanks,
-Toshi
On 3/14/2018 8:08 PM, Kani, Toshi wrote:
> On Wed, 2018-03-14 at 14:18 +0530, Chintan Pandya wrote:
>> Note: I was working on these patches for quite some time
>> and realized that Toshi Kani has shared some patches
>> addressing the same issue with the subject
>> "[PATCH 0/2] fix memory leak / panic in ioremap huge pages".
>> I've taken a slightly different approach here, so I am
>> finally sending them to the list.
>
> Hi Chintan,
Hi Toshi
>
> Do you have any issue in my patchset? If so, can you please comment on
No functional issues. But I didn't see the issues you mentioned in your
commit text being solved for ARM64 in your patches; they are just being
masked, as they already were by Will's patch. In my approach, the end
goal was to get the benefits of huge mapping back for ARM64.
> them? It complicates things when you send a different approach
> without explaining why a different approach is needed. Your approach
See my reply above. I just had my original patches and I sent them.
> purges TLB after updating pmd/pud, which I think is broken. Can you
Yes, they are broken. I understood the issues after Mark and Marc's
review comments.
> work on top of my patchset and properly implement pXd_free_pte_page()
I have realized that if I address Mark's comments, my new patch will
look similar to what you have done. So, I will work on top of your
patches.
> for arm64? I will send out my v2 today.
>
> Thanks,
> -Toshi
>
Chintan
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc. is a member of the Code Aurora Forum, a Linux Foundation
Collaborative Project
On Thu, 2018-03-15 at 12:47 +0530, Chintan Pandya wrote:
>
> On 3/14/2018 8:08 PM, Kani, Toshi wrote:
> > On Wed, 2018-03-14 at 14:18 +0530, Chintan Pandya wrote:
> > > Note: I was working on these patches for quite some time
> > > and realized that Toshi Kani has shared some patches
> > > addressing the same issue with the subject
> > > "[PATCH 0/2] fix memory leak / panic in ioremap huge pages".
> > > I've taken a slightly different approach here, so I am
> > > finally sending them to the list.
> >
> > Hi Chintan,
>
> Hi Toshi
> >
> > Do you have any issue in my patchset? If so, can you please comment on
>
> No functional issues. But I didn't see the issues you mentioned in your
> commit text being solved for ARM64 in your patches; they are just being
> masked, as they already were by Will's patch. In my approach, the end
> goal was to get the benefits of huge mapping back for ARM64.
Right, my patchset does not implement the fix for arm64. The stub
version is only a workaround and is meant to be replaced by the fix.
> > them? It complicates things when you send a different approach
> > without explaining why a different approach is needed. Your approach
>
> See my reply above. I just had my original patches and I sent them.
>
> > purges TLB after updating pmd/pud, which I think is broken. Can you
>
> Yes, they are broken. I understood the issues after Mark and Marc's
> review comments.
>
> > work on top of my patchset and properly implement pXd_free_pte_page()
>
> I have realized that if I address Mark's comments, my new patch will
> look similar to what you have done. So, I will work on top of your
> patches.
Sounds great.
Thanks,
-Toshi
Hi Chintan,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc5 next-20180316]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Chintan-Pandya/Fix-issues-with-huge-mapping-in-ioremap/20180316-132223
config: powerpc-ppc64_defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc
All errors (new ones prefixed by >>):
In file included from arch/powerpc/include/asm/tlb.h:36:0,
from arch/powerpc/mm/mem.c:50:
>> include/asm-generic/tlb.h:299:20: error: conflicting types for 'flush_tlb_pgtable'
static inline void flush_tlb_pgtable(struct mm_struct *mm,
^~~~~~~~~~~~~~~~~
In file included from arch/powerpc/include/asm/tlbflush.h:81:0,
from arch/powerpc/include/asm/pgtable.h:24,
from include/linux/memremap.h:8,
from include/linux/mm.h:27,
from arch/powerpc/mm/mem.c:27:
arch/powerpc/include/asm/book3s/64/tlbflush.h:143:20: note: previous definition of 'flush_tlb_pgtable' was here
static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long address)
^~~~~~~~~~~~~~~~~
vim +/flush_tlb_pgtable +299 include/asm-generic/tlb.h
297
298 #ifndef CONFIG_ARM64
> 299 static inline void flush_tlb_pgtable(struct mm_struct *mm,
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Chintan,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc5]
[cannot apply to next-20180316]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Chintan-Pandya/Fix-issues-with-huge-mapping-in-ioremap/20180316-132223
config: arm64-allmodconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm64
All error/warnings (new ones prefixed by >>):
In file included from arch/arm64/include/asm/page.h:27:0,
from include/linux/shm.h:6,
from include/linux/sched.h:16,
from include/linux/sched/task_stack.h:9,
from include/linux/elfcore.h:7,
from include/linux/crash_core.h:6,
from include/linux/kexec.h:18,
from arch/arm64/mm/mmu.c:26:
arch/arm64/mm/mmu.c: In function 'pud_set_huge':
>> arch/arm64/mm/mmu.c:943:15: error: 'pud' undeclared (first use in this function); did you mean 'pudp'?
if (pud_val(*pud) && !pud_huge(*pud))
^
arch/arm64/include/asm/pgtable-types.h:50:22: note: in definition of macro 'pgd_val'
#define pgd_val(x) ((x).pgd)
^
>> arch/arm64/mm/mmu.c:943:6: note: in expansion of macro 'pud_val'
if (pud_val(*pud) && !pud_huge(*pud))
^~~~~~~
arch/arm64/mm/mmu.c:943:15: note: each undeclared identifier is reported only once for each function it appears in
if (pud_val(*pud) && !pud_huge(*pud))
^
arch/arm64/include/asm/pgtable-types.h:50:22: note: in definition of macro 'pgd_val'
#define pgd_val(x) ((x).pgd)
^
>> arch/arm64/mm/mmu.c:943:6: note: in expansion of macro 'pud_val'
if (pud_val(*pud) && !pud_huge(*pud))
^~~~~~~
arch/arm64/mm/mmu.c: In function 'pmd_set_huge':
>> arch/arm64/mm/mmu.c:960:15: error: 'pmd' undeclared (first use in this function); did you mean 'pmdp'?
if (pmd_val(*pmd) && !pmd_huge(*pmd))
^
arch/arm64/include/asm/pgtable-types.h:39:22: note: in definition of macro 'pmd_val'
#define pmd_val(x) ((x).pmd)
^
vim +943 arch/arm64/mm/mmu.c
932
933 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
934 {
935 pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
936 pgprot_val(mk_sect_prot(prot)));
937
938 /* ioremap_page_range doesn't honour BBM */
939 if (pud_present(READ_ONCE(*pudp)))
940 return 0;
941
942 BUG_ON(phys & ~PUD_MASK);
> 943 if (pud_val(*pud) && !pud_huge(*pud))
944 free_page((unsigned long)__va(pud_val(*pud)));
945
946 set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
947 return 1;
948 }
949
950 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
951 {
952 pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
953 pgprot_val(mk_sect_prot(prot)));
954
955 /* ioremap_page_range doesn't honour BBM */
956 if (pmd_present(READ_ONCE(*pmdp)))
957 return 0;
958
959 BUG_ON(phys & ~PMD_MASK);
> 960 if (pmd_val(*pmd) && !pmd_huge(*pmd))
961 free_page((unsigned long)__va(pmd_val(*pmd)));
962
963 set_pmd(pmdp, pfn_pmd(__phys_to_pfn(phys), sect_prot));
964 return 1;
965 }
966
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
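Aside from the Break-Before-Make objections above, the build failure
itself is simply that the patch dereferences 'pud'/'pmd' while the
functions only receive the 'pudp'/'pmdp' pointers. A compile-only
correction for the pud case, sketched below, would read the entry
through the parameter; it does not address the review objections to
freeing a live table, and a real fix would also mask off the
descriptor's attribute bits before __va():

	/*
	 * Sketch of a compile-only fix for the error above: read the
	 * entry through the pudp parameter. The freeing itself was
	 * NAKed in review and still needs a proper Break-Before-Make
	 * sequence.
	 */
	pud_t old_pud = READ_ONCE(*pudp);

	if (pud_val(old_pud) && !pud_huge(old_pud))
		free_page((unsigned long)__va(pud_val(old_pud)));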