From: Jisheng Zhang <[email protected]>
patch1 is a trivial improvement patch to move some functions to .init
section
Then following patches improve self-protection by:
Marking some variables __ro_after_init
Constifying some variables
Enabling ARCH_HAS_STRICT_MODULE_RWX
Jisheng Zhang (9):
riscv: add __init section marker to some functions
riscv: Mark some global variables __ro_after_init
riscv: Constify sys_call_table
riscv: Constify sbi_ipi_ops
riscv: kprobes: Implement alloc_insn_page()
riscv: bpf: Move bpf_jit_alloc_exec() and bpf_jit_free_exec() to core
riscv: bpf: Avoid breaking W^X
riscv: module: Create module allocations without exec permissions
riscv: Set ARCH_HAS_STRICT_MODULE_RWX if MMU
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/smp.h | 4 ++--
arch/riscv/include/asm/syscall.h | 2 +-
arch/riscv/kernel/module.c | 2 +-
arch/riscv/kernel/probes/kprobes.c | 8 ++++++++
arch/riscv/kernel/sbi.c | 10 +++++-----
arch/riscv/kernel/smp.c | 6 +++---
arch/riscv/kernel/syscall_table.c | 2 +-
arch/riscv/kernel/time.c | 2 +-
arch/riscv/kernel/traps.c | 2 +-
arch/riscv/kernel/vdso.c | 4 ++--
arch/riscv/mm/init.c | 12 ++++++------
arch/riscv/mm/kasan_init.c | 6 +++---
arch/riscv/mm/ptdump.c | 2 +-
arch/riscv/net/bpf_jit_comp64.c | 13 -------------
arch/riscv/net/bpf_jit_core.c | 14 ++++++++++++++
16 files changed, 50 insertions(+), 40 deletions(-)
--
2.31.0
From: Jisheng Zhang <[email protected]>
They are not needed after booting, so mark them as __init to move them
to the __init section.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/kernel/traps.c | 2 +-
arch/riscv/mm/init.c | 6 +++---
arch/riscv/mm/kasan_init.c | 6 +++---
arch/riscv/mm/ptdump.c | 2 +-
4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 1357abf79570..07fdded10c21 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -197,6 +197,6 @@ int is_valid_bugaddr(unsigned long pc)
#endif /* CONFIG_GENERIC_BUG */
/* stvec & scratch is already set from head.S */
-void trap_init(void)
+void __init trap_init(void)
{
}
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 067583ab1bd7..76bf2de8aa59 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -57,7 +57,7 @@ static void __init zone_sizes_init(void)
free_area_init(max_zone_pfns);
}
-static void setup_zero_page(void)
+static void __init setup_zero_page(void)
{
memset((void *)empty_zero_page, 0, PAGE_SIZE);
}
@@ -75,7 +75,7 @@ static inline void print_mlm(char *name, unsigned long b, unsigned long t)
(((t) - (b)) >> 20));
}
-static void print_vm_layout(void)
+static void __init print_vm_layout(void)
{
pr_notice("Virtual kernel memory layout:\n");
print_mlk("fixmap", (unsigned long)FIXADDR_START,
@@ -557,7 +557,7 @@ static inline void setup_vm_final(void)
#endif /* CONFIG_MMU */
#ifdef CONFIG_STRICT_KERNEL_RWX
-void protect_kernel_text_data(void)
+void __init protect_kernel_text_data(void)
{
unsigned long text_start = (unsigned long)_start;
unsigned long init_text_start = (unsigned long)__init_text_begin;
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 4f85c6d0ddf8..e1d041ac1534 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -60,7 +60,7 @@ asmlinkage void __init kasan_early_init(void)
local_flush_tlb_all();
}
-static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
{
phys_addr_t phys_addr;
pte_t *ptep, *base_pte;
@@ -82,7 +82,7 @@ static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long en
set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
}
-static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
{
phys_addr_t phys_addr;
pmd_t *pmdp, *base_pmd;
@@ -117,7 +117,7 @@ static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long en
set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
}
-static void kasan_populate_pgd(unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end)
{
phys_addr_t phys_addr;
pgd_t *pgdp = pgd_offset_k(vaddr);
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index ace74dec7492..3b7b6e4d025e 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -331,7 +331,7 @@ static int ptdump_show(struct seq_file *m, void *v)
DEFINE_SHOW_ATTRIBUTE(ptdump);
-static int ptdump_init(void)
+static int __init ptdump_init(void)
{
unsigned int i, j;
--
2.31.0
From: Jisheng Zhang <[email protected]>
All of these are never modified after init, so they can be
__ro_after_init.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/kernel/sbi.c | 8 ++++----
arch/riscv/kernel/smp.c | 4 ++--
arch/riscv/kernel/time.c | 2 +-
arch/riscv/kernel/vdso.c | 4 ++--
arch/riscv/mm/init.c | 6 +++---
5 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index d3bf756321a5..cbd94a72eaa7 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -11,14 +11,14 @@
#include <asm/smp.h>
/* default SBI version is 0.1 */
-unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT;
+unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT;
EXPORT_SYMBOL(sbi_spec_version);
-static void (*__sbi_set_timer)(uint64_t stime);
-static int (*__sbi_send_ipi)(const unsigned long *hart_mask);
+static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init;
+static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init;
static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask,
unsigned long start, unsigned long size,
- unsigned long arg4, unsigned long arg5);
+ unsigned long arg4, unsigned long arg5) __ro_after_init;
struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
unsigned long arg1, unsigned long arg2,
diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index ea028d9e0d24..504284d49135 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -30,7 +30,7 @@ enum ipi_message_type {
IPI_MAX
};
-unsigned long __cpuid_to_hartid_map[NR_CPUS] = {
+unsigned long __cpuid_to_hartid_map[NR_CPUS] __ro_after_init = {
[0 ... NR_CPUS-1] = INVALID_HARTID
};
@@ -85,7 +85,7 @@ static void ipi_stop(void)
wait_for_interrupt();
}
-static struct riscv_ipi_ops *ipi_ops;
+static struct riscv_ipi_ops *ipi_ops __ro_after_init;
void riscv_set_ipi_ops(struct riscv_ipi_ops *ops)
{
diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c
index 1b432264f7ef..8217b0f67c6c 100644
--- a/arch/riscv/kernel/time.c
+++ b/arch/riscv/kernel/time.c
@@ -11,7 +11,7 @@
#include <asm/processor.h>
#include <asm/timex.h>
-unsigned long riscv_timebase;
+unsigned long riscv_timebase __ro_after_init;
EXPORT_SYMBOL_GPL(riscv_timebase);
void __init time_init(void)
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index 3f1d35e7c98a..25a3b8849599 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -20,8 +20,8 @@
extern char vdso_start[], vdso_end[];
-static unsigned int vdso_pages;
-static struct page **vdso_pagelist;
+static unsigned int vdso_pages __ro_after_init;
+static struct page **vdso_pagelist __ro_after_init;
/*
* The vDSO data page.
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 76bf2de8aa59..719ec72ef069 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -149,11 +149,11 @@ void __init setup_bootmem(void)
}
#ifdef CONFIG_MMU
-static struct pt_alloc_ops pt_ops;
+static struct pt_alloc_ops pt_ops __ro_after_init;
-unsigned long va_pa_offset;
+unsigned long va_pa_offset __ro_after_init;
EXPORT_SYMBOL(va_pa_offset);
-unsigned long pfn_base;
+unsigned long pfn_base __ro_after_init;
EXPORT_SYMBOL(pfn_base);
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
--
2.31.0
From: Jisheng Zhang <[email protected]>
Constify the sys_call_table so that it will be placed in the .rodata
section. This will cause attempts to modify the table to fail when
strict page permissions are in place.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/include/asm/syscall.h | 2 +-
arch/riscv/kernel/syscall_table.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h
index 49350c8bd7b0..b933b1583c9f 100644
--- a/arch/riscv/include/asm/syscall.h
+++ b/arch/riscv/include/asm/syscall.h
@@ -15,7 +15,7 @@
#include <linux/err.h>
/* The array of function pointers for syscalls. */
-extern void *sys_call_table[];
+extern void * const sys_call_table[];
/*
* Only the low 32 bits of orig_r0 are meaningful, so we return int.
diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c
index f1ead9df96ca..a63c667c27b3 100644
--- a/arch/riscv/kernel/syscall_table.c
+++ b/arch/riscv/kernel/syscall_table.c
@@ -13,7 +13,7 @@
#undef __SYSCALL
#define __SYSCALL(nr, call) [nr] = (call),
-void *sys_call_table[__NR_syscalls] = {
+void * const sys_call_table[__NR_syscalls] = {
[0 ... __NR_syscalls - 1] = sys_ni_syscall,
#include <asm/unistd.h>
};
--
2.31.0
From: Jisheng Zhang <[email protected]>
Constify the sbi_ipi_ops so that it will be placed in the .rodata
section. This will cause attempts to modify it to fail when strict
page permissions are in place.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/include/asm/smp.h | 4 ++--
arch/riscv/kernel/sbi.c | 2 +-
arch/riscv/kernel/smp.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
index df1f7c4cd433..a7d2811f3536 100644
--- a/arch/riscv/include/asm/smp.h
+++ b/arch/riscv/include/asm/smp.h
@@ -46,7 +46,7 @@ int riscv_hartid_to_cpuid(int hartid);
void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out);
/* Set custom IPI operations */
-void riscv_set_ipi_ops(struct riscv_ipi_ops *ops);
+void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops);
/* Clear IPI for current CPU */
void riscv_clear_ipi(void);
@@ -92,7 +92,7 @@ static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in,
cpumask_set_cpu(boot_cpu_hartid, out);
}
-static inline void riscv_set_ipi_ops(struct riscv_ipi_ops *ops)
+static inline void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops)
{
}
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index cbd94a72eaa7..cb848e80865e 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -556,7 +556,7 @@ static void sbi_send_cpumask_ipi(const struct cpumask *target)
sbi_send_ipi(cpumask_bits(&hartid_mask));
}
-static struct riscv_ipi_ops sbi_ipi_ops = {
+static const struct riscv_ipi_ops sbi_ipi_ops = {
.ipi_inject = sbi_send_cpumask_ipi
};
diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index 504284d49135..e035124f06dc 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -85,9 +85,9 @@ static void ipi_stop(void)
wait_for_interrupt();
}
-static struct riscv_ipi_ops *ipi_ops __ro_after_init;
+static const struct riscv_ipi_ops *ipi_ops __ro_after_init;
-void riscv_set_ipi_ops(struct riscv_ipi_ops *ops)
+void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops)
{
ipi_ops = ops;
}
--
2.31.0
From: Jisheng Zhang <[email protected]>
We will drop the executable permissions of the code pages from the
mapping at allocation time soon. Move bpf_jit_alloc_exec() and
bpf_jit_free_exec() to bpf_jit_core.c so that they can be shared by
both RV64I and RV32I.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/net/bpf_jit_comp64.c | 13 -------------
arch/riscv/net/bpf_jit_core.c | 13 +++++++++++++
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index b44ff52f84a6..87e3bf5b9086 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1148,16 +1148,3 @@ void bpf_jit_build_epilogue(struct rv_jit_context *ctx)
{
__build_epilogue(false, ctx);
}
-
-void *bpf_jit_alloc_exec(unsigned long size)
-{
- return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
- BPF_JIT_REGION_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
-}
-
-void bpf_jit_free_exec(void *addr)
-{
- return vfree(addr);
-}
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 3630d447352c..d8da819290b7 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -164,3 +164,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
tmp : orig_prog);
return prog;
}
+
+void *bpf_jit_alloc_exec(unsigned long size)
+{
+ return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
+ BPF_JIT_REGION_END, GFP_KERNEL,
+ PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+
+void bpf_jit_free_exec(void *addr)
+{
+ return vfree(addr);
+}
--
2.31.0
From: Jisheng Zhang <[email protected]>
We allocate Non-executable pages, then call bpf_jit_binary_lock_ro()
to enable executable permission after mapping them read-only. This is
to prepare for STRICT_MODULE_RWX in following patch.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/net/bpf_jit_core.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index d8da819290b7..0d5099f0dac8 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -152,6 +152,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
if (!prog->is_func || extra_pass) {
+ bpf_jit_binary_lock_ro(header);
out_offset:
kfree(ctx->offset);
kfree(jit_data);
@@ -169,7 +170,7 @@ void *bpf_jit_alloc_exec(unsigned long size)
{
return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
BPF_JIT_REGION_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
}
--
2.31.0
From: Jisheng Zhang <[email protected]>
The core code manages the executable permissions of code regions of
modules explicitly, it is not necessary to create the module vmalloc
regions with RWX permissions. Create them with RW- permissions instead.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/kernel/module.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 104fba889cf7..8997b9dbcb3d 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -414,7 +414,7 @@ void *module_alloc(unsigned long size)
{
return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
VMALLOC_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
}
#endif
--
2.31.0
From: Jisheng Zhang <[email protected]>
Allocate PAGE_KERNEL_READ_EXEC(read only, executable) page for kprobes
insn page. This is to prepare for STRICT_MODULE_RWX.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/kernel/probes/kprobes.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 7e2c78e2ca6b..8c1f7a30aeed 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -84,6 +84,14 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return 0;
}
+void *alloc_insn_page(void)
+{
+ return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_READ_EXEC,
+ VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+
/* install breakpoint in text */
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
--
2.31.0
From: Jisheng Zhang <[email protected]>
Now we can set ARCH_HAS_STRICT_MODULE_RWX for MMU riscv platforms, this
is good from security perspective.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 87d7b52f278f..9716be3674a2 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -28,6 +28,7 @@ config RISCV
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_STRICT_KERNEL_RWX if MMU
+ select ARCH_HAS_STRICT_MODULE_RWX if MMU
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
--
2.31.0
> We will drop the executable permissions of the code pages from the
> mapping at allocation time soon. Move bpf_jit_alloc_exec() and
> bpf_jit_free_exec() to bpf_jit_core.c so that they can be shared by
> both RV64I and RV32I.
Looks good to me.
Acked-by: Luke Nelson <[email protected]>
On Mon, 29 Mar 2021 11:21:44 PDT (-0700), [email protected] wrote:
> From: Jisheng Zhang <[email protected]>
>
> patch1 is a trivial improvement patch to move some functions to .init
> section
>
> Then following patches improve self-protection by:
>
> Marking some variables __ro_after_init
> Constifying some variables
> Enabling ARCH_HAS_STRICT_MODULE_RWX
>
> Jisheng Zhang (9):
> riscv: add __init section marker to some functions
> riscv: Mark some global variables __ro_after_init
> riscv: Constify sys_call_table
> riscv: Constify sbi_ipi_ops
> riscv: kprobes: Implement alloc_insn_page()
> riscv: bpf: Move bpf_jit_alloc_exec() and bpf_jit_free_exec() to core
> riscv: bpf: Avoid breaking W^X
> riscv: module: Create module allocations without exec permissions
> riscv: Set ARCH_HAS_STRICT_MODULE_RWX if MMU
>
> arch/riscv/Kconfig | 1 +
> arch/riscv/include/asm/smp.h | 4 ++--
> arch/riscv/include/asm/syscall.h | 2 +-
> arch/riscv/kernel/module.c | 2 +-
> arch/riscv/kernel/probes/kprobes.c | 8 ++++++++
> arch/riscv/kernel/sbi.c | 10 +++++-----
> arch/riscv/kernel/smp.c | 6 +++---
> arch/riscv/kernel/syscall_table.c | 2 +-
> arch/riscv/kernel/time.c | 2 +-
> arch/riscv/kernel/traps.c | 2 +-
> arch/riscv/kernel/vdso.c | 4 ++--
> arch/riscv/mm/init.c | 12 ++++++------
> arch/riscv/mm/kasan_init.c | 6 +++---
> arch/riscv/mm/ptdump.c | 2 +-
> arch/riscv/net/bpf_jit_comp64.c | 13 -------------
> arch/riscv/net/bpf_jit_core.c | 14 ++++++++++++++
> 16 files changed, 50 insertions(+), 40 deletions(-)
Thanks. These are on for-next. I had to fix up a handful of merge
conflicts, so LMK if I made any mistakes.
On Mär 30 2021, Jisheng Zhang wrote:
> From: Jisheng Zhang <[email protected]>
>
> We allocate Non-executable pages, then call bpf_jit_binary_lock_ro()
> to enable executable permission after mapping them read-only. This is
> to prepare for STRICT_MODULE_RWX in following patch.
That breaks booting with
<https://github.com/openSUSE/kernel-source/blob/master/config/riscv64/default>.
Andreas.
--
Andreas Schwab, [email protected]
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
"And now for something completely different."
Hi Andreas,
On Fri, 11 Jun 2021 16:10:03 +0200
Andreas Schwab <[email protected]> wrote:
> On Mär 30 2021, Jisheng Zhang wrote:
>
> > From: Jisheng Zhang <[email protected]>
> >
> > We allocate Non-executable pages, then call bpf_jit_binary_lock_ro()
> > to enable executable permission after mapping them read-only. This is
> > to prepare for STRICT_MODULE_RWX in following patch.
>
> That breaks booting with
> <https://github.com/openSUSE/kernel-source/blob/master/config/riscv64/default>.
>
Thanks for the bug report.
I reproduced a kernel panic with the defconfig on qemu, but I'm not sure whether
this is the issue you saw, I will check.
0.161959] futex hash table entries: 512 (order: 3, 32768 bytes, linear)
[ 0.167028] pinctrl core: initialized pinctrl subsystem
[ 0.190727] Unable to handle kernel paging request at virtual address ffffffff81651bd8
[ 0.191361] Oops [#1]
[ 0.191509] Modules linked in:
[ 0.191814] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #3
[ 0.192179] Hardware name: riscv-virtio,qemu (DT)
[ 0.192492] epc : __memset+0xc4/0xfc
[ 0.192712] ra : skb_flow_dissector_init+0x22/0x86
[ 0.192915] epc : ffffffff803e2700 ra : ffffffff8058f90c sp : ffffffe001a4fda0
[ 0.193221] gp : ffffffff8156d120 tp : ffffffe001a5b700 t0 : ffffffff81651b10
[ 0.193631] t1 : 0000000000000100 t2 : 00000000000003a8 s0 : ffffffe001a4fdd0
[ 0.194034] s1 : ffffffff80c9e250 a0 : ffffffff81651bd8 a1 : 0000000000000000
[ 0.194502] a2 : 000000000000003c a3 : ffffffff81651c10 a4 : 0000000000000064
[ 0.195053] a5 : ffffffff803e2700 a6 : 0000000000000040 a7 : 0000000000000002
[ 0.195436] s2 : ffffffff81651bd8 s3 : 0000000000000009 s4 : ffffffff8156e0c8
[ 0.195723] s5 : ffffffff8156e050 s6 : ffffffff80a105e0 s7 : ffffffff80a00738
[ 0.195992] s8 : ffffffff80f07be0 s9 : 0000000000000008 s10: ffffffff808000ac
[ 0.196257] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
[ 0.196511] t5 : 00000000000003a9 t6 : 00000000000003ff
[ 0.196714] status: 0000000000000120 badaddr: ffffffff81651bd8 cause: 000000000000000f
[ 0.197103] [<ffffffff803e2700>] __memset+0xc4/0xfc
[ 0.197408] [<ffffffff80831f58>] init_default_flow_dissectors+0x22/0x60
[ 0.197693] [<ffffffff800020fc>] do_one_initcall+0x3e/0x168
[ 0.197907] [<ffffffff80801438>] kernel_init_freeable+0x25a/0x2c6
[ 0.198157] [<ffffffff8070a8a8>] kernel_init+0x12/0x110
[ 0.198351] [<ffffffff8000333a>] ret_from_exception+0x0/0xc
[ 0.198973] Unable to handle kernel paging request at virtual address ffffffff8164d860
[ 0.199242] Oops [#2]
[ 0.199336] Modules linked in:
[ 0.199514] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G D 5.13.0-rc5-default+ #3
[ 0.199785] Hardware name: riscv-virtio,qemu (DT)
[ 0.199940] epc : _raw_spin_lock_irqsave+0x14/0x4e
[ 0.200113] ra : _extract_crng+0x58/0xac
[ 0.200264] epc : ffffffff807117ae ra : ffffffff80490774 sp : ffffffe001a4fa70
[ 0.200489] gp : ffffffff8156d120 tp : ffffffe001a5b700 t0 : ffffffff8157c0d7
[ 0.200715] t1 : ffffffff8157c0c8 t2 : 0000000000000000 s0 : ffffffe001a4fa80
[ 0.200938] s1 : ffffffff8164d818 a0 : 0000000000000022 a1 : ffffffe001a4fac8
[ 0.201166] a2 : 0000000000000010 a3 : 0000000000000001 a4 : ffffffff8164d860
[ 0.201389] a5 : 0000000000000000 a6 : c0000000ffffdfff a7 : ffffffffffffffff
[ 0.201612] s2 : ffffffff8156e1c0 s3 : ffffffe001a4fac8 s4 : ffffffff8164d860
[ 0.201836] s5 : ffffffff8156e0c8 s6 : ffffffff80a105e0 s7 : ffffffff80a00738
[ 0.202060] s8 : ffffffff80f07be0 s9 : 0000000000000008 s10: ffffffff808000ac
[ 0.202295] s11: 0000000000000000 t3 : 000000000000005b t4 : ffffffffffffffff
[ 0.202519] t5 : 00000000000003a9 t6 : ffffffe001a4f9b8
[ 0.202691] status: 0000000000000100 badaddr: ffffffff8164d860 cause: 000000000000000f
[ 0.202940] [<ffffffff807117ae>] _raw_spin_lock_irqsave+0x14/0x4e
[ 0.203326] Unable to handle kernel paging request at virtual address ffffffff8164d860
[ 0.203574] Oops [#3]
[ 0.203664] Modules linked in:
[ 0.203784] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G D 5.13.0-rc5-default+ #3
[ 0.204046] Hardware name: riscv-virtio,qemu (DT)
[ 0.204201] epc : _raw_spin_lock_irqsave+0x14/0x4e
[ 0.204371] ra : _extract_crng+0x58/0xac
[ 0.204519] epc : ffffffff807117ae ra : ffffffff80490774 sp : ffffffe001a4f740
[ 0.204819] gp : ffffffff8156d120 tp : ffffffe001a5b700 t0 : ffffffff8157c0d7
[ 0.205089] t1 : ffffffff8157c0c8 t2 : 0000000000000000 s0 : ffffffe001a4f750
[ 0.205330] s1 : ffffffff8164d818 a0 : 0000000000000102 a1 : ffffffe001a4f798
[ 0.205553] a2 : 0000000000000010 a3 : 0000000000000001 a4 : ffffffff8164d860
[ 0.205768] a5 : 0000000000000000 a6 : c0000000ffffdfff a7 : ffffffff81408a40
[ 0.205981] s2 : ffffffff8156e1c0 s3 : ffffffe001a4f798 s4 : ffffffff8164d860
[ 0.206197] s5 : ffffffff8156e0c8 s6 : ffffffff80a105e0 s7 : ffffffff80a00738
[ 0.206411] s8 : ffffffff80f07be0 s9 : 0000000000000008 s10: ffffffff808000ac
[ 0.206633] s11: 0000000000000000 t3 : 000000000000005b t4 : ffffffffffffffff
[ 0.206849] t5 : 00000000000003a9 t6 : ffffffe001a4f688
On Jun 12 2021, Jisheng Zhang wrote:
> I reproduced a kernel panic with the defconfig on qemu, but I'm not sure whether
> this is the issue you saw, I will check.
>
> 0.161959] futex hash table entries: 512 (order: 3, 32768 bytes, linear)
> [ 0.167028] pinctrl core: initialized pinctrl subsystem
> [ 0.190727] Unable to handle kernel paging request at virtual address ffffffff81651bd8
> [ 0.191361] Oops [#1]
> [ 0.191509] Modules linked in:
> [ 0.191814] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #3
> [ 0.192179] Hardware name: riscv-virtio,qemu (DT)
> [ 0.192492] epc : __memset+0xc4/0xfc
> [ 0.192712] ra : skb_flow_dissector_init+0x22/0x86
Yes, that's the same.
Andreas.
--
Andreas Schwab, [email protected]
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
"And now for something completely different."
Hi,
On Fri, 11 Jun 2021 18:41:16 +0200
Andreas Schwab <[email protected]> wrote:
> On Jun 12 2021, Jisheng Zhang wrote:
>
> > I reproduced a kernel panic with the defconfig on qemu, but I'm not sure whether
> > this is the issue you saw, I will check.
> >
> > 0.161959] futex hash table entries: 512 (order: 3, 32768 bytes, linear)
> > [ 0.167028] pinctrl core: initialized pinctrl subsystem
> > [ 0.190727] Unable to handle kernel paging request at virtual address ffffffff81651bd8
> > [ 0.191361] Oops [#1]
> > [ 0.191509] Modules linked in:
> > [ 0.191814] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #3
> > [ 0.192179] Hardware name: riscv-virtio,qemu (DT)
> > [ 0.192492] epc : __memset+0xc4/0xfc
> > [ 0.192712] ra : skb_flow_dissector_init+0x22/0x86
>
> Yes, that's the same.
>
> Andreas.
>
I think I found the root cause: commit 2bfc6cd81bd ("move kernel mapping
outside of linear mapping") moves BPF JIT region after the kernel:
#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
The &_end is unlikely to be aligned with PMD SIZE, so the front bpf jit region
sits with kernel .data section in one PMD. But kernel is mapped in PMD SIZE,
so when bpf_jit_binary_lock_ro() is called to make the first bpf jit prog
ROX, we will make part of kernel .data section RO too, so when we write, for example
memset the .data section, MMU will trigger store page fault.
To fix the issue, we need to make the bpf jit region PMD size aligned by either
patch BPF_JIT_REGION_START to align on PMD size rather than PAGE SIZE, or
something as below patch to move the BPF region before modules region:
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 9469f464e71a..997b894edbc2 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -31,8 +31,8 @@
#define BPF_JIT_REGION_SIZE (SZ_128M)
#ifdef CONFIG_64BIT
/* KASLR should leave at least 128MB for BPF after the kernel */
-#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
-#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
+#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
+#define BPF_JIT_REGION_END (MODULES_VADDR)
#else
#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END (VMALLOC_END)
@@ -40,8 +40,8 @@
/* Modules always live before the kernel */
#ifdef CONFIG_64BIT
-#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
+#define MODULES_VADDR (MODULES_END - SZ_128M)
#endif
can you please try it? Per my test, the issue is fixed.
Thanks
On Jun 14 2021, Jisheng Zhang wrote:
> I think I found the root cause: commit 2bfc6cd81bd ("move kernel mapping
> outside of linear mapping") moves BPF JIT region after the kernel:
>
> #define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
>
> The &_end is unlikely to be aligned with PMD SIZE, so the front bpf jit region
> sits with kernel .data section in one PMD. But kernel is mapped in PMD SIZE,
> so when bpf_jit_binary_lock_ro() is called to make the first bpf jit prog
> ROX, we will make part of kernel .data section RO too, so when we write, for example
> memset the .data section, MMU will trigger store page fault.
>
> To fix the issue, we need to make the bpf jit region PMD size aligned by either
> patch BPF_JIT_REGION_START to align on PMD size rather than PAGE SIZE, or
> something as below patch to move the BPF region before modules region:
>
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 9469f464e71a..997b894edbc2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -31,8 +31,8 @@
> #define BPF_JIT_REGION_SIZE (SZ_128M)
> #ifdef CONFIG_64BIT
> /* KASLR should leave at least 128MB for BPF after the kernel */
> -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
> -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
> +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
> +#define BPF_JIT_REGION_END (MODULES_VADDR)
> #else
> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> #define BPF_JIT_REGION_END (VMALLOC_END)
> @@ -40,8 +40,8 @@
>
> /* Modules always live before the kernel */
> #ifdef CONFIG_64BIT
> -#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
> #define MODULES_END (PFN_ALIGN((unsigned long)&_start))
> +#define MODULES_VADDR (MODULES_END - SZ_128M)
> #endif
>
>
> can you please try it? Per my test, the issue is fixed.
I can confirm that this fixes the issue.
Andreas.
--
Andreas Schwab, [email protected]
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
"And now for something completely different."
From: Jisheng Zhang <[email protected]>
Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
breaks booting with one kind of config file, I reproduced a kernel panic
with the config:
[ 0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
[ 0.139159] Oops [#1]
[ 0.139303] Modules linked in:
[ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
[ 0.139934] Hardware name: riscv-virtio,qemu (DT)
[ 0.140193] epc : __memset+0xc4/0xfc
[ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
[ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
[ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
[ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
[ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
[ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
[ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
[ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
[ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
[ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
[ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
[ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
[ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
[ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
[ 0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
[ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
[ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
[ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
[ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
[ 0.145124] ---[ end trace f1e9643daa46d591 ]---
After some investigation, I think I found the root cause: commit
2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
BPF JIT region after the kernel:
The &_end is unlikely to be aligned with PMD size, so the front bpf jit
region sits with part of kernel .data section in one PMD size mapping.
But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
called to make the first bpf jit prog ROX, we will make part of kernel
.data section RO too, so when we write to, for example memset the
.data section, MMU will trigger a store page fault.
To fix the issue, we need to ensure the BPF JIT region is PMD size
aligned. This patch achieves this goal by restoring the BPF JIT region
to its original position, i.e. the 128MB before the kernel .text section.
Reported-by: Andreas Schwab <[email protected]>
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/riscv/include/asm/pgtable.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 9469f464e71a..380cd3a7e548 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -30,9 +30,8 @@
#define BPF_JIT_REGION_SIZE (SZ_128M)
#ifdef CONFIG_64BIT
-/* KASLR should leave at least 128MB for BPF after the kernel */
-#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
-#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
+#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
+#define BPF_JIT_REGION_END (MODULES_END)
#else
#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END (VMALLOC_END)
--
2.32.0
On 6/14/21 6:49 PM, Jisheng Zhang wrote:
> From: Jisheng Zhang <[email protected]>
>
> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
> breaks booting with one kind of config file, I reproduced a kernel panic
> with the config:
>
> [ 0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
> [ 0.139159] Oops [#1]
> [ 0.139303] Modules linked in:
> [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
> [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
> [ 0.140193] epc : __memset+0xc4/0xfc
> [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
> [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
> [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
> [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
> [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
> [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
> [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
> [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
> [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
> [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
> [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
> [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
> [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
> [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
> [ 0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
> [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
> [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
> [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
> [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
> [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
>
> After some investigation, I think I found the root cause: commit
> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
> BPF JIT region after the kernel:
>
> The &_end is unlikely aligned with PMD size, so the front bpf jit
> region sits with part of kernel .data section in one PMD size mapping.
> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
> called to make the first bpf jit prog ROX, we will make part of kernel
> .data section RO too, so when we write to, for example memset the
> .data section, MMU will trigger a store page fault.
>
> To fix the issue, we need to ensure the BPF JIT region is PMD size
> aligned. This patch achieves this goal by restoring the BPF JIT region
> to its original position, i.e. the 128MB before the kernel .text section.
>
> Reported-by: Andreas Schwab <[email protected]>
> Signed-off-by: Jisheng Zhang <[email protected]>
> ---
> arch/riscv/include/asm/pgtable.h | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 9469f464e71a..380cd3a7e548 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -30,9 +30,8 @@
>
> #define BPF_JIT_REGION_SIZE (SZ_128M)
> #ifdef CONFIG_64BIT
> -/* KASLR should leave at least 128MB for BPF after the kernel */
> -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
> -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
> +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
> +#define BPF_JIT_REGION_END (MODULES_END)
> #else
> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> #define BPF_JIT_REGION_END (VMALLOC_END)
I presume this fix will be routed via riscv tree?
Thanks,
Daniel
Hi Jisheng,
Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
> From: Jisheng Zhang <[email protected]>
>
> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
> breaks booting with one kind of config file, I reproduced a kernel panic
> with the config:
>
> [ 0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
> [ 0.139159] Oops [#1]
> [ 0.139303] Modules linked in:
> [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
> [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
> [ 0.140193] epc : __memset+0xc4/0xfc
> [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
> [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
> [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
> [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
> [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
> [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
> [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
> [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
> [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
> [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
> [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
> [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
> [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
> [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
> [ 0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
> [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
> [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
> [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
> [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
> [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
>
> After some investigation, I think I found the root cause: commit
> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
> BPF JIT region after the kernel:
>
> The &_end is unlikely aligned with PMD size, so the front bpf jit
> region sits with part of kernel .data section in one PMD size mapping.
> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
> called to make the first bpf jit prog ROX, we will make part of kernel
> .data section RO too, so when we write to, for example memset the
> .data section, MMU will trigger a store page fault.
Good catch, we make sure no physical allocation happens between _end and
the next PMD aligned address, but I missed this one.
>
> To fix the issue, we need to ensure the BPF JIT region is PMD size
> aligned. This patch achieves this goal by restoring the BPF JIT region
> to its original position, i.e. the 128MB before the kernel .text section.
But I disagree with your solution: I made sure modules and BPF programs
get their own virtual regions to avoid worst case scenario where one
could allocate all the space and leave nothing to the other (we are
limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to
the next PMD aligned address?
Again, good catch, thanks,
Alex
>
> Reported-by: Andreas Schwab <[email protected]>
> Signed-off-by: Jisheng Zhang <[email protected]>
> ---
> arch/riscv/include/asm/pgtable.h | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 9469f464e71a..380cd3a7e548 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -30,9 +30,8 @@
>
> #define BPF_JIT_REGION_SIZE (SZ_128M)
> #ifdef CONFIG_64BIT
> -/* KASLR should leave at least 128MB for BPF after the kernel */
> -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
> -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
> +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
> +#define BPF_JIT_REGION_END (MODULES_END)
> #else
> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> #define BPF_JIT_REGION_END (VMALLOC_END)
>
On Tue, 15 Jun 2021 20:54:19 +0200
Alex Ghiti <[email protected]> wrote:
> Hi Jisheng,
Hi Alex,
>
> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
> > From: Jisheng Zhang <[email protected]>
> >
> > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
> > breaks booting with one kind of config file, I reproduced a kernel panic
> > with the config:
> >
> > [ 0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
> > [ 0.139159] Oops [#1]
> > [ 0.139303] Modules linked in:
> > [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
> > [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
> > [ 0.140193] epc : __memset+0xc4/0xfc
> > [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
> > [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
> > [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
> > [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
> > [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
> > [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
> > [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
> > [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
> > [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
> > [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
> > [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
> > [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
> > [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
> > [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
> > [ 0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
> > [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
> > [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
> > [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
> > [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
> > [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
> >
> > After some investigation, I think I found the root cause: commit
> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
> > BPF JIT region after the kernel:
> >
> > The &_end is unlikely aligned with PMD size, so the front bpf jit
> > region sits with part of kernel .data section in one PMD size mapping.
> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
> > called to make the first bpf jit prog ROX, we will make part of kernel
> > .data section RO too, so when we write to, for example memset the
> > .data section, MMU will trigger a store page fault.
>
> Good catch, we make sure no physical allocation happens between _end and
> the next PMD aligned address, but I missed this one.
>
> >
> > To fix the issue, we need to ensure the BPF JIT region is PMD size
> > aligned. This patch achieves this goal by restoring the BPF JIT region
> > to its original position, i.e. the 128MB before the kernel .text section.
>
> But I disagree with your solution: I made sure modules and BPF programs
> get their own virtual regions to avoid worst case scenario where one
> could allocate all the space and leave nothing to the other (we are
> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to
> the next PMD aligned address?
Originally, I planed to fix the issue by aligning BPF_JIT_REGION_START, but
IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"
feature, there's a risk that BPF JIT region is beyond the 2GB of module region:
------
module
------
kernel
------
BPF_JIT
So I made this patch finally. In this patch, we let BPF JIT region sit
between module and kernel.
To address "make sure modules and BPF programs get their own virtual regions",
what about something as below (applied against this patch)?
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 380cd3a7e548..da1158f10b09 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -31,7 +31,7 @@
#define BPF_JIT_REGION_SIZE (SZ_128M)
#ifdef CONFIG_64BIT
#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
-#define BPF_JIT_REGION_END (MODULES_END)
+#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
#else
#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END (VMALLOC_END)
@@ -40,7 +40,7 @@
/* Modules always live before the kernel */
#ifdef CONFIG_64BIT
#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
-#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
+#define MODULES_END (BPF_JIT_REGION_END)
#endif
>
> Again, good catch, thanks,
>
> Alex
>
> >
> > Reported-by: Andreas Schwab <[email protected]>
> > Signed-off-by: Jisheng Zhang <[email protected]>
> > ---
> > arch/riscv/include/asm/pgtable.h | 5 ++---
> > 1 file changed, 2 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index 9469f464e71a..380cd3a7e548 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -30,9 +30,8 @@
> >
> > #define BPF_JIT_REGION_SIZE (SZ_128M)
> > #ifdef CONFIG_64BIT
> > -/* KASLR should leave at least 128MB for BPF after the kernel */
> > -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
> > -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
> > +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
> > +#define BPF_JIT_REGION_END (MODULES_END)
> > #else
> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> > #define BPF_JIT_REGION_END (VMALLOC_END)
> >
Le 16/06/2021 à 02:03, Jisheng Zhang a écrit :
> On Tue, 15 Jun 2021 20:54:19 +0200
> Alex Ghiti <[email protected]> wrote:
>
>> Hi Jisheng,
>
> Hi Alex,
>
>>
>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
>>> From: Jisheng Zhang <[email protected]>
>>>
>>> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
>>> breaks booting with one kind of config file, I reproduced a kernel panic
>>> with the config:
>>>
>>> [ 0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
>>> [ 0.139159] Oops [#1]
>>> [ 0.139303] Modules linked in:
>>> [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
>>> [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
>>> [ 0.140193] epc : __memset+0xc4/0xfc
>>> [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
>>> [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
>>> [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
>>> [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
>>> [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
>>> [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
>>> [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
>>> [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
>>> [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
>>> [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
>>> [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
>>> [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
>>> [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
>>> [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
>>> [ 0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
>>> [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
>>> [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
>>> [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
>>> [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
>>> [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
>>>
>>> After some investigation, I think I found the root cause: commit
>>> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
>>> BPF JIT region after the kernel:
>>>
>>> The &_end is unlikely aligned with PMD size, so the front bpf jit
>>> region sits with part of kernel .data section in one PMD size mapping.
>>> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
>>> called to make the first bpf jit prog ROX, we will make part of kernel
>>> .data section RO too, so when we write to, for example memset the
>>> .data section, MMU will trigger a store page fault.
>>
>> Good catch, we make sure no physical allocation happens between _end and
>> the next PMD aligned address, but I missed this one.
>>
>>>
>>> To fix the issue, we need to ensure the BPF JIT region is PMD size
>>> > aligned. This patch achieves this goal by restoring the BPF JIT region
>>> > to its original position, i.e. the 128MB before the kernel .text section.
>>
>> But I disagree with your solution: I made sure modules and BPF programs
>> get their own virtual regions to avoid worst case scenario where one
>> could allocate all the space and leave nothing to the other (we are
>> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to
>> the next PMD aligned address?
>
> Originally, I planed to fix the issue by aligning BPF_JIT_REGION_START, but
> IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"
> feature, there's a risk that BPF JIT region is beyond the 2GB of module region:
>
> ------
> module
> ------
> kernel
> ------
> BPF_JIT
>
> So I made this patch finally. In this patch, we let BPF JIT region sit
> between module and kernel.
>
From what I read in the lwn article, I'm not sure BPF programs can call
module functions, can someone tell us if it is possible? Or planned?
> To address "make sure modules and BPF programs get their own virtual regions",
> what about something as below (applied against this patch)?
>
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 380cd3a7e548..da1158f10b09 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -31,7 +31,7 @@
> #define BPF_JIT_REGION_SIZE (SZ_128M)
> #ifdef CONFIG_64BIT
> #define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
> -#define BPF_JIT_REGION_END (MODULES_END)
> +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
> #else
> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> #define BPF_JIT_REGION_END (VMALLOC_END)
> @@ -40,7 +40,7 @@
> /* Modules always live before the kernel */
> #ifdef CONFIG_64BIT
> #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
> -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
> +#define MODULES_END (BPF_JIT_REGION_END)
> #endif
>
>
In case it is possible, I would let the vmalloc allocator handle the
case where modules steal room from BPF: I would then not implement the
above but rather your first patch.
And do not forget to modify Documentation/riscv/vm-layout.rst
accordingly and remove the comment "/* KASLR should leave at least 128MB
for BPF after the kernel */"
Thanks,
Alex
>
>>
>> Again, good catch, thanks,
>>
>> Alex
>>
>>>
>>> Reported-by: Andreas Schwab <[email protected]>
>>> Signed-off-by: Jisheng Zhang <[email protected]>
>>> ---
>>> arch/riscv/include/asm/pgtable.h | 5 ++---
>>> 1 file changed, 2 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
>>> index 9469f464e71a..380cd3a7e548 100644
>>> --- a/arch/riscv/include/asm/pgtable.h
>>> +++ b/arch/riscv/include/asm/pgtable.h
>>> @@ -30,9 +30,8 @@
>>>
>>> #define BPF_JIT_REGION_SIZE (SZ_128M)
>>> #ifdef CONFIG_64BIT
>>> -/* KASLR should leave at least 128MB for BPF after the kernel */
>>> -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
>>> -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
>>> +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
>>> +#define BPF_JIT_REGION_END (MODULES_END)
>>> #else
>>> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
>>> #define BPF_JIT_REGION_END (VMALLOC_END)
>>>
>
>
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
On Tue, 15 Jun 2021 17:03:28 PDT (-0700), [email protected] wrote:
> On Tue, 15 Jun 2021 20:54:19 +0200
> Alex Ghiti <[email protected]> wrote:
>
>> Hi Jisheng,
>
> Hi Alex,
>
>>
>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
>> > From: Jisheng Zhang <[email protected]>
>> >
>> > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
>> > breaks booting with one kind of config file, I reproduced a kernel panic
>> > with the config:
>> >
>> > [ 0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
>> > [ 0.139159] Oops [#1]
>> > [ 0.139303] Modules linked in:
>> > [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
>> > [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
>> > [ 0.140193] epc : __memset+0xc4/0xfc
>> > [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
>> > [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
>> > [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
>> > [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
>> > [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
>> > [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
>> > [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
>> > [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
>> > [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
>> > [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
>> > [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
>> > [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
>> > [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
>> > [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
>> > [ 0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
>> > [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
>> > [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
>> > [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
>> > [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
>> > [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
>> >
>> > After some investigation, I think I found the root cause: commit
>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
>> > BPF JIT region after the kernel:
>> >
>> > The &_end is unlikely aligned with PMD size, so the front bpf jit
>> > region sits with part of kernel .data section in one PMD size mapping.
>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
>> > called to make the first bpf jit prog ROX, we will make part of kernel
>> > .data section RO too, so when we write to, for example memset the
>> > .data section, MMU will trigger a store page fault.
>>
>> Good catch, we make sure no physical allocation happens between _end and
>> the next PMD aligned address, but I missed this one.
>>
>> >
>> > To fix the issue, we need to ensure the BPF JIT region is PMD size
>> > aligned. This patch achieves this goal by restoring the BPF JIT region
>> > to its original position, i.e. the 128MB before the kernel .text section.
>>
>> But I disagree with your solution: I made sure modules and BPF programs
>> get their own virtual regions to avoid worst case scenario where one
>> could allocate all the space and leave nothing to the other (we are
>> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to
>> the next PMD aligned address?
>
> Originally, I planed to fix the issue by aligning BPF_JIT_REGION_START, but
> IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"
> feature, there's a risk that BPF JIT region is beyond the 2GB of module region:
>
> ------
> module
> ------
> kernel
> ------
> BPF_JIT
>
> So I made this patch finally. In this patch, we let BPF JIT region sit
> between module and kernel.
>
> To address "make sure modules and BPF programs get their own virtual regions",
> what about something as below (applied against this patch)?
>
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 380cd3a7e548..da1158f10b09 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -31,7 +31,7 @@
> #define BPF_JIT_REGION_SIZE (SZ_128M)
> #ifdef CONFIG_64BIT
> #define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
> -#define BPF_JIT_REGION_END (MODULES_END)
> +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
> #else
> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> #define BPF_JIT_REGION_END (VMALLOC_END)
> @@ -40,7 +40,7 @@
> /* Modules always live before the kernel */
> #ifdef CONFIG_64BIT
> #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
> -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
> +#define MODULES_END (BPF_JIT_REGION_END)
> #endif
>
>
>
>>
>> Again, good catch, thanks,
>>
>> Alex
>>
>> >
>> > Reported-by: Andreas Schwab <[email protected]>
>> > Signed-off-by: Jisheng Zhang <[email protected]>
>> > ---
>> > arch/riscv/include/asm/pgtable.h | 5 ++---
>> > 1 file changed, 2 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
>> > index 9469f464e71a..380cd3a7e548 100644
>> > --- a/arch/riscv/include/asm/pgtable.h
>> > +++ b/arch/riscv/include/asm/pgtable.h
>> > @@ -30,9 +30,8 @@
>> >
>> > #define BPF_JIT_REGION_SIZE (SZ_128M)
>> > #ifdef CONFIG_64BIT
>> > -/* KASLR should leave at least 128MB for BPF after the kernel */
>> > -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
>> > -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
>> > +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
>> > +#define BPF_JIT_REGION_END (MODULES_END)
>> > #else
>> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
>> > #define BPF_JIT_REGION_END (VMALLOC_END)
>> >
This, when applied onto fixes, is breaking early boot on KASAN
configurations for me.
Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :
> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), [email protected] wrote:
>> On Tue, 15 Jun 2021 20:54:19 +0200
>> Alex Ghiti <[email protected]> wrote:
>>
>>> Hi Jisheng,
>>
>> Hi Alex,
>>
>>>
>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
>>> > From: Jisheng Zhang <[email protected]>
>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking
>>> W^X")
>>> > breaks booting with one kind of config file, I reproduced a kernel
>>> panic
>>> > with the config:
>>> > > [ 0.138553] Unable to handle kernel paging request at virtual
>>> address ffffffff81201220
>>> > [ 0.139159] Oops [#1]
>>> > [ 0.139303] Modules linked in:
>>> > [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
>>> 5.13.0-rc5-default+ #1
>>> > [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
>>> > [ 0.140193] epc : __memset+0xc4/0xfc
>>> > [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
>>> > [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :
>>> ffffffe001647da0
>>> > [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 :
>>> ffffffff81201158
>>> > [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 :
>>> ffffffe001647dd0
>>> > [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :
>>> 0000000000000000
>>> > [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 :
>>> 0000000000000064
>>> > [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 :
>>> ffffffffffffffff
>>> > [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 :
>>> ffffffff81135088
>>> > [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :
>>> ffffffff80800438
>>> > [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:
>>> ffffffff806000ac
>>> > [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 :
>>> 0000000000000000
>>> > [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
>>> > [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220
>>> cause: 000000000000000f
>>> > [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
>>> > [ 0.143859] [<ffffffff8061e984>]
>>> init_default_flow_dissectors+0x22/0x60
>>> > [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
>>> > [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
>>> > [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
>>> > [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
>>> > [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
>>> > > After some investigation, I think I found the root cause: commit
>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
>>> > BPF JIT region after the kernel:
>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit
>>> > region sits with part of kernel .data section in one PMD size mapping.
>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
>>> > called to make the first bpf jit prog ROX, we will make part of kernel
>>> > .data section RO too, so when we write to, for example memset the
>>> > .data section, MMU will trigger a store page fault.
>>> Good catch, we make sure no physical allocation happens between _end
>>> and the next PMD aligned address, but I missed this one.
>>>
>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size
>>> > aligned. This patch achieves this goal by restoring the BPF JIT region
>>> > to its original position, i.e. the 128MB before the kernel .text section.
>>> But I disagree with your solution: I made sure modules and BPF
>>> programs get their own virtual regions to avoid worst case scenario
>>> where one could allocate all the space and leave nothing to the other
>>> (we are limited to +- 2GB offset). Why don't just align
>>> BPF_JIT_REGION_START to the next PMD aligned address?
>>
>> Originally, I planed to fix the issue by aligning
>> BPF_JIT_REGION_START, but
>> IIRC, BPF experts are adding (or have added) "Calling kernel functions
>> from BPF"
>> feature, there's a risk that BPF JIT region is beyond the 2GB of
>> module region:
>>
>> ------
>> module
>> ------
>> kernel
>> ------
>> BPF_JIT
>>
>> So I made this patch finally. In this patch, we let BPF JIT region sit
>> between module and kernel.
>>
>> To address "make sure modules and BPF programs get their own virtual
>> regions",
>> what about something as below (applied against this patch)?
>>
>> diff --git a/arch/riscv/include/asm/pgtable.h
>> b/arch/riscv/include/asm/pgtable.h
>> index 380cd3a7e548..da1158f10b09 100644
>> --- a/arch/riscv/include/asm/pgtable.h
>> +++ b/arch/riscv/include/asm/pgtable.h
>> @@ -31,7 +31,7 @@
>> #define BPF_JIT_REGION_SIZE (SZ_128M)
>> #ifdef CONFIG_64BIT
>> #define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
>> BPF_JIT_REGION_SIZE)
>> -#define BPF_JIT_REGION_END (MODULES_END)
>> +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
>> #else
>> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
>> #define BPF_JIT_REGION_END (VMALLOC_END)
>> @@ -40,7 +40,7 @@
>> /* Modules always live before the kernel */
>> #ifdef CONFIG_64BIT
>> #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
>> -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
>> +#define MODULES_END (BPF_JIT_REGION_END)
>> #endif
>>
>>
>>
>>>
>>> Again, good catch, thanks,
>>>
>>> Alex
>>>
>>> > > Reported-by: Andreas Schwab <[email protected]>
>>> > Signed-off-by: Jisheng Zhang <[email protected]>
>>> > ---
>>> > arch/riscv/include/asm/pgtable.h | 5 ++---
>>> > 1 file changed, 2 insertions(+), 3 deletions(-)
>>> > > diff --git a/arch/riscv/include/asm/pgtable.h
>>> b/arch/riscv/include/asm/pgtable.h
>>> > index 9469f464e71a..380cd3a7e548 100644
>>> > --- a/arch/riscv/include/asm/pgtable.h
>>> > +++ b/arch/riscv/include/asm/pgtable.h
>>> > @@ -30,9 +30,8 @@
>>> > > #define BPF_JIT_REGION_SIZE (SZ_128M)
>>> > #ifdef CONFIG_64BIT
>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */
>>> > -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
>>> > -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START +
>>> BPF_JIT_REGION_SIZE)
>>> > +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
>>> BPF_JIT_REGION_SIZE)
>>> > +#define BPF_JIT_REGION_END (MODULES_END)
>>> > #else
>>> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
>>> > #define BPF_JIT_REGION_END (VMALLOC_END)
>>> >
>
> This, when applied onto fixes, is breaking early boot on KASAN
> configurations for me.
Not surprising, I took a shortcut when initializing KASAN for modules,
kernel and BPF:
kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
kasan_mem_to_shadow((const void
*)BPF_JIT_REGION_END));
The kernel is then not covered, I'm taking a look at how to fix that
properly.
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Le 17/06/2021 à 10:09, Alex Ghiti a écrit :
> Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :
>> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), [email protected]
>> wrote:
>>> On Tue, 15 Jun 2021 20:54:19 +0200
>>> Alex Ghiti <[email protected]> wrote:
>>>
>>>> Hi Jisheng,
>>>
>>> Hi Alex,
>>>
>>>>
>>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
>>>> > From: Jisheng Zhang <[email protected]>
>>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid
>>>> breaking W^X")
>>>> > breaks booting with one kind of config file, I reproduced a kernel
>>>> panic
>>>> > with the config:
>>>> > > [ 0.138553] Unable to handle kernel paging request at virtual
>>>> address ffffffff81201220
>>>> > [ 0.139159] Oops [#1]
>>>> > [ 0.139303] Modules linked in:
>>>> > [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
>>>> 5.13.0-rc5-default+ #1
>>>> > [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
>>>> > [ 0.140193] epc : __memset+0xc4/0xfc
>>>> > [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
>>>> > [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :
>>>> ffffffe001647da0
>>>> > [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 :
>>>> ffffffff81201158
>>>> > [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 :
>>>> ffffffe001647dd0
>>>> > [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :
>>>> 0000000000000000
>>>> > [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 :
>>>> 0000000000000064
>>>> > [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 :
>>>> ffffffffffffffff
>>>> > [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 :
>>>> ffffffff81135088
>>>> > [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :
>>>> ffffffff80800438
>>>> > [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:
>>>> ffffffff806000ac
>>>> > [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 :
>>>> 0000000000000000
>>>> > [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
>>>> > [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220
>>>> cause: 000000000000000f
>>>> > [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
>>>> > [ 0.143859] [<ffffffff8061e984>]
>>>> init_default_flow_dissectors+0x22/0x60
>>>> > [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
>>>> > [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
>>>> > [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
>>>> > [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
>>>> > [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
>>>> > > After some investigation, I think I found the root cause: commit
>>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
>>>> > BPF JIT region after the kernel:
>>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit
>>>> > region sits with part of kernel .data section in one PMD size
>>>> mapping.
>>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
>>>> > called to make the first bpf jit prog ROX, we will make part of
>>>> kernel
>>>> > .data section RO too, so when we write to, for example memset the
>>>> > .data section, MMU will trigger a store page fault.
>>>> Good catch, we make sure no physical allocation happens between _end
>>>> and the next PMD aligned address, but I missed this one.
>>>>
>>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size
>>>> > aligned. This patch achieves this goal by restoring the BPF JIT
>>>> region
>>>> > to original position, i.e. the 128MB before kernel .text section.
>>>> But I disagree with your solution: I made sure modules and BPF
>>>> programs get their own virtual regions to avoid worst case scenario
>>>> where one could allocate all the space and leave nothing to the
>>>> other (we are limited to +- 2GB offset). Why don't just align
>>>> BPF_JIT_REGION_START to the next PMD aligned address?
>>>
>>> Originally, I planned to fix the issue by aligning
>>> BPF_JIT_REGION_START, but
>>> IIRC, BPF experts are adding (or have added) "Calling kernel
>>> functions from BPF"
>>> feature, there's a risk that BPF JIT region is beyond the 2GB of
>>> module region:
>>>
>>> ------
>>> module
>>> ------
>>> kernel
>>> ------
>>> BPF_JIT
>>>
>>> So I made this patch finally. In this patch, we let BPF JIT region sit
>>> between module and kernel.
>>>
>>> To address "make sure modules and BPF programs get their own virtual
>>> regions",
>>> what about something as below (applied against this patch)?
>>>
>>> diff --git a/arch/riscv/include/asm/pgtable.h
>>> b/arch/riscv/include/asm/pgtable.h
>>> index 380cd3a7e548..da1158f10b09 100644
>>> --- a/arch/riscv/include/asm/pgtable.h
>>> +++ b/arch/riscv/include/asm/pgtable.h
>>> @@ -31,7 +31,7 @@
>>> #define BPF_JIT_REGION_SIZE (SZ_128M)
>>> #ifdef CONFIG_64BIT
>>> #define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
>>> BPF_JIT_REGION_SIZE)
>>> -#define BPF_JIT_REGION_END (MODULES_END)
>>> +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
>>> #else
>>> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
>>> #define BPF_JIT_REGION_END (VMALLOC_END)
>>> @@ -40,7 +40,7 @@
>>> /* Modules always live before the kernel */
>>> #ifdef CONFIG_64BIT
>>> #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
>>> -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
>>> +#define MODULES_END (BPF_JIT_REGION_END)
>>> #endif
>>>
>>>
>>>
>>>>
>>>> Again, good catch, thanks,
>>>>
>>>> Alex
>>>>
>>>> > > Reported-by: Andreas Schwab <[email protected]>
>>>> > Signed-off-by: Jisheng Zhang <[email protected]>
>>>> > ---
>>>> > arch/riscv/include/asm/pgtable.h | 5 ++---
>>>> > 1 file changed, 2 insertions(+), 3 deletions(-)
>>>> > > diff --git a/arch/riscv/include/asm/pgtable.h
>>>> b/arch/riscv/include/asm/pgtable.h
>>>> > index 9469f464e71a..380cd3a7e548 100644
>>>> > --- a/arch/riscv/include/asm/pgtable.h
>>>> > +++ b/arch/riscv/include/asm/pgtable.h
>>>> > @@ -30,9 +30,8 @@
>>>> > > #define BPF_JIT_REGION_SIZE (SZ_128M)
>>>> > #ifdef CONFIG_64BIT
>>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */
>>>> > -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
>>>> > -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START +
>>>> BPF_JIT_REGION_SIZE)
>>>> > +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
>>>> BPF_JIT_REGION_SIZE)
>>>> > +#define BPF_JIT_REGION_END (MODULES_END)
>>>> > #else
>>>> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
>>>> > #define BPF_JIT_REGION_END (VMALLOC_END)
>>>> >
>>
>> This, when applied onto fixes, is breaking early boot on KASAN
>> configurations for me.
>
> Not surprising, I took a shortcut when initializing KASAN for modules,
> kernel and BPF:
>
> kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> kasan_mem_to_shadow((const void
> *)BPF_JIT_REGION_END));
>
> The kernel is then not covered, I'm taking a look at how to fix that
> properly.
>
The following based on "riscv: Introduce structure that group all
variables regarding kernel mapping" fixes the issue:
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 9daacae93e33..2a45ea909e7f 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -199,9 +199,12 @@ void __init kasan_init(void)
kasan_populate(kasan_mem_to_shadow(start),
kasan_mem_to_shadow(end));
}
- /* Populate kernel, BPF, modules mapping */
+ /* Populate BPF and modules mapping: modules mapping encompasses
BPF mapping */
kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
- kasan_mem_to_shadow((const void
*)BPF_JIT_REGION_END));
+ kasan_mem_to_shadow((const void *)MODULES_END));
+ /* Populate kernel mapping */
+ kasan_populate(kasan_mem_to_shadow((const void
*)kernel_map.virt_addr),
+ kasan_mem_to_shadow((const void
*)kernel_map.virt_addr + kernel_map.size));
Without the mentioned patch, replace kernel_map.virt_addr with
kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was
re-exposed in v6 of the patchset "Map the kernel with correct
permissions the first time".
Alex
>>
>> _______________________________________________
>> linux-riscv mailing list
>> [email protected]
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv
On Fri, 18 Jun 2021 01:27:31 +0800
Jisheng Zhang <[email protected]> wrote:
> On Thu, 17 Jun 2021 16:18:54 +0200
> Alex Ghiti <[email protected]> wrote:
>
> > Le 17/06/2021 à 10:09, Alex Ghiti a écrit :
> > > Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :
> > >> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), [email protected]
> > >> wrote:
> > >>> On Tue, 15 Jun 2021 20:54:19 +0200
> > >>> Alex Ghiti <[email protected]> wrote:
> > >>>
> > >>>> Hi Jisheng,
> > >>>
> > >>> Hi Alex,
> > >>>
> > >>>>
> > >>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
> > >>>> > From: Jisheng Zhang <[email protected]>
> > >>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid
> > >>>> breaking W^X")
> > >>>> > breaks booting with one kind of config file, I reproduced a kernel
> > >>>> panic
> > >>>> > with the config:
> > >>>> > > [ 0.138553] Unable to handle kernel paging request at virtual
> > >>>> address ffffffff81201220
> > >>>> > [ 0.139159] Oops [#1]
> > >>>> > [ 0.139303] Modules linked in:
> > >>>> > [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
> > >>>> 5.13.0-rc5-default+ #1
> > >>>> > [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
> > >>>> > [ 0.140193] epc : __memset+0xc4/0xfc
> > >>>> > [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
> > >>>> > [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :
> > >>>> ffffffe001647da0
> > >>>> > [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 :
> > >>>> ffffffff81201158
> > >>>> > [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 :
> > >>>> ffffffe001647dd0
> > >>>> > [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :
> > >>>> 0000000000000000
> > >>>> > [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 :
> > >>>> 0000000000000064
> > >>>> > [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 :
> > >>>> ffffffffffffffff
> > >>>> > [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 :
> > >>>> ffffffff81135088
> > >>>> > [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :
> > >>>> ffffffff80800438
> > >>>> > [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:
> > >>>> ffffffff806000ac
> > >>>> > [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 :
> > >>>> 0000000000000000
> > >>>> > [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
> > >>>> > [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220
> > >>>> cause: 000000000000000f
> > >>>> > [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
> > >>>> > [ 0.143859] [<ffffffff8061e984>]
> > >>>> init_default_flow_dissectors+0x22/0x60
> > >>>> > [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
> > >>>> > [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
> > >>>> > [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
> > >>>> > [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
> > >>>> > [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
> > >>>> > > After some investigation, I think I found the root cause: commit
> > >>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
> > >>>> > BPF JIT region after the kernel:
> > >>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit
> > >>>> > region sits with part of kernel .data section in one PMD size
> > >>>> mapping.
> > >>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
> > >>>> > called to make the first bpf jit prog ROX, we will make part of
> > >>>> kernel
> > >>>> > .data section RO too, so when we write to, for example memset the
> > >>>> > .data section, MMU will trigger a store page fault.
> > >>>> Good catch, we make sure no physical allocation happens between _end
> > >>>> and the next PMD aligned address, but I missed this one.
> > >>>>
> > >>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size
> > >>>> > aligned. This patch achieves this goal by restoring the BPF JIT
> > >>>> region
> > >>>> > to original position, i.e. the 128MB before kernel .text section.
> > >>>> But I disagree with your solution: I made sure modules and BPF
> > >>>> programs get their own virtual regions to avoid worst case scenario
> > >>>> where one could allocate all the space and leave nothing to the
> > >>>> other (we are limited to +- 2GB offset). Why don't just align
> > >>>> BPF_JIT_REGION_START to the next PMD aligned address?
> > >>>
> > >>> Originally, I planned to fix the issue by aligning
> > >>> BPF_JIT_REGION_START, but
> > >>> IIRC, BPF experts are adding (or have added) "Calling kernel
> > >>> functions from BPF"
> > >>> feature, there's a risk that BPF JIT region is beyond the 2GB of
> > >>> module region:
> > >>>
> > >>> ------
> > >>> module
> > >>> ------
> > >>> kernel
> > >>> ------
> > >>> BPF_JIT
> > >>>
> > >>> So I made this patch finally. In this patch, we let BPF JIT region sit
> > >>> between module and kernel.
> > >>>
> > >>> To address "make sure modules and BPF programs get their own virtual
> > >>> regions",
> > >>> what about something as below (applied against this patch)?
> > >>>
> > >>> diff --git a/arch/riscv/include/asm/pgtable.h
> > >>> b/arch/riscv/include/asm/pgtable.h
> > >>> index 380cd3a7e548..da1158f10b09 100644
> > >>> --- a/arch/riscv/include/asm/pgtable.h
> > >>> +++ b/arch/riscv/include/asm/pgtable.h
> > >>> @@ -31,7 +31,7 @@
> > >>> #define BPF_JIT_REGION_SIZE (SZ_128M)
> > >>> #ifdef CONFIG_64BIT
> > >>> #define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
> > >>> BPF_JIT_REGION_SIZE)
> > >>> -#define BPF_JIT_REGION_END (MODULES_END)
> > >>> +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
> > >>> #else
> > >>> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> > >>> #define BPF_JIT_REGION_END (VMALLOC_END)
> > >>> @@ -40,7 +40,7 @@
> > >>> /* Modules always live before the kernel */
> > >>> #ifdef CONFIG_64BIT
> > >>> #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
> > >>> -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
> > >>> +#define MODULES_END (BPF_JIT_REGION_END)
> > >>> #endif
> > >>>
> > >>>
> > >>>
> > >>>>
> > >>>> Again, good catch, thanks,
> > >>>>
> > >>>> Alex
> > >>>>
> > >>>> > > Reported-by: Andreas Schwab <[email protected]>
> > >>>> > Signed-off-by: Jisheng Zhang <[email protected]>
> > >>>> > ---
> > >>>> > arch/riscv/include/asm/pgtable.h | 5 ++---
> > >>>> > 1 file changed, 2 insertions(+), 3 deletions(-)
> > >>>> > > diff --git a/arch/riscv/include/asm/pgtable.h
> > >>>> b/arch/riscv/include/asm/pgtable.h
> > >>>> > index 9469f464e71a..380cd3a7e548 100644
> > >>>> > --- a/arch/riscv/include/asm/pgtable.h
> > >>>> > +++ b/arch/riscv/include/asm/pgtable.h
> > >>>> > @@ -30,9 +30,8 @@
> > >>>> > > #define BPF_JIT_REGION_SIZE (SZ_128M)
> > >>>> > #ifdef CONFIG_64BIT
> > >>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */
> > >>>> > -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
> > >>>> > -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START +
> > >>>> BPF_JIT_REGION_SIZE)
> > >>>> > +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
> > >>>> BPF_JIT_REGION_SIZE)
> > >>>> > +#define BPF_JIT_REGION_END (MODULES_END)
> > >>>> > #else
> > >>>> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> > >>>> > #define BPF_JIT_REGION_END (VMALLOC_END)
> > >>>> >
> > >>
> > >> This, when applied onto fixes, is breaking early boot on KASAN
> > >> configurations for me.
>
> I can reproduce this issue.
>
> > >
> > > Not surprising, I took a shortcut when initializing KASAN for modules,
> > > kernel and BPF:
> > >
> > > kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> > > kasan_mem_to_shadow((const void
> > > *)BPF_JIT_REGION_END));
> > >
> > > The kernel is then not covered, I'm taking a look at how to fix that
> > > properly.
> > >
> >
> > The following based on "riscv: Introduce structure that group all
> > variables regarding kernel mapping" fixes the issue:
> >
> > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> > index 9daacae93e33..2a45ea909e7f 100644
> > --- a/arch/riscv/mm/kasan_init.c
> > +++ b/arch/riscv/mm/kasan_init.c
> > @@ -199,9 +199,12 @@ void __init kasan_init(void)
> > kasan_populate(kasan_mem_to_shadow(start),
> > kasan_mem_to_shadow(end));
> > }
> >
> > - /* Populate kernel, BPF, modules mapping */
> > + /* Populate BPF and modules mapping: modules mapping encompasses
> > BPF mapping */
> > kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> > - kasan_mem_to_shadow((const void
> > *)BPF_JIT_REGION_END));
> > + kasan_mem_to_shadow((const void *)MODULES_END));
> > + /* Populate kernel mapping */
> > + kasan_populate(kasan_mem_to_shadow((const void
> > *)kernel_map.virt_addr),
> > + kasan_mem_to_shadow((const void
> > *)kernel_map.virt_addr + kernel_map.size));
> >
> If this patch works, maybe we can still use one kasan_populate() to cover
> kernel, bpf, and module:
>
> kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> - kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
> + kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));
>
I made a mistake. Below patch works:
kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
- kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
+ kasan_mem_to_shadow((const void *)(MODULES_VADDR + SZ_2G)));
> However, neither can solve the early boot hang issue. I'm not sure what's missing.
>
> I applied your patch on rc6 + solution below "replace kernel_map.virt_addr with kernel_virt_addr and
> kernel_map.size with load_sz"
>
>
> Thanks
>
> >
> > Without the mentioned patch, replace kernel_map.virt_addr with
> > kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was
> > re-exposed in v6 of the patchset "Map the kernel with correct
> > permissions the first time".
> >
>
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv
On Thu, 17 Jun 2021 09:23:04 +0200
Alex Ghiti <[email protected]> wrote:
> Le 16/06/2021 à 02:03, Jisheng Zhang a écrit :
> > On Tue, 15 Jun 2021 20:54:19 +0200
> > Alex Ghiti <[email protected]> wrote:
> >
> >> Hi Jisheng,
> >
> > Hi Alex,
> >
> >>
> >> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
> >>> From: Jisheng Zhang <[email protected]>
> >>>
> >>> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
> >>> breaks booting with one kind of config file, I reproduced a kernel panic
> >>> with the config:
> >>>
> >>> [ 0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
> >>> [ 0.139159] Oops [#1]
> >>> [ 0.139303] Modules linked in:
> >>> [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
> >>> [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
> >>> [ 0.140193] epc : __memset+0xc4/0xfc
> >>> [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
> >>> [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
> >>> [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
> >>> [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
> >>> [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
> >>> [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
> >>> [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
> >>> [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
> >>> [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
> >>> [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
> >>> [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
> >>> [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
> >>> [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
> >>> [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
> >>> [ 0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
> >>> [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
> >>> [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
> >>> [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
> >>> [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
> >>> [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
> >>>
> >>> After some investigation, I think I found the root cause: commit
> >>> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
> >>> BPF JIT region after the kernel:
> >>>
> >>> The &_end is unlikely aligned with PMD size, so the front bpf jit
> >>> region sits with part of kernel .data section in one PMD size mapping.
> >>> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
> >>> called to make the first bpf jit prog ROX, we will make part of kernel
> >>> .data section RO too, so when we write to, for example memset the
> >>> .data section, MMU will trigger a store page fault.
> >>
> >> Good catch, we make sure no physical allocation happens between _end and
> >> the next PMD aligned address, but I missed this one.
> >>
> >>>
> >>> To fix the issue, we need to ensure the BPF JIT region is PMD size
> >>> aligned. This patch achieves this goal by restoring the BPF JIT region
> >>> to original position, i.e. the 128MB before kernel .text section.
> >>
> >> But I disagree with your solution: I made sure modules and BPF programs
> >> get their own virtual regions to avoid worst case scenario where one
> >> could allocate all the space and leave nothing to the other (we are
> >> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to
> >> the next PMD aligned address?
> >
> > Originally, I planned to fix the issue by aligning BPF_JIT_REGION_START, but
> > IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"
> > feature, there's a risk that BPF JIT region is beyond the 2GB of module region:
> >
> > ------
> > module
> > ------
> > kernel
> > ------
> > BPF_JIT
> >
> > So I made this patch finally. In this patch, we let BPF JIT region sit
> > between module and kernel.
> >
>
> From what I read in the lwn article, I'm not sure BPF programs can call
> module functions, can someone tell us if it is possible? Or planned?
What about a module calling a BPF program? That case also wants the 2GB address limit.
>
> > To address "make sure modules and BPF programs get their own virtual regions",
> > what about something as below (applied against this patch)?
> >
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index 380cd3a7e548..da1158f10b09 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -31,7 +31,7 @@
> > #define BPF_JIT_REGION_SIZE (SZ_128M)
> > #ifdef CONFIG_64BIT
> > #define BPF_JIT_REGION_START (BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
> > -#define BPF_JIT_REGION_END (MODULES_END)
> > +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
> > #else
> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> > #define BPF_JIT_REGION_END (VMALLOC_END)
> > @@ -40,7 +40,7 @@
> > /* Modules always live before the kernel */
> > #ifdef CONFIG_64BIT
> > #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
> > -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
> > +#define MODULES_END (BPF_JIT_REGION_END)
> > #endif
> >
> >
>
> In case it is possible, I would let the vmalloc allocator handle the
> case where modules steal room from BPF: I would then not implement the
> above but rather your first patch.
>
> And do not forget to modify Documentation/riscv/vm-layout.rst
> accordingly and remove the comment "/* KASLR should leave at least 128MB
> for BPF after the kernel */"
Thanks for the comments
On Thu, 17 Jun 2021 16:18:54 +0200
Alex Ghiti <[email protected]> wrote:
> Le 17/06/2021 à 10:09, Alex Ghiti a écrit :
> > Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :
> >> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), [email protected]
> >> wrote:
> >>> On Tue, 15 Jun 2021 20:54:19 +0200
> >>> Alex Ghiti <[email protected]> wrote:
> >>>
> >>>> Hi Jisheng,
> >>>
> >>> Hi Alex,
> >>>
> >>>>
> >>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
> >>>> > From: Jisheng Zhang <[email protected]>
> >>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid
> >>>> breaking W^X")
> >>>> > breaks booting with one kind of config file, I reproduced a kernel
> >>>> panic
> >>>> > with the config:
> >>>> > > [ 0.138553] Unable to handle kernel paging request at virtual
> >>>> address ffffffff81201220
> >>>> > [ 0.139159] Oops [#1]
> >>>> > [ 0.139303] Modules linked in:
> >>>> > [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
> >>>> 5.13.0-rc5-default+ #1
> >>>> > [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
> >>>> > [ 0.140193] epc : __memset+0xc4/0xfc
> >>>> > [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
> >>>> > [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :
> >>>> ffffffe001647da0
> >>>> > [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 :
> >>>> ffffffff81201158
> >>>> > [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 :
> >>>> ffffffe001647dd0
> >>>> > [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :
> >>>> 0000000000000000
> >>>> > [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 :
> >>>> 0000000000000064
> >>>> > [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 :
> >>>> ffffffffffffffff
> >>>> > [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 :
> >>>> ffffffff81135088
> >>>> > [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :
> >>>> ffffffff80800438
> >>>> > [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:
> >>>> ffffffff806000ac
> >>>> > [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 :
> >>>> 0000000000000000
> >>>> > [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
> >>>> > [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220
> >>>> cause: 000000000000000f
> >>>> > [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
> >>>> > [ 0.143859] [<ffffffff8061e984>]
> >>>> init_default_flow_dissectors+0x22/0x60
> >>>> > [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
> >>>> > [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
> >>>> > [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
> >>>> > [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
> >>>> > [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
> >>>> > > After some investigation, I think I found the root cause: commit
> >>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
> >>>> > BPF JIT region after the kernel:
> >>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit
> >>>> > region sits with part of kernel .data section in one PMD size
> >>>> mapping.
> >>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
> >>>> > called to make the first bpf jit prog ROX, we will make part of
> >>>> kernel
> >>>> > .data section RO too, so when we write to, for example memset the
> >>>> > .data section, MMU will trigger a store page fault.
> >>>> Good catch, we make sure no physical allocation happens between _end
> >>>> and the next PMD aligned address, but I missed this one.
> >>>>
> >>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size
> >>>> > aligned. This patch achieves this goal by restoring the BPF JIT
> >>>> region
> >>>> > to original position, i.e. the 128MB before kernel .text section.
> >>>> But I disagree with your solution: I made sure modules and BPF
> >>>> programs get their own virtual regions to avoid worst case scenario
> >>>> where one could allocate all the space and leave nothing to the
> >>>> other (we are limited to +- 2GB offset). Why don't just align
> >>>> BPF_JIT_REGION_START to the next PMD aligned address?
> >>>
> >>> Originally, I planned to fix the issue by aligning
> >>> BPF_JIT_REGION_START, but
> >>> IIRC, BPF experts are adding (or have added) "Calling kernel
> >>> functions from BPF"
> >>> feature, there's a risk that BPF JIT region is beyond the 2GB of
> >>> module region:
> >>>
> >>> ------
> >>> module
> >>> ------
> >>> kernel
> >>> ------
> >>> BPF_JIT
> >>>
> >>> So I made this patch finally. In this patch, we let BPF JIT region sit
> >>> between module and kernel.
> >>>
> >>> To address "make sure modules and BPF programs get their own virtual
> >>> regions",
> >>> what about something as below (applied against this patch)?
> >>>
> >>> diff --git a/arch/riscv/include/asm/pgtable.h
> >>> b/arch/riscv/include/asm/pgtable.h
> >>> index 380cd3a7e548..da1158f10b09 100644
> >>> --- a/arch/riscv/include/asm/pgtable.h
> >>> +++ b/arch/riscv/include/asm/pgtable.h
> >>> @@ -31,7 +31,7 @@
> >>> #define BPF_JIT_REGION_SIZE (SZ_128M)
> >>> #ifdef CONFIG_64BIT
> >>> #define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
> >>> BPF_JIT_REGION_SIZE)
> >>> -#define BPF_JIT_REGION_END (MODULES_END)
> >>> +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
> >>> #else
> >>> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> >>> #define BPF_JIT_REGION_END (VMALLOC_END)
> >>> @@ -40,7 +40,7 @@
> >>> /* Modules always live before the kernel */
> >>> #ifdef CONFIG_64BIT
> >>> #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
> >>> -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
> >>> +#define MODULES_END (BPF_JIT_REGION_END)
> >>> #endif
> >>>
> >>>
> >>>
> >>>>
> >>>> Again, good catch, thanks,
> >>>>
> >>>> Alex
> >>>>
> >>>> > > Reported-by: Andreas Schwab <[email protected]>
> >>>> > Signed-off-by: Jisheng Zhang <[email protected]>
> >>>> > ---
> >>>> > arch/riscv/include/asm/pgtable.h | 5 ++---
> >>>> > 1 file changed, 2 insertions(+), 3 deletions(-)
> >>>> > > diff --git a/arch/riscv/include/asm/pgtable.h
> >>>> b/arch/riscv/include/asm/pgtable.h
> >>>> > index 9469f464e71a..380cd3a7e548 100644
> >>>> > --- a/arch/riscv/include/asm/pgtable.h
> >>>> > +++ b/arch/riscv/include/asm/pgtable.h
> >>>> > @@ -30,9 +30,8 @@
> >>>> > > #define BPF_JIT_REGION_SIZE (SZ_128M)
> >>>> > #ifdef CONFIG_64BIT
> >>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */
> >>>> > -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
> >>>> > -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START +
> >>>> BPF_JIT_REGION_SIZE)
> >>>> > +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
> >>>> BPF_JIT_REGION_SIZE)
> >>>> > +#define BPF_JIT_REGION_END (MODULES_END)
> >>>> > #else
> >>>> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> >>>> > #define BPF_JIT_REGION_END (VMALLOC_END)
> >>>> >
> >>
> >> This, when applied onto fixes, is breaking early boot on KASAN
> >> configurations for me.
I can reproduce this issue.
> >
> > Not surprising, I took a shortcut when initializing KASAN for modules,
> > kernel and BPF:
> >
> > kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> > kasan_mem_to_shadow((const void
> > *)BPF_JIT_REGION_END));
> >
> > The kernel is then not covered, I'm taking a look at how to fix that
> > properly.
> >
>
> The following based on "riscv: Introduce structure that group all
> variables regarding kernel mapping" fixes the issue:
>
> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> index 9daacae93e33..2a45ea909e7f 100644
> --- a/arch/riscv/mm/kasan_init.c
> +++ b/arch/riscv/mm/kasan_init.c
> @@ -199,9 +199,12 @@ void __init kasan_init(void)
> kasan_populate(kasan_mem_to_shadow(start),
> kasan_mem_to_shadow(end));
> }
>
> - /* Populate kernel, BPF, modules mapping */
> + /* Populate BPF and modules mapping: modules mapping encompasses
> BPF mapping */
> kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> - kasan_mem_to_shadow((const void
> *)BPF_JIT_REGION_END));
> + kasan_mem_to_shadow((const void *)MODULES_END));
> + /* Populate kernel mapping */
> + kasan_populate(kasan_mem_to_shadow((const void
> *)kernel_map.virt_addr),
> + kasan_mem_to_shadow((const void
> *)kernel_map.virt_addr + kernel_map.size));
>
If this patch works, maybe we can still use one kasan_populate() to cover
kernel, bpf, and module:
kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
- kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
+ kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));
However, neither can solve the early boot hang issue. I'm not sure what's missing.
I applied your patch on rc6 + solution below "replace kernel_map.virt_addr with kernel_virt_addr and
kernel_map.size with load_sz"
Thanks
>
> Without the mentioned patch, replace kernel_map.virt_addr with
> kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was
> re-exposed in v6 of the patchset "Map the kernel with correct
> permissions the first time".
>
On Fri, 18 Jun 2021 01:46:48 +0800
Jisheng Zhang <[email protected]> wrote:
> On Fri, 18 Jun 2021 01:27:31 +0800
> Jisheng Zhang <[email protected]> wrote:
>
> > On Thu, 17 Jun 2021 16:18:54 +0200
> > Alex Ghiti <[email protected]> wrote:
> >
> > > Le 17/06/2021 à 10:09, Alex Ghiti a écrit :
> > > > Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :
> > > >> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), [email protected]
> > > >> wrote:
> > > >>> On Tue, 15 Jun 2021 20:54:19 +0200
> > > >>> Alex Ghiti <[email protected]> wrote:
> > > >>>
> > > >>>> Hi Jisheng,
> > > >>>
> > > >>> Hi Alex,
> > > >>>
> > > >>>>
> > > >>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
> > > >>>> > From: Jisheng Zhang <[email protected]>
> > > >>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid
> > > >>>> breaking W^X")
> > > >>>> > breaks booting with one kind of config file, I reproduced a kernel
> > > >>>> panic
> > > >>>> > with the config:
> > > >>>> > > [ 0.138553] Unable to handle kernel paging request at virtual
> > > >>>> address ffffffff81201220
> > > >>>> > [ 0.139159] Oops [#1]
> > > >>>> > [ 0.139303] Modules linked in:
> > > >>>> > [ 0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
> > > >>>> 5.13.0-rc5-default+ #1
> > > >>>> > [ 0.139934] Hardware name: riscv-virtio,qemu (DT)
> > > >>>> > [ 0.140193] epc : __memset+0xc4/0xfc
> > > >>>> > [ 0.140416] ra : skb_flow_dissector_init+0x1e/0x82
> > > >>>> > [ 0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :
> > > >>>> ffffffe001647da0
> > > >>>> > [ 0.140878] gp : ffffffff81134b08 tp : ffffffe001654380 t0 :
> > > >>>> ffffffff81201158
> > > >>>> > [ 0.141156] t1 : 0000000000000002 t2 : 0000000000000154 s0 :
> > > >>>> ffffffe001647dd0
> > > >>>> > [ 0.141424] s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :
> > > >>>> 0000000000000000
> > > >>>> > [ 0.141654] a2 : 000000000000003c a3 : ffffffff81201258 a4 :
> > > >>>> 0000000000000064
> > > >>>> > [ 0.141893] a5 : ffffffff8029806c a6 : 0000000000000040 a7 :
> > > >>>> ffffffffffffffff
> > > >>>> > [ 0.142126] s2 : ffffffff81201220 s3 : 0000000000000009 s4 :
> > > >>>> ffffffff81135088
> > > >>>> > [ 0.142353] s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :
> > > >>>> ffffffff80800438
> > > >>>> > [ 0.142584] s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:
> > > >>>> ffffffff806000ac
> > > >>>> > [ 0.142810] s11: 0000000000000000 t3 : fffffffffffffffc t4 :
> > > >>>> 0000000000000000
> > > >>>> > [ 0.143042] t5 : 0000000000000155 t6 : 00000000000003ff
> > > >>>> > [ 0.143220] status: 0000000000000120 badaddr: ffffffff81201220
> > > >>>> cause: 000000000000000f
> > > >>>> > [ 0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
> > > >>>> > [ 0.143859] [<ffffffff8061e984>]
> > > >>>> init_default_flow_dissectors+0x22/0x60
> > > >>>> > [ 0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
> > > >>>> > [ 0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
> > > >>>> > [ 0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
> > > >>>> > [ 0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
> > > >>>> > [ 0.145124] ---[ end trace f1e9643daa46d591 ]---
> > > >>>> > > After some investigation, I think I found the root cause: commit
> > > >>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
> > > >>>> > BPF JIT region after the kernel:
> > > >>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit
> > > >>>> > region sits with part of kernel .data section in one PMD size
> > > >>>> mapping.
> > > >>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
> > > >>>> > called to make the first bpf jit prog ROX, we will make part of
> > > >>>> kernel
> > > >>>> > .data section RO too, so when we write to, for example memset the
> > > >>>> > .data section, MMU will trigger a store page fault.
> > > >>>> Good catch, we make sure no physical allocation happens between _end
> > > >>>> and the next PMD aligned address, but I missed this one.
> > > >>>>
> > > >>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size
> > > >>>> > aligned. This patch achieves this goal by restoring the BPF JIT
> > > >>>> region
> > > >>>> > to its original position, i.e., the 128MB before the kernel .text section.
> > > >>>> But I disagree with your solution: I made sure modules and BPF
> > > >>>> programs get their own virtual regions to avoid worst case scenario
> > > >>>> where one could allocate all the space and leave nothing to the
> > > >>>> other (we are limited to +- 2GB offset). Why don't just align
> > > >>>> BPF_JIT_REGION_START to the next PMD aligned address?
> > > >>>
> > > >>> Originally, I planned to fix the issue by aligning
> > > >>> BPF_JIT_REGION_START, but
> > > >>> IIRC, BPF experts are adding (or have added) "Calling kernel
> > > >>> functions from BPF"
> > > >>> feature, there's a risk that BPF JIT region is beyond the 2GB of
> > > >>> module region:
> > > >>>
> > > >>> ------
> > > >>> module
> > > >>> ------
> > > >>> kernel
> > > >>> ------
> > > >>> BPF_JIT
> > > >>>
> > > >>> So I made this patch finally. In this patch, we let BPF JIT region sit
> > > >>> between module and kernel.
> > > >>>
> > > >>> To address "make sure modules and BPF programs get their own virtual
> > > >>> regions",
> > > >>> what about something as below (applied against this patch)?
> > > >>>
> > > >>> diff --git a/arch/riscv/include/asm/pgtable.h
> > > >>> b/arch/riscv/include/asm/pgtable.h
> > > >>> index 380cd3a7e548..da1158f10b09 100644
> > > >>> --- a/arch/riscv/include/asm/pgtable.h
> > > >>> +++ b/arch/riscv/include/asm/pgtable.h
> > > >>> @@ -31,7 +31,7 @@
> > > >>> #define BPF_JIT_REGION_SIZE (SZ_128M)
> > > >>> #ifdef CONFIG_64BIT
> > > >>> #define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
> > > >>> BPF_JIT_REGION_SIZE)
> > > >>> -#define BPF_JIT_REGION_END (MODULES_END)
> > > >>> +#define BPF_JIT_REGION_END (PFN_ALIGN((unsigned long)&_start))
> > > >>> #else
> > > >>> #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> > > >>> #define BPF_JIT_REGION_END (VMALLOC_END)
> > > >>> @@ -40,7 +40,7 @@
> > > >>> /* Modules always live before the kernel */
> > > >>> #ifdef CONFIG_64BIT
> > > >>> #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
> > > >>> -#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
> > > >>> +#define MODULES_END (BPF_JIT_REGION_END)
> > > >>> #endif
> > > >>>
> > > >>>
> > > >>>
> > > >>>>
> > > >>>> Again, good catch, thanks,
> > > >>>>
> > > >>>> Alex
> > > >>>>
> > > >>>> > > Reported-by: Andreas Schwab <[email protected]>
> > > >>>> > Signed-off-by: Jisheng Zhang <[email protected]>
> > > >>>> > ---
> > > >>>> > arch/riscv/include/asm/pgtable.h | 5 ++---
> > > >>>> > 1 file changed, 2 insertions(+), 3 deletions(-)
> > > >>>> > > diff --git a/arch/riscv/include/asm/pgtable.h
> > > >>>> b/arch/riscv/include/asm/pgtable.h
> > > >>>> > index 9469f464e71a..380cd3a7e548 100644
> > > >>>> > --- a/arch/riscv/include/asm/pgtable.h
> > > >>>> > +++ b/arch/riscv/include/asm/pgtable.h
> > > >>>> > @@ -30,9 +30,8 @@
> > > >>>> > > #define BPF_JIT_REGION_SIZE (SZ_128M)
> > > >>>> > #ifdef CONFIG_64BIT
> > > >>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */
> > > >>>> > -#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end)
> > > >>>> > -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START +
> > > >>>> BPF_JIT_REGION_SIZE)
> > > >>>> > +#define BPF_JIT_REGION_START (BPF_JIT_REGION_END -
> > > >>>> BPF_JIT_REGION_SIZE)
> > > >>>> > +#define BPF_JIT_REGION_END (MODULES_END)
> > > >>>> > #else
> > > >>>> > #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> > > >>>> > #define BPF_JIT_REGION_END (VMALLOC_END)
> > > >>>> >
> > > >>
> > > >> This, when applied onto fixes, is breaking early boot on KASAN
> > > >> configurations for me.
> >
> > I can reproduce this issue.
> >
> > > >
> > > > Not surprising, I took a shortcut when initializing KASAN for modules,
> > > > kernel and BPF:
> > > >
> > > > kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> > > > kasan_mem_to_shadow((const void
> > > > *)BPF_JIT_REGION_END));
> > > >
> > > > The kernel is then not covered, I'm taking a look at how to fix that
> > > > properly.
> > > >
> > >
> > > The following based on "riscv: Introduce structure that group all
> > > variables regarding kernel mapping" fixes the issue:
> > >
> > > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> > > index 9daacae93e33..2a45ea909e7f 100644
> > > --- a/arch/riscv/mm/kasan_init.c
> > > +++ b/arch/riscv/mm/kasan_init.c
> > > @@ -199,9 +199,12 @@ void __init kasan_init(void)
> > > kasan_populate(kasan_mem_to_shadow(start),
> > > kasan_mem_to_shadow(end));
> > > }
> > >
> > > - /* Populate kernel, BPF, modules mapping */
> > > + /* Populate BPF and modules mapping: modules mapping encompasses
> > > BPF mapping */
> > > kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> > > - kasan_mem_to_shadow((const void
> > > *)BPF_JIT_REGION_END));
> > > + kasan_mem_to_shadow((const void *)MODULES_END));
> > > + /* Populate kernel mapping */
> > > + kasan_populate(kasan_mem_to_shadow((const void
> > > *)kernel_map.virt_addr),
> > > + kasan_mem_to_shadow((const void
> > > *)kernel_map.virt_addr + kernel_map.size));
> > >
> > If this patch works, maybe we can still use one kasan_populate() to cover
> > kernel, bpf, and module:
> >
> > kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> > - kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
> > + kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));
> >
>
> I made a mistake. Below patch works:
>
> kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
> - kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
> + kasan_mem_to_shadow((const void *)(MODULES_VADDR + SZ_2G)));
This isn't the key point. I know the reason now: kasan_init() has local variables
named _start and _end, and MODULES_VADDR is defined as:
#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
So MODULES_VADDR isn't what we expected. To fix it, we must rename the local
variables.
>
> > However, neither can solve the early boot hang issue. I'm not sure what's missing.
> >
> > I applied your patch on rc6 + solution below "replace kernel_map.virt_addr with kernel_virt_addr and
> > kernel_map.size with load_sz"
> >
> >
> > Thanks
> >
> > >
> > > Without the mentioned patch, replace kernel_map.virt_addr with
> > > kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was
> > > re-exposed in v6 of the patchset "Map the kernel with correct
> > > permissions the first time".
> > >
> >
> >
> > _______________________________________________
> > linux-riscv mailing list
> > [email protected]
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
>