2020-03-24 07:31:30

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 0/8] Support KASLR for RISC-V

This patch series implements KASLR for RISC-V. It copies the kernel image to
a proper, random place, and makes all harts go to the new destination.

This patch series depends on the patch 'riscv: Introduce CONFIG_RELOCATABLE',
but with a small change that makes PAGE_OFFSET constant, so that all of the
memory remains available after moving the kernel's physical address. This
series also depends on 'Support strict kernel memory permissions for security'.

Zong Li (8):
riscv/kaslr: add interface to get kaslr offset
riscv/kaslr: introduce functions to clear page table
riscv/kaslr: support KASLR infrastructure
riscv/kaslr: randomize the kernel image offset
riscv/kaslr: support sparse memory model
riscv/kaslr: clear the original kernel image
riscv/kaslr: add cmdline support to disable KASLR
riscv/kaslr: dump out kernel offset information on panic

arch/riscv/Kconfig | 15 ++
arch/riscv/include/asm/kaslr.h | 12 +
arch/riscv/include/asm/page.h | 5 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/head.S | 39 +++
arch/riscv/kernel/kaslr.c | 442 +++++++++++++++++++++++++++++++++
arch/riscv/kernel/setup.c | 23 ++
arch/riscv/mm/init.c | 115 ++++++++-
8 files changed, 651 insertions(+), 2 deletions(-)
create mode 100644 arch/riscv/include/asm/kaslr.h
create mode 100644 arch/riscv/kernel/kaslr.c

--
2.25.1


2020-03-24 07:31:34

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 1/8] riscv/kaslr: add interface to get kaslr offset

Add interface to get the random offset.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/include/asm/page.h | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 92848e172a40..e2c2020f0a8d 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -101,6 +101,11 @@ extern unsigned long kernel_virt_addr;
extern unsigned long max_low_pfn;
extern unsigned long min_low_pfn;

+static inline unsigned long get_kaslr_offset(void)
+{
+ return kernel_virt_addr - PAGE_OFFSET;
+}
+
#define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset))
#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset)

--
2.25.1

2020-03-24 07:31:43

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 2/8] riscv/kaslr: introduce functions to clear page table

In KASLR, we need to re-create the page table after getting a random
destination. Introduce clear functions to clear the old content. Also,
page table entries only allow a value to be written when they are empty,
so we have to clear the early page table.

This patch is a preparation to support KASLR.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/mm/init.c | 54 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index ace5d74fd939..51e263c04fa2 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -315,6 +315,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
#define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
+#define clear_pgd_next_mapping(__nextp) clear_pmd(__nextp)
#define fixmap_pgd_next fixmap_pmd
#else
#define pgd_next_t pte_t
@@ -322,6 +323,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
#define get_pgd_next_virt(__pa) get_pte_virt(__pa)
#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
+#define clear_pgd_next_mapping(__nextp) clear_pte(__nextp)
#define fixmap_pgd_next fixmap_pte
#endif

@@ -361,6 +363,58 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
return PMD_SIZE;
}

+#ifdef CONFIG_RANDOMIZE_BASE
+static void __init clear_pte(pte_t *ptep)
+{
+ unsigned int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(ptep[i]))
+ ptep[i] = __pte(0);
+}
+
+static void __init clear_pmd(pmd_t *pmdp)
+{
+ unsigned int i;
+ pte_t *ptep;
+ phys_addr_t pte_phys;
+ uintptr_t kaslr_offset = get_kaslr_offset();
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmdp[i])) {
+ if (pmd_leaf(pmdp[i])) {
+ pmd_clear(&pmdp[i]);
+ } else {
+ pte_phys = PFN_PHYS(_pmd_pfn(pmdp[i]));
+ ptep = get_pte_virt(pte_phys + kaslr_offset);
+ clear_pte(ptep);
+ pmd_clear(&pmdp[i]);
+ }
+ }
+}
+
+static void __init clear_pgd(pgd_t *pgdp)
+{
+ unsigned int i;
+ pgd_next_t *nextp;
+ phys_addr_t next_phys;
+ uintptr_t kaslr_offset = get_kaslr_offset();
+
+ for (i = 0; i < PTRS_PER_PGD; i++)
+ if (pgd_val(pgdp[i]) != 0) {
+ if (pgd_leaf(pgd_val(pgdp[i]))) {
+ set_pgd(&pgdp[i], __pgd(0));
+ } else {
+ next_phys = PFN_PHYS(_pgd_pfn(pgdp[i]));
+ nextp = get_pgd_next_virt(next_phys +
+ kaslr_offset);
+ clear_pgd_next_mapping(nextp);
+ set_pgd(&pgdp[i], __pgd(0));
+ }
+ }
+}
+#endif
+
/*
* setup_vm() is called from head.S with MMU-off.
*
--
2.25.1

2020-03-24 07:32:00

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 7/8] riscv/kaslr: add cmdline support to disable KASLR

Provide a cmdline parameter 'nokaslr' to disable KASLR.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/kernel/kaslr.c | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)

diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
index 0bd30831c455..6920727e4b4a 100644
--- a/arch/riscv/kernel/kaslr.c
+++ b/arch/riscv/kernel/kaslr.c
@@ -156,6 +156,36 @@ static __init u64 kaslr_get_seed(void)
return ret;
}

+static __init const u8 *kaslr_get_cmdline(void)
+{
+ static const u8 default_cmdline[] __initconst = CONFIG_CMDLINE;
+
+ if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) {
+ int node;
+ const u8 *prop;
+
+ node = fdt_path_offset(dtb_early_va, "/chosen");
+ if (node < 0)
+ goto out;
+
+ prop = fdt_getprop(dtb_early_va, node, "bootargs", NULL);
+ if (!prop)
+ goto out;
+
+ return prop;
+ }
+
+out:
+ return default_cmdline;
+}
+
+static __init bool kaslr_is_disabled(void)
+{
+ const u8 *cmdline = kaslr_get_cmdline();
+
+ return strstr(cmdline, "nokaslr") != NULL;
+}
+
static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
uintptr_t e2)
{
@@ -379,6 +409,10 @@ uintptr_t __init kaslr_early_init(void)
if (!seed)
return 0;

+ /* Check whether disable kaslr by cmdline. */
+ if (kaslr_is_disabled())
+ return 0;
+
/* Get the random number for kaslr offset. */
kaslr_offset = get_random_offset(seed, kernel_size);

--
2.25.1

2020-03-24 07:32:04

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 8/8] riscv/kaslr: dump out kernel offset information on panic

Dump out the kernel offset on panic to help debug the kernel.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/kernel/setup.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)

diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 913d25e4b9fa..3ce50bf628ba 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -85,3 +85,26 @@ void __init setup_arch(char **cmdline_p)

riscv_fill_hwcap();
}
+
+static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
+ void *p)
+{
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
+ get_kaslr_offset(), PAGE_OFFSET);
+
+ return 0;
+}
+
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && get_kaslr_offset() > 0)
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+
+ return 0;
+}
+__initcall(register_kernel_offset_dumper);
--
2.25.1

2020-03-24 07:32:32

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 3/8] riscv/kaslr: support KASLR infrastructure

This patch supports the KASLR implementation. It copies the kernel image to
a proper, random place, and makes all harts go to the new destination.

After KASLR initialization, secondary harts go to the new destination
to wait for their stack pointer to be set up by the main hart; the main hart
re-creates the early page table and does the relocation by going back to
setup_vm again.

We separate the randomization process from this patch, so the kernel
offset is not randomized yet; it is just hardcoded to a meaningless number here.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/Kconfig | 15 +++++++++++
arch/riscv/kernel/Makefile | 2 ++
arch/riscv/kernel/head.S | 39 +++++++++++++++++++++++++++
arch/riscv/kernel/kaslr.c | 55 ++++++++++++++++++++++++++++++++++++++
arch/riscv/mm/init.c | 53 +++++++++++++++++++++++++++++++++++-
5 files changed, 163 insertions(+), 1 deletion(-)
create mode 100644 arch/riscv/kernel/kaslr.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index aea03ac470c8..8f566b40ea1e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -183,6 +183,21 @@ config RELOCATABLE
relocation pass at runtime even if the kernel is loaded at the
same address it was linked at.

+config RANDOMIZE_BASE
+ bool "Randomize the address of the kernel image"
+ depends on MMU
+ select MODULE_SECTIONS if MODULES
+ select RELOCATABLE
+ help
+ Randomizes the virtual address at which the kernel image is
+ loaded, as a security feature that deters exploit attempts
+ relying on knowledge of the location of kernel internals.
+
+ It is the job of previous stage to provide entropy, by passing a
+ random u64 value in /chosen/kaslr-seed at kernel entry.
+
+ If unsure, say N.
+
source "arch/riscv/Kconfig.socs"

menu "Platform type"
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index d189bd3d8501..8f62732b1135 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -45,4 +45,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o
obj-$(CONFIG_RISCV_SBI) += sbi.o

+obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+
clean:
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index cb4a6e2d3793..5191e528d813 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -113,9 +113,12 @@ clear_bss_done:
la a2, boot_cpu_hartid
REG_S a0, (a2)

+.align 2
+early_page_table:
/* Initialize page tables and relocate to virtual addresses */
la sp, init_thread_union + THREAD_SIZE
mv a0, s1
+
call setup_vm
#ifdef CONFIG_MMU
la a0, early_pg_dir
@@ -127,6 +130,29 @@ clear_bss_done:
sw zero, TASK_TI_CPU(tp)
la sp, init_thread_union + THREAD_SIZE

+#ifdef CONFIG_RANDOMIZE_BASE
+ /* KASRL initialization. Try to get a random kernel offset. */
+ call kaslr_early_init
+
+ /* If return value equals to zero, we don't need to randomize kernel */
+ beqz a0, 1f
+
+ la a1, early_page_table
+ add a1, a1, a0
+ la a0, va_pa_offset
+ REG_L a0, 0(a0)
+ sub a1, a1, a0
+ mv a0, s1
+
+ /*
+ * Go to new kernel image destination, and disable MMU to re-create
+ * early page table and do relocation.
+ */
+ csrw CSR_TVEC, a1
+ csrw CSR_SATP, x0
+1:
+#endif
+
#ifdef CONFIG_KASAN
call kasan_early_init
#endif
@@ -194,6 +220,19 @@ relocate:
la a3, .Lsecondary_park
csrw CSR_TVEC, a3

+#ifdef CONFIG_RANDOMIZE_BASE
+ /*
+ * Wait winning hart to tell secondary harts where is the new
+ * destination to go.
+ */
+.Lwait_for_next_target:
+ la a3, secondary_next_target
+ REG_L a3, 0(a3)
+ beqz a3, .Lwait_for_next_target
+ jr a3
+.global secondary_random_target
+secondary_random_target:
+#endif
slli a3, a0, LGREG
la a1, __cpu_up_stack_pointer
la a2, __cpu_up_task_pointer
diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
new file mode 100644
index 000000000000..281b5fcca5c8
--- /dev/null
+++ b/arch/riscv/kernel/kaslr.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 SiFive
+ * Copyright (C) 2020 Zong Li <[email protected]>
+ */
+
+#include <linux/libfdt.h>
+#include <linux/timex.h>
+#include <linux/random.h>
+#include <linux/set_memory.h>
+#include <asm/cacheflush.h>
+
+extern char _start[], _end[];
+extern void secondary_random_target(void);
+extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
+
+uintptr_t secondary_next_target __initdata;
+static uintptr_t kaslr_offset __initdata;
+
+uintptr_t __init kaslr_early_init(void)
+{
+ uintptr_t dest_start, dest_end;
+ uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
+
+ /* Get zero value at second time to avoid doing randomization again. */
+ if (kaslr_offset)
+ return 0;
+
+ /* Get the random number for kaslr offset. */
+ kaslr_offset = 0x10000000;
+
+ /* Update kernel_virt_addr for get_kaslr_offset. */
+ kernel_virt_addr += kaslr_offset;
+
+ if (kaslr_offset) {
+ dest_start = (uintptr_t) (PAGE_OFFSET + kaslr_offset);
+ dest_end = dest_start + kernel_size;
+
+ /* Create the new destination mapping for kernel image. */
+ kaslr_create_page_table(dest_start, dest_end);
+
+ /* Copy kernel image from orignial location. */
+ memcpy((void *)dest_start, (void *)_start, kernel_size);
+ flush_icache_range(dest_start, dest_end);
+
+ /* Make secondary harts jump to new kernel image destination. */
+ WRITE_ONCE(secondary_next_target,
+ __pa_symbol(secondary_random_target) + kaslr_offset);
+ } else {
+ WRITE_ONCE(secondary_next_target,
+ __pa_symbol(secondary_random_target));
+ }
+
+ return kaslr_offset;
+}
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 51e263c04fa2..2f5b25f02b6c 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -413,6 +413,41 @@ static void __init clear_pgd(pgd_t *pgdp)
}
}
}
+
+static void __init clear_page_tables(void)
+{
+ clear_pgd(early_pg_dir);
+ clear_pgd(trampoline_pg_dir);
+}
+
+void __init kaslr_create_page_table(uintptr_t start, uintptr_t end)
+{
+ pgd_next_t *nextp;
+ phys_addr_t next_phys;
+ uintptr_t pgd_index, va;
+ phys_addr_t pa = __pa(PAGE_OFFSET) + get_kaslr_offset();
+ uintptr_t map_size =
+ best_map_size(__pa(PAGE_OFFSET), MAX_EARLY_MAPPING_SIZE);
+
+ /* Expolit early_pg_dir and early_pmd during using early page table. */
+ for (va = start; va < end; va += map_size, pa += map_size) {
+ pgd_index = pgd_index(va);
+
+ if (pgd_val(early_pg_dir[pgd_index]) == 0) {
+ next_phys = alloc_pgd_next(va);
+ early_pg_dir[pgd_index] =
+ pfn_pgd(PFN_DOWN(next_phys), PAGE_TABLE);
+ nextp = (pgd_next_t *)(__va(next_phys));
+ memset(nextp, 0, PAGE_SIZE);
+ } else {
+ next_phys = PFN_PHYS(_pgd_pfn(early_pg_dir[pgd_index]));
+ nextp = (pgd_next_t *)(__va(next_phys));
+ }
+
+ create_pgd_next_mapping(nextp, va, pa, map_size,
+ PAGE_KERNEL_EXEC);
+ }
+}
#endif

/*
@@ -489,7 +524,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);

va_pa_offset = kernel_virt_addr - load_pa;
- pfn_base = PFN_DOWN(load_pa);
+
+ /*
+ * Update pfn_base only if pfn_base is empty. It's avoid to mess up it
+ * when re-enter this function by KASLR.
+ */
+ if (!pfn_base)
+ pfn_base = PFN_DOWN(load_pa);

#ifdef CONFIG_RELOCATABLE
/*
@@ -513,6 +554,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
BUG_ON((load_pa % map_size) != 0);
BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);

+#ifdef CONFIG_RANDOMIZE_BASE
+ /*
+ * Enter setup_vm twice if there is a legal random destination in KASLR,
+ * Reach here at second time, Clear page table because PTE entris allow
+ * writing when it's empty.
+ */
+ if (get_kaslr_offset())
+ clear_page_tables();
+#endif
+
/* Setup early PGD for fixmap */
create_pgd_mapping(early_pg_dir, FIXADDR_START,
(uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
--
2.25.1

2020-03-24 07:32:32

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset

Entropy is derived from the banner and timer; it is better than nothing
but not secure enough, so the previous boot stage may pass entropy via the
device tree /chosen/kaslr-seed node.

We limit the randomization range to within 1GB, so we can exploit the early
page table to map the new destination of the kernel image. Additionally, the
kernel offset needs 2M alignment to ensure it fits in a PMD page table entry.

We also check whether the kernel offset is safe by avoiding overlaps
with the dtb, initrd and reserved memory regions.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
arch/riscv/mm/init.c | 2 +-
2 files changed, 273 insertions(+), 3 deletions(-)

diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
index 281b5fcca5c8..9ec2b608eb7f 100644
--- a/arch/riscv/kernel/kaslr.c
+++ b/arch/riscv/kernel/kaslr.c
@@ -11,23 +11,293 @@
#include <asm/cacheflush.h>

extern char _start[], _end[];
+extern void *dtb_early_va;
+extern phys_addr_t dtb_early_pa;
extern void secondary_random_target(void);
extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);

uintptr_t secondary_next_target __initdata;
static uintptr_t kaslr_offset __initdata;

+static const __init u32 *get_reg_address(int root_cells,
+ const u32 *value, u64 *result)
+{
+ int cell;
+ *result = 0;
+
+ for (cell = root_cells; cell > 0; --cell)
+ *result = (*result << 32) + fdt32_to_cpu(*value++);
+
+ return value;
+}
+
+static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
+ int *size_cell)
+{
+ int node = fdt_path_offset(dtb_early_va, path);
+ fdt64_t *prop;
+
+ if (node < 0)
+ return -EINVAL;
+
+ prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
+ if (!prop)
+ return -EINVAL;
+ *addr_cell = fdt32_to_cpu(*prop);
+
+ prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
+ if (!prop)
+ return -EINVAL;
+ *size_cell = fdt32_to_cpu(*prop);
+
+ return node;
+}
+
+static __init void kaslr_get_mem_info(uintptr_t *mem_start,
+ uintptr_t *mem_size)
+{
+ int node, root, addr_cells, size_cells;
+ u64 base, size;
+
+ /* Get root node's address cells and size cells. */
+ root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
+ if (root < 0)
+ return;
+
+ /* Get memory base address and size. */
+ fdt_for_each_subnode(node, dtb_early_va, root) {
+ const char *dev_type;
+ const u32 *reg;
+
+ dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
+ if (!dev_type)
+ continue;
+
+ if (!strcmp(dev_type, "memory")) {
+ reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
+ if (!reg)
+ return;
+
+ reg = get_reg_address(addr_cells, reg, &base);
+ reg = get_reg_address(size_cells, reg, &size);
+
+ *mem_start = base;
+ *mem_size = size;
+
+ break;
+ }
+ }
+}
+
+/* Return a default seed if there is no HW generator. */
+static u64 kaslr_default_seed = ULL(-1);
+static __init u64 kaslr_get_seed(void)
+{
+ int node, len;
+ fdt64_t *prop;
+ u64 ret;
+
+ node = fdt_path_offset(dtb_early_va, "/chosen");
+ if (node < 0)
+ return kaslr_default_seed++;
+
+ prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
+ if (!prop || len != sizeof(u64))
+ return kaslr_default_seed++;
+
+ ret = fdt64_to_cpu(*prop);
+
+ /* Re-write to zero for checking whether get seed at second time */
+ *prop = 0;
+
+ return ret;
+}
+
+static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
+ uintptr_t e2)
+{
+ return e1 >= s2 && e2 >= s1;
+}
+
+static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
+ uintptr_t end_addr)
+{
+ int node, rsv_mem, addr_cells, size_cells;
+
+ /* Get the reserved-memory node. */
+ rsv_mem = get_node_addr_size_cells("/reserved-memory",
+ &addr_cells,
+ &size_cells);
+ if (rsv_mem < 0)
+ return false;
+
+ /* Get memory base address and size. */
+ fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
+ uint64_t base, size;
+ const uint32_t *reg;
+
+ reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
+ if (!reg)
+ return 0;
+
+ reg = get_reg_address(addr_cells, reg, &base);
+ reg = get_reg_address(size_cells, reg, &size);
+
+ if (is_overlap(start_addr, end_addr, base, base + size))
+ return true;
+ }
+
+ return false;
+}
+
+static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
+{
+ int node;
+ uintptr_t initrd_start, initrd_end;
+ fdt64_t *prop;
+
+ node = fdt_path_offset(dtb_early_va, "/chosen");
+ if (node < 0)
+ return false;
+
+ prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
+ if (!prop)
+ return false;
+
+ initrd_start = fdt64_to_cpu(*prop);
+
+ prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
+ if (!prop)
+ return false;
+
+ initrd_end = fdt64_to_cpu(*prop);
+
+ return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
+}
+
+static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
+{
+ uintptr_t dtb_start = dtb_early_pa;
+ uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
+
+ return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
+}
+
+static __init bool has_regions_overlapping(uintptr_t start_addr,
+ uintptr_t end_addr)
+{
+ if (is_overlap_dtb(start_addr, end_addr))
+ return true;
+
+ if (is_overlap_initrd(start_addr, end_addr))
+ return true;
+
+ if (is_overlap_reserved_mem(start_addr, end_addr))
+ return true;
+
+ return false;
+}
+
+static inline __init unsigned long get_legal_offset(int random_index,
+ int max_index,
+ uintptr_t mem_start,
+ uintptr_t kernel_size)
+{
+ uintptr_t start_addr, end_addr;
+ int idx, stop_idx;
+
+ idx = stop_idx = random_index;
+
+ do {
+ start_addr = mem_start + idx * SZ_2M + kernel_size;
+ end_addr = start_addr + kernel_size;
+
+ /* Check overlap to other regions. */
+ if (!has_regions_overlapping(start_addr, end_addr))
+ return idx * SZ_2M + kernel_size;
+
+ if (idx-- < 0)
+ idx = max_index;
+
+ } while (idx != stop_idx);
+
+ return 0;
+}
+
+static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
+{
+ size_t i;
+ uintptr_t *ptr = (uintptr_t *) area;
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+#define MEM_RESERVE_START __pa(PAGE_OFFSET)
+static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
+{
+ uintptr_t mem_start = 0, mem_size= 0, random_size;
+ uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
+ int index;
+ u64 random = 0;
+ cycles_t time_base;
+
+ /* Attempt to create a simple but unpredictable starting entropy */
+ random = rotate_xor(random, linux_banner, strlen(linux_banner));
+
+ /*
+ * If there is no HW random number generator, use timer to get a random
+ * number. This is better than nothing but not enough secure.
+ */
+ time_base = get_cycles() << 32;
+ time_base ^= get_cycles();
+ random = rotate_xor(random, &time_base, sizeof(time_base));
+
+ if (seed)
+ random = rotate_xor(random, &seed, sizeof(seed));
+
+ kaslr_get_mem_info(&mem_start, &mem_size);
+ if (!mem_size)
+ return 0;
+
+ if (mem_start < MEM_RESERVE_START) {
+ mem_size -= MEM_RESERVE_START - mem_start;
+ mem_start = MEM_RESERVE_START;
+ }
+
+ /*
+ * Limit randomization range within 1G, so we can exploit
+ * early_pmd/early_pte during early page table phase.
+ */
+ random_size = min_t(u64,
+ mem_size - (kernel_size_align * 2),
+ SZ_1G - (kernel_size_align * 2));
+
+ /* The index of 2M block in whole avaliable region */
+ index = random % (random_size / SZ_2M);
+
+ return get_legal_offset(index, random_size / SZ_2M,
+ mem_start, kernel_size_align);
+}
+
uintptr_t __init kaslr_early_init(void)
{
+ u64 seed;
uintptr_t dest_start, dest_end;
uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;

/* Get zero value at second time to avoid doing randomization again. */
- if (kaslr_offset)
+ seed = kaslr_get_seed();
+ if (!seed)
return 0;

/* Get the random number for kaslr offset. */
- kaslr_offset = 0x10000000;
+ kaslr_offset = get_random_offset(seed, kernel_size);

/* Update kernel_virt_addr for get_kaslr_offset. */
kernel_virt_addr += kaslr_offset;
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 2f5b25f02b6c..34c6ecf2c599 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -125,7 +125,7 @@ static void __init setup_initrd(void)
}
#endif /* CONFIG_BLK_DEV_INITRD */

-static phys_addr_t dtb_early_pa __initdata;
+phys_addr_t dtb_early_pa __initdata;

void __init setup_bootmem(void)
{
--
2.25.1

2020-03-24 07:32:43

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 5/8] riscv/kaslr: support sparse memory model

For the sparse memory model, we select a random memory node first, then get
a random offset within that node. In the flat memory model case, there is
only one memory node to pick from.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/kernel/kaslr.c | 139 ++++++++++++++++++++++++++++----------
1 file changed, 105 insertions(+), 34 deletions(-)

diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
index 9ec2b608eb7f..59001d6fdfc3 100644
--- a/arch/riscv/kernel/kaslr.c
+++ b/arch/riscv/kernel/kaslr.c
@@ -55,8 +55,9 @@ static __init int get_node_addr_size_cells(const char *path, int *addr_cell,

static __init void kaslr_get_mem_info(uintptr_t *mem_start,
uintptr_t *mem_size)
+ uintptr_t kernel_size, int find_index)
{
- int node, root, addr_cells, size_cells;
+ int node, root, addr_cells, size_cells, idx = 0;
u64 base, size;

/* Get root node's address cells and size cells. */
@@ -81,14 +82,56 @@ static __init void kaslr_get_mem_info(uintptr_t *mem_start,
reg = get_reg_address(addr_cells, reg, &base);
reg = get_reg_address(size_cells, reg, &size);

- *mem_start = base;
- *mem_size = size;
+ if (size < (kernel_size * 2))
+ continue;

- break;
+ if (idx == find_index) {
+ *mem_start = base;
+ *mem_size = size;
+ break;
+ }
+
+ idx++;
}
}
}

+static __init int get_memory_nodes_num(uintptr_t kernel_size)
+{
+ int node, root, addr_cells, size_cells, total_nodes = 0;
+ u64 base, size;
+
+ /* Get root node's address cells and size cells. */
+ root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
+ if (root < 0)
+ return 0;
+
+ /* Get memory base address and size. */
+ fdt_for_each_subnode(node, dtb_early_va, root) {
+ const char *dev_type;
+ const u32 *reg;
+
+ dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
+ if (!dev_type)
+ continue;
+
+ if (!strcmp(dev_type, "memory")) {
+ reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
+ if (!reg)
+ return 0;
+
+ reg = get_reg_address(addr_cells, reg, &base);
+ reg = get_reg_address(size_cells, reg, &size);
+
+ /* Candidate ensures that it don't overlap itself. */
+ if (size > kernel_size * 2)
+ total_nodes++;
+ }
+ }
+
+ return total_nodes;
+}
+
/* Return a default seed if there is no HW generator. */
static u64 kaslr_default_seed = ULL(-1);
static __init u64 kaslr_get_seed(void)
@@ -198,10 +241,11 @@ static __init bool has_regions_overlapping(uintptr_t start_addr,
return false;
}

-static inline __init unsigned long get_legal_offset(int random_index,
- int max_index,
- uintptr_t mem_start,
- uintptr_t kernel_size)
+static inline __init unsigned long get_legal_offset_in_node(int random_index,
+ int max_index,
+ uintptr_t mem_start,
+ uintptr_t
+ kernel_size)
{
uintptr_t start_addr, end_addr;
int idx, stop_idx;
@@ -214,7 +258,8 @@ static inline __init unsigned long get_legal_offset(int random_index,

/* Check overlap to other regions. */
if (!has_regions_overlapping(start_addr, end_addr))
- return idx * SZ_2M + kernel_size;
+ return idx * SZ_2M + kernel_size + (mem_start -
+ __pa(PAGE_OFFSET));

if (idx-- < 0)
idx = max_index;
@@ -224,6 +269,56 @@ static inline __init unsigned long get_legal_offset(int random_index,
return 0;
}

+#define MEM_RESERVE_START __pa(PAGE_OFFSET)
+static inline __init unsigned long get_legal_offset(u64 random,
+ uintptr_t kernel_size)
+{
+ int mem_nodes, idx, stop_idx, index;
+ uintptr_t mem_start = 0, mem_size = 0, random_size, ret;
+
+ mem_nodes = get_memory_nodes_num(kernel_size);
+
+ idx = stop_idx = random % mem_nodes;
+
+ do {
+ kaslr_get_mem_info(&mem_start, &mem_size, kernel_size, idx);
+
+ if (!mem_size)
+ return 0;
+
+ if (mem_start < MEM_RESERVE_START) {
+ mem_size -= MEM_RESERVE_START - mem_start;
+ mem_start = MEM_RESERVE_START;
+ }
+
+ /*
+ * Limit randomization range within 1G, so we can exploit
+ * early_pmd/early_pte during early page table phase.
+ */
+ random_size = min_t(u64,
+ mem_size - (kernel_size * 2),
+ SZ_1G - (kernel_size * 2));
+
+ if (!random_size || random_size < SZ_2M)
+ return 0;
+
+ /* The index of 2M block in whole available region */
+ index = random % (random_size / SZ_2M);
+
+ ret =
+ get_legal_offset_in_node(index, random_size / SZ_2M,
+ mem_start, kernel_size);
+ if (ret)
+ break;
+
+ if (idx-- < 0)
+ idx = mem_nodes - 1;
+
+ } while (idx != stop_idx);
+
+ return ret;
+}
+
static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
{
size_t i;
@@ -238,12 +333,9 @@ static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
return hash;
}

-#define MEM_RESERVE_START __pa(PAGE_OFFSET)
static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
{
- uintptr_t mem_start = 0, mem_size= 0, random_size;
uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
- int index;
u64 random = 0;
cycles_t time_base;

@@ -261,28 +353,7 @@ static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
if (seed)
random = rotate_xor(random, &seed, sizeof(seed));

- kaslr_get_mem_info(&mem_start, &mem_size);
- if (!mem_size)
- return 0;
-
- if (mem_start < MEM_RESERVE_START) {
- mem_size -= MEM_RESERVE_START - mem_start;
- mem_start = MEM_RESERVE_START;
- }
-
- /*
- * Limit randomization range within 1G, so we can exploit
- * early_pmd/early_pte during early page table phase.
- */
- random_size = min_t(u64,
- mem_size - (kernel_size_align * 2),
- SZ_1G - (kernel_size_align * 2));
-
- /* The index of 2M block in whole avaliable region */
- index = random % (random_size / SZ_2M);
-
- return get_legal_offset(index, random_size / SZ_2M,
- mem_start, kernel_size_align);
+ return get_legal_offset(random, kernel_size_align);
}

uintptr_t __init kaslr_early_init(void)
--
2.25.1

2020-03-24 07:32:51

by Zong Li

[permalink] [raw]
Subject: [PATCH RFC 6/8] riscv/kaslr: clear the original kernel image

After completing the final page table, we can clear the original kernel image
and remove its executable permission.

Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/include/asm/kaslr.h | 12 ++++++++++++
arch/riscv/kernel/kaslr.c | 12 ++++++++++++
arch/riscv/mm/init.c | 6 ++++++
3 files changed, 30 insertions(+)
create mode 100644 arch/riscv/include/asm/kaslr.h

diff --git a/arch/riscv/include/asm/kaslr.h b/arch/riscv/include/asm/kaslr.h
new file mode 100644
index 000000000000..b165fe71dd4a
--- /dev/null
+++ b/arch/riscv/include/asm/kaslr.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 SiFive
+ * Copyright (C) 2020 Zong Li <[email protected]>
+ */
+
+#ifndef _ASM_RISCV_KASLR_H
+#define _ASM_RISCV_KASLR_H
+
+void __init kaslr_late_init(void);
+
+#endif /* _ASM_RISCV_KASLR_H */
diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
index 59001d6fdfc3..0bd30831c455 100644
--- a/arch/riscv/kernel/kaslr.c
+++ b/arch/riscv/kernel/kaslr.c
@@ -356,6 +356,18 @@ static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
return get_legal_offset(random, kernel_size_align);
}

+void __init kaslr_late_init(void)
+{
+ uintptr_t kernel_size;
+
+ /* Clear original kernel image. */
+ if (kaslr_offset) {
+ kernel_size = (uintptr_t) _end - (uintptr_t) _start;
+ memset((void *)PAGE_OFFSET, 0, kernel_size);
+ set_memory_nx(PAGE_OFFSET, kaslr_offset >> PAGE_SHIFT);
+ }
+}
+
uintptr_t __init kaslr_early_init(void)
{
u64 seed;
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 34c6ecf2c599..08e2ce170533 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -15,6 +15,7 @@
#include <linux/set_memory.h>
#ifdef CONFIG_RELOCATABLE
#include <linux/elf.h>
+#include <asm/kaslr.h>
#endif

#include <asm/fixmap.h>
@@ -649,6 +650,11 @@ static void __init setup_vm_final(void)
/* Move to swapper page table */
csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
local_flush_tlb_all();
+
+#ifdef CONFIG_RANDOMIZE_BASE
+ /* Clear orignial kernel image and set the right permission. */
+ kaslr_late_init();
+#endif
}

void free_initmem(void)
--
2.25.1

2020-04-07 05:09:13

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 1/8] riscv/kaslr: add interface to get kaslr offset

On 3/24/20 3:30 AM, Zong Li wrote:
> Add interface to get the random offset.
>
> Signed-off-by: Zong Li <[email protected]>
> ---
> arch/riscv/include/asm/page.h | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 92848e172a40..e2c2020f0a8d 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -101,6 +101,11 @@ extern unsigned long kernel_virt_addr;
> extern unsigned long max_low_pfn;
> extern unsigned long min_low_pfn;
>
> +static inline unsigned long get_kaslr_offset(void)
> +{
> + return kernel_virt_addr - PAGE_OFFSET;
> +}
> +
> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset))
> #define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset)
>
>

No problem for this one:

Reviewed-by: Alexandre Ghiti <[email protected]>

Thanks,

Alex

2020-04-07 05:10:07

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 2/8] riscv/kaslr: introduce functions to clear page table

On 3/24/20 3:30 AM, Zong Li wrote:
> In KASLR, we need to re-create page table after getting a random
> destination. Introduce clear function to clear old content. Also, the
> page table entries allow writing value when it's empty, so we have to
> clear the early page table.
>
> This patch is a preparation to support KASLR.
>
> Signed-off-by: Zong Li <[email protected]>
> ---
> arch/riscv/mm/init.c | 54 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 54 insertions(+)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index ace5d74fd939..51e263c04fa2 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -315,6 +315,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
> #define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> +#define clear_pgd_next_mapping(__nextp) clear_pmd(__nextp)
> #define fixmap_pgd_next fixmap_pmd
> #else
> #define pgd_next_t pte_t
> @@ -322,6 +323,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
> #define get_pgd_next_virt(__pa) get_pte_virt(__pa)
> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> +#define clear_pgd_next_mapping(__nextp) clear_pte(__nextp)
> #define fixmap_pgd_next fixmap_pte
> #endif
>
> @@ -361,6 +363,58 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
> return PMD_SIZE;
> }
>
> +#ifdef CONFIG_RANDOMIZE_BASE
> +static void __init clear_pte(pte_t *ptep)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++)
> + if (!pte_none(ptep[i]))
> + ptep[i] = __pte(0);
> +}
> +
> +static void __init clear_pmd(pmd_t *pmdp)
> +{
> + unsigned int i;
> + pte_t *ptep;
> + phys_addr_t pte_phys;
> + uintptr_t kaslr_offset = get_kaslr_offset();
> +
> + for (i = 0; i < PTRS_PER_PMD; i++)
> + if (!pmd_none(pmdp[i])) {
> + if (pmd_leaf(pmdp[i])) {
> + pmd_clear(&pmdp[i]);
> + } else {
> + pte_phys = PFN_PHYS(_pmd_pfn(pmdp[i]));
> + ptep = get_pte_virt(pte_phys + kaslr_offset);
> + clear_pte(ptep);
> + pmd_clear(&pmdp[i]);
> + }
> + }
> +}
> +
> +static void __init clear_pgd(pgd_t *pgdp)
> +{
> + unsigned int i;
> + pgd_next_t *nextp;
> + phys_addr_t next_phys;
> + uintptr_t kaslr_offset = get_kaslr_offset();
> +
> + for (i = 0; i < PTRS_PER_PGD; i++)
> + if (pgd_val(pgdp[i]) != 0) {
> + if (pgd_leaf(pgd_val(pgdp[i]))) {
> + set_pgd(&pgdp[i], __pgd(0));
> + } else {
> + next_phys = PFN_PHYS(_pgd_pfn(pgdp[i]));
> + nextp = get_pgd_next_virt(next_phys +
> + kaslr_offset);
> + clear_pgd_next_mapping(nextp);
> + set_pgd(&pgdp[i], __pgd(0));
> + }
> + }
> +}
> +#endif
> +
> /*
> * setup_vm() is called from head.S with MMU-off.
> *
>

If this is only for clearing early page tables, a memset is way easier
as there is only one page per level to clear.

Alex

2020-04-07 05:11:27

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 3/8] riscv/kaslr: support KASLR infrastructure



On 3/24/20 3:30 AM, Zong Li wrote:
> This patch support KASLR implementation. It copies kernel image to a
> proper and random place, and make all harts go to the new destination.
>
> After KASLR initialization, secondary harts go to the new destination
> to wait their stack pointer to be setup by main hart, main hart goes to
> re-create the early page table and doing relocation by going back to
> setup_vm again.
>
> We separate the randomization process from this patch, so the kernel
> offset was not randomized yet, it just hardcode a meanless number here.
>
> Signed-off-by: Zong Li <[email protected]>
> ---
> arch/riscv/Kconfig | 15 +++++++++++
> arch/riscv/kernel/Makefile | 2 ++
> arch/riscv/kernel/head.S | 39 +++++++++++++++++++++++++++
> arch/riscv/kernel/kaslr.c | 55 ++++++++++++++++++++++++++++++++++++++
> arch/riscv/mm/init.c | 53 +++++++++++++++++++++++++++++++++++-
> 5 files changed, 163 insertions(+), 1 deletion(-)
> create mode 100644 arch/riscv/kernel/kaslr.c
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index aea03ac470c8..8f566b40ea1e 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -183,6 +183,21 @@ config RELOCATABLE
> relocation pass at runtime even if the kernel is loaded at the
> same address it was linked at.
>
> +config RANDOMIZE_BASE
> + bool "Randomize the address of the kernel image"
> + depends on MMU
> + select MODULE_SECTIONS if MODULES
> + select RELOCATABLE
> + help
> + Randomizes the virtual address at which the kernel image is
> + loaded, as a security feature that deters exploit attempts
> + relying on knowledge of the location of kernel internals.
> +
> + It is the job of previous stage to provide entropy, by passing a
> + random u64 value in /chosen/kaslr-seed at kernel entry.
> +
> + If unsure, say N.
> +
> source "arch/riscv/Kconfig.socs"
>
> menu "Platform type"
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index d189bd3d8501..8f62732b1135 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -45,4 +45,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
> obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o
> obj-$(CONFIG_RISCV_SBI) += sbi.o
>
> +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
> +
> clean:
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index cb4a6e2d3793..5191e528d813 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -113,9 +113,12 @@ clear_bss_done:
> la a2, boot_cpu_hartid
> REG_S a0, (a2)
>
> +.align 2

Why do you need this new alignment constraint ?

> +early_page_table:
> /* Initialize page tables and relocate to virtual addresses */
> la sp, init_thread_union + THREAD_SIZE
> mv a0, s1
> +

Newline ?

> call setup_vm
> #ifdef CONFIG_MMU
> la a0, early_pg_dir
> @@ -127,6 +130,29 @@ clear_bss_done:
> sw zero, TASK_TI_CPU(tp)
> la sp, init_thread_union + THREAD_SIZE
>
> +#ifdef CONFIG_RANDOMIZE_BASE
> + /* KASRL initialization. Try to get a random kernel offset. */
> + call kaslr_early_init
> +
> + /* If return value equals to zero, we don't need to randomize kernel */
> + beqz a0, 1f
> +
> + la a1, early_page_table
> + add a1, a1, a0
> + la a0, va_pa_offset
> + REG_L a0, 0(a0)
> + sub a1, a1, a0
> + mv a0, s1
> +
> + /*
> + * Go to new kernel image destination, and disable MMU to re-create
> + * early page table and do relocation.
> + */
> + csrw CSR_TVEC, a1
> + csrw CSR_SATP, x0
> +1:
> +#endif
> +
> #ifdef CONFIG_KASAN
> call kasan_early_init
> #endif
> @@ -194,6 +220,19 @@ relocate:
> la a3, .Lsecondary_park
> csrw CSR_TVEC, a3
>
> +#ifdef CONFIG_RANDOMIZE_BASE
> + /*
> + * Wait winning hart to tell secondary harts where is the new
> + * destination to go.
> + */
> +.Lwait_for_next_target:
> + la a3, secondary_next_target
> + REG_L a3, 0(a3)
> + beqz a3, .Lwait_for_next_target
> + jr a3
> +.global secondary_random_target
> +secondary_random_target:
> +#endif
> slli a3, a0, LGREG
> la a1, __cpu_up_stack_pointer
> la a2, __cpu_up_task_pointer
> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> new file mode 100644
> index 000000000000..281b5fcca5c8
> --- /dev/null
> +++ b/arch/riscv/kernel/kaslr.c
> @@ -0,0 +1,55 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2020 SiFive
> + * Copyright (C) 2020 Zong Li <[email protected]>
> + */
> +
> +#include <linux/libfdt.h>
> +#include <linux/timex.h>
> +#include <linux/random.h>
> +#include <linux/set_memory.h>
> +#include <asm/cacheflush.h>
> +
> +extern char _start[], _end[];
> +extern void secondary_random_target(void);
> +extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> +
> +uintptr_t secondary_next_target __initdata;
> +static uintptr_t kaslr_offset __initdata;
> +
> +uintptr_t __init kaslr_early_init(void)
> +{
> + uintptr_t dest_start, dest_end;
> + uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> +
> + /* Get zero value at second time to avoid doing randomization again. */
> + if (kaslr_offset)
> + return 0;
> +
> + /* Get the random number for kaslr offset. */
> + kaslr_offset = 0x10000000;

For clarity, you could use a macro or something like that for this constant.

> +
> + /* Update kernel_virt_addr for get_kaslr_offset. */
> + kernel_virt_addr += kaslr_offset;

This could be done after you test if kaslr_offset is null below.

> +
> + if (kaslr_offset) {
> + dest_start = (uintptr_t) (PAGE_OFFSET + kaslr_offset);
> + dest_end = dest_start + kernel_size;

dest_end = dest_start + kernel_size - 1;

> +
> + /* Create the new destination mapping for kernel image. */
> + kaslr_create_page_table(dest_start, dest_end);
> +
> + /* Copy kernel image from orignial location. */
> + memcpy((void *)dest_start, (void *)_start, kernel_size);
> + flush_icache_range(dest_start, dest_end); > +
> + /* Make secondary harts jump to new kernel image destination. */
> + WRITE_ONCE(secondary_next_target,
> + __pa_symbol(secondary_random_target) + kaslr_offset);

Don't you need to sync the secondary harts' icache with the main hart's dcache here?

> + } else {
> + WRITE_ONCE(secondary_next_target,
> + __pa_symbol(secondary_random_target));
> + }
> +
> + return kaslr_offset;
> +}
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 51e263c04fa2..2f5b25f02b6c 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -413,6 +413,41 @@ static void __init clear_pgd(pgd_t *pgdp)
> }
> }
> }
> +
> +static void __init clear_page_tables(void)
> +{
> + clear_pgd(early_pg_dir);
> + clear_pgd(trampoline_pg_dir);
> +}

The early page table and the trampoline page table each consist of one page
per level, so I confirm that a memset to 0 is easier here.

> +
> +void __init kaslr_create_page_table(uintptr_t start, uintptr_t end)
> +{
> + pgd_next_t *nextp;
> + phys_addr_t next_phys;
> + uintptr_t pgd_index, va;
> + phys_addr_t pa = __pa(PAGE_OFFSET) + get_kaslr_offset();
> + uintptr_t map_size =
> + best_map_size(__pa(PAGE_OFFSET), MAX_EARLY_MAPPING_SIZE);
> +
> + /* Expolit early_pg_dir and early_pmd during using early page table. */
> + for (va = start; va < end; va += map_size, pa += map_size) {
> + pgd_index = pgd_index(va);
> +
> + if (pgd_val(early_pg_dir[pgd_index]) == 0) {
> + next_phys = alloc_pgd_next(va);
> + early_pg_dir[pgd_index] =
> + pfn_pgd(PFN_DOWN(next_phys), PAGE_TABLE);
> + nextp = (pgd_next_t *)(__va(next_phys));
> + memset(nextp, 0, PAGE_SIZE);
> + } else {
> + next_phys = PFN_PHYS(_pgd_pfn(early_pg_dir[pgd_index]));
> + nextp = (pgd_next_t *)(__va(next_phys));
> + }
> +
> + create_pgd_next_mapping(nextp, va, pa, map_size,
> + PAGE_KERNEL_EXEC);
> + }
> +}
> #endif

I may be missing something here: I don't see where the mappings you create
here for the new kernel are used between this point and setup_vm.

If I read correctly, if kaslr_early_init returns a random offset, you
disable mmu and then call setup_vm which will recreate early page tables
anyway.

>
> /*
> @@ -489,7 +524,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
>
> va_pa_offset = kernel_virt_addr - load_pa;
> - pfn_base = PFN_DOWN(load_pa);
> +
> + /*
> + * Update pfn_base only if pfn_base is empty. It's avoid to mess up it
> + * when re-enter this function by KASLR.
> + */
> + if (!pfn_base)
> + pfn_base = PFN_DOWN(load_pa);
>
> #ifdef CONFIG_RELOCATABLE
> /*
> @@ -513,6 +554,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> BUG_ON((load_pa % map_size) != 0);
> BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
>
> +#ifdef CONFIG_RANDOMIZE_BASE
> + /*
> + * Enter setup_vm twice if there is a legal random destination in KASLR,
> + * Reach here at second time, Clear page table because PTE entris allow
> + * writing when it's empty.
> + */
> + if (get_kaslr_offset())
> + clear_page_tables();
> +#endif
> +
> /* Setup early PGD for fixmap */
> create_pgd_mapping(early_pg_dir, FIXADDR_START,
> (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>

Just an idea, maybe worthless, but couldn't we benefit from kexec here ?
That's quite the same: copy a new kernel from the current kernel in some
new memory locations and then jump to it. We could pass the computed
random offset as a very early kernel parameter so that setup_vm would
only be called once (per kernel).

Alex

2020-04-07 05:13:06

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 6/8] riscv/kaslr: clear the original kernel image

On 3/24/20 3:30 AM, Zong Li wrote:
> After completing final page table, we can clear original kernel image
> and remove executable permission.
>
> Signed-off-by: Zong Li <[email protected]>
> ---
> arch/riscv/include/asm/kaslr.h | 12 ++++++++++++
> arch/riscv/kernel/kaslr.c | 12 ++++++++++++
> arch/riscv/mm/init.c | 6 ++++++
> 3 files changed, 30 insertions(+)
> create mode 100644 arch/riscv/include/asm/kaslr.h
>
> diff --git a/arch/riscv/include/asm/kaslr.h b/arch/riscv/include/asm/kaslr.h
> new file mode 100644
> index 000000000000..b165fe71dd4a
> --- /dev/null
> +++ b/arch/riscv/include/asm/kaslr.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2020 SiFive
> + * Copyright (C) 2020 Zong Li <[email protected]>
> + */
> +
> +#ifndef _ASM_RISCV_KASLR_H
> +#define _ASM_RISCV_KASLR_H
> +
> +void __init kaslr_late_init(void);
> +
> +#endif /* _ASM_RISCV_KASLR_H */
> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> index 59001d6fdfc3..0bd30831c455 100644
> --- a/arch/riscv/kernel/kaslr.c
> +++ b/arch/riscv/kernel/kaslr.c
> @@ -356,6 +356,18 @@ static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> return get_legal_offset(random, kernel_size_align);
> }
>
> +void __init kaslr_late_init(void)
> +{
> + uintptr_t kernel_size;
> +
> + /* Clear original kernel image. */
> + if (kaslr_offset) {
> + kernel_size = (uintptr_t) _end - (uintptr_t) _start;

kernel_size = (uintptr_t) _end - (uintptr_t) _start + 1;

> + memset((void *)PAGE_OFFSET, 0, kernel_size);

I have been thinking again about our discussion regarding PAGE_OFFSET:
PAGE_OFFSET actually points to the address where the kernel was loaded,
not the beginning of memory, that's a bit weird.

Just saying that here, because it took me a few seconds to remember that
and understand what you were doing here.

> + set_memory_nx(PAGE_OFFSET, kaslr_offset >> PAGE_SHIFT);

Again, I have certainly missed something, but when do you use the old kernel
mappings?

> + }
> +}
> +
> uintptr_t __init kaslr_early_init(void)
> {
> u64 seed;
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 34c6ecf2c599..08e2ce170533 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -15,6 +15,7 @@
> #include <linux/set_memory.h>
> #ifdef CONFIG_RELOCATABLE
> #include <linux/elf.h>
> +#include <asm/kaslr.h>
> #endif
>
> #include <asm/fixmap.h>
> @@ -649,6 +650,11 @@ static void __init setup_vm_final(void)
> /* Move to swapper page table */
> csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> local_flush_tlb_all();
> +
> +#ifdef CONFIG_RANDOMIZE_BASE
> + /* Clear orignial kernel image and set the right permission. */
> + kaslr_late_init();
> +#endif
> }
>
> void free_initmem(void)
>

Alex

2020-04-07 05:13:12

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset


On 3/24/20 3:30 AM, Zong Li wrote:
> Entropy is derived from the banner and timer, it is better than nothing
> but not enough secure, so previous stage may pass entropy via the device
> tree /chosen/kaslr-seed node.
>
> We limit randomization range within 1GB, so we can exploit early page
> table to map new destination of kernel image. Additionally, the kernel
> offset need 2M alignment to ensure it's good in PMD page table.
>
> We also checks the kernel offset whether it's safe by avoiding to
> overlaps with dtb, initrd and reserved memory regions.
>

That maybe changes the way my sv48 patchset will be implemented: I can't
get user preference (3-level or 4-level) by any means, device-tree or
kernel parameter.

But I don't see how you could get a random offset without info from the
device tree anyway (reserved memory regions especially), so maybe I
could parse dtb for allowing the user to choose. I'll move this
discussion to the sv48 introduction.

> Signed-off-by: Zong Li <[email protected]>
> ---
> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
> arch/riscv/mm/init.c | 2 +-
> 2 files changed, 273 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> index 281b5fcca5c8..9ec2b608eb7f 100644
> --- a/arch/riscv/kernel/kaslr.c
> +++ b/arch/riscv/kernel/kaslr.c
> @@ -11,23 +11,293 @@
> #include <asm/cacheflush.h>
>
> extern char _start[], _end[];
> +extern void *dtb_early_va;
> +extern phys_addr_t dtb_early_pa;
> extern void secondary_random_target(void);
> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
>
> uintptr_t secondary_next_target __initdata;
> static uintptr_t kaslr_offset __initdata;
>
> +static const __init u32 *get_reg_address(int root_cells,
> + const u32 *value, u64 *result)
> +{
> + int cell;
> + *result = 0;
> +
> + for (cell = root_cells; cell > 0; --cell)
> + *result = (*result << 32) + fdt32_to_cpu(*value++);
> +
> + return value;
> +}
> +
> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
> + int *size_cell)
> +{
> + int node = fdt_path_offset(dtb_early_va, path);
> + fdt64_t *prop;
> +
> + if (node < 0)
> + return -EINVAL;
> +
> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
> + if (!prop)
> + return -EINVAL;
> + *addr_cell = fdt32_to_cpu(*prop);
> +
> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
> + if (!prop)
> + return -EINVAL;
> + *size_cell = fdt32_to_cpu(*prop);
> +
> + return node;
> +}
> +
> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
> + uintptr_t *mem_size)
> +{
> + int node, root, addr_cells, size_cells;
> + u64 base, size;
> +
> + /* Get root node's address cells and size cells. */
> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
> + if (root < 0)
> + return;
> +
> + /* Get memory base address and size. */
> + fdt_for_each_subnode(node, dtb_early_va, root) {
> + const char *dev_type;
> + const u32 *reg;
> +
> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
> + if (!dev_type)
> + continue;
> +
> + if (!strcmp(dev_type, "memory")) {
> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> + if (!reg)
> + return;
> +
> + reg = get_reg_address(addr_cells, reg, &base);
> + reg = get_reg_address(size_cells, reg, &size);
> +
> + *mem_start = base;
> + *mem_size = size;
> +
> + break;
> + }
> + }
> +}
> +
> +/* Return a default seed if there is no HW generator. */
> +static u64 kaslr_default_seed = ULL(-1);
> +static __init u64 kaslr_get_seed(void)
> +{
> + int node, len;
> + fdt64_t *prop;
> + u64 ret;
> +
> + node = fdt_path_offset(dtb_early_va, "/chosen");
> + if (node < 0)
> + return kaslr_default_seed++;
> +
> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
> + if (!prop || len != sizeof(u64))
> + return kaslr_default_seed++;
> +
> + ret = fdt64_to_cpu(*prop);
> +
> + /* Re-write to zero for checking whether get seed at second time */
> + *prop = 0;
> +
> + return ret;
> +}
> +
> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
> + uintptr_t e2)
> +{
> + return e1 >= s2 && e2 >= s1;
> +}

Inline this function or use a macro maybe.

> +
> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
> + uintptr_t end_addr)
> +{
> + int node, rsv_mem, addr_cells, size_cells;
> +
> + /* Get the reserved-memory node. */
> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
> + &addr_cells,
> + &size_cells);
> + if (rsv_mem < 0)
> + return false;
> +
> + /* Get memory base address and size. */
> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
> + uint64_t base, size;
> + const uint32_t *reg;
> +
> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> + if (!reg)
> + return 0;
> +
> + reg = get_reg_address(addr_cells, reg, &base);
> + reg = get_reg_address(size_cells, reg, &size);
> +
> + if (is_overlap(start_addr, end_addr, base, base + size))
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
> +{
> + int node;
> + uintptr_t initrd_start, initrd_end;
> + fdt64_t *prop;
> +
> + node = fdt_path_offset(dtb_early_va, "/chosen");
> + if (node < 0)
> + return false;
> +
> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
> + if (!prop)
> + return false;
> +
> + initrd_start = fdt64_to_cpu(*prop);
> +
> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
> + if (!prop)
> + return false;
> +
> + initrd_end = fdt64_to_cpu(*prop);
> +
> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
> +}
> +
> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
> +{
> + uintptr_t dtb_start = dtb_early_pa;
> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
> +
> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
> +}
> +
> +static __init bool has_regions_overlapping(uintptr_t start_addr,
> + uintptr_t end_addr)
> +{
> + if (is_overlap_dtb(start_addr, end_addr))
> + return true;
> +
> + if (is_overlap_initrd(start_addr, end_addr))
> + return true;
> +
> + if (is_overlap_reserved_mem(start_addr, end_addr))
> + return true;
> +
> + return false;
> +}
> +
> +static inline __init unsigned long get_legal_offset(int random_index,
> + int max_index,
> + uintptr_t mem_start,
> + uintptr_t kernel_size)
> +{
> + uintptr_t start_addr, end_addr;
> + int idx, stop_idx;
> +
> + idx = stop_idx = random_index;
> +
> + do {
> + start_addr = mem_start + idx * SZ_2M + kernel_size;
> + end_addr = start_addr + kernel_size;
> +
> + /* Check overlap to other regions. */
> + if (!has_regions_overlapping(start_addr, end_addr))
> + return idx * SZ_2M + kernel_size;
> +
> + if (idx-- < 0)
> + idx = max_index;

Isn't the fallback to max_index a security breach ? Because at some
point, the kernel will be loaded at this specific address.

> +
> + } while (idx != stop_idx);
> +
> + return 0;
> +}
> +
> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
> +{
> + size_t i;
> + uintptr_t *ptr = (uintptr_t *) area;
> +
> + for (i = 0; i < size / sizeof(hash); i++) {
> + /* Rotate by odd number of bits and XOR. */
> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
> + hash ^= ptr[i];
> + }
> +
> + return hash;
> +}
> +
> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> +{
> + uintptr_t mem_start = 0, mem_size= 0, random_size;
> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
> + int index;
> + u64 random = 0;
> + cycles_t time_base;
> +
> + /* Attempt to create a simple but unpredictable starting entropy */
> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
> +
> + /*
> + * If there is no HW random number generator, use timer to get a random
> + * number. This is better than nothing but not enough secure.
> + */
> + time_base = get_cycles() << 32;
> + time_base ^= get_cycles();
> + random = rotate_xor(random, &time_base, sizeof(time_base));
> +
> + if (seed)
> + random = rotate_xor(random, &seed, sizeof(seed));
> +
> + kaslr_get_mem_info(&mem_start, &mem_size);
> + if (!mem_size)
> + return 0;
> +
> + if (mem_start < MEM_RESERVE_START) {
> + mem_size -= MEM_RESERVE_START - mem_start;
> + mem_start = MEM_RESERVE_START;
> + }
> +
> + /*
> + * Limit randomization range within 1G, so we can exploit
> + * early_pmd/early_pte during early page table phase.
> + */
> + random_size = min_t(u64,
> + mem_size - (kernel_size_align * 2),
> + SZ_1G - (kernel_size_align * 2));

The pgdir size is 30 bits in sv39, but it's 39 bits in sv48, so you should use
the PGDIR_SIZE macro here.

> +
> + /* The index of 2M block in whole avaliable region */
> + index = random % (random_size / SZ_2M);
> +
> + return get_legal_offset(index, random_size / SZ_2M,
> + mem_start, kernel_size_align);
> +}
> +
> uintptr_t __init kaslr_early_init(void)
> {
> + u64 seed;
> uintptr_t dest_start, dest_end;
> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>
> /* Get zero value at second time to avoid doing randomization again. */
> - if (kaslr_offset)
> + seed = kaslr_get_seed();
> + if (!seed)
> return 0;
>
> /* Get the random number for kaslr offset. */
> - kaslr_offset = 0x10000000;
> + kaslr_offset = get_random_offset(seed, kernel_size);
>
> /* Update kernel_virt_addr for get_kaslr_offset. */
> kernel_virt_addr += kaslr_offset;
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 2f5b25f02b6c..34c6ecf2c599 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
> }
> #endif /* CONFIG_BLK_DEV_INITRD */
>
> -static phys_addr_t dtb_early_pa __initdata;
> +phys_addr_t dtb_early_pa __initdata;
>
> void __init setup_bootmem(void)
> {
>

Alex

2020-04-07 09:19:33

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 2/8] riscv/kaslr: introduce functions to clear page table

On Tue, Apr 7, 2020 at 1:09 PM Alex Ghiti <[email protected]> wrote:
>
> On 3/24/20 3:30 AM, Zong Li wrote:
> > In KASLR, we need to re-create page table after getting a random
> > destination. Introduce clear function to clear old content. Also, the
> > page table entries allow writing value when it's empty, so we have to
> > clear the early page table.
> >
> > This patch is a preparation to support KASLR.
> >
> > Signed-off-by: Zong Li <[email protected]>
> > ---
> > arch/riscv/mm/init.c | 54 ++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 54 insertions(+)
> >
> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> > index ace5d74fd939..51e263c04fa2 100644
> > --- a/arch/riscv/mm/init.c
> > +++ b/arch/riscv/mm/init.c
> > @@ -315,6 +315,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
> > #define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
> > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> > create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> > +#define clear_pgd_next_mapping(__nextp) clear_pmd(__nextp)
> > #define fixmap_pgd_next fixmap_pmd
> > #else
> > #define pgd_next_t pte_t
> > @@ -322,6 +323,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
> > #define get_pgd_next_virt(__pa) get_pte_virt(__pa)
> > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> > create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> > +#define clear_pgd_next_mapping(__nextp) clear_pte(__nextp)
> > #define fixmap_pgd_next fixmap_pte
> > #endif
> >
> > @@ -361,6 +363,58 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
> > return PMD_SIZE;
> > }
> >
> > +#ifdef CONFIG_RANDOMIZE_BASE
> > +static void __init clear_pte(pte_t *ptep)
> > +{
> > + unsigned int i;
> > +
> > + for (i = 0; i < PTRS_PER_PTE; i++)
> > + if (!pte_none(ptep[i]))
> > + ptep[i] = __pte(0);
> > +}
> > +
> > +static void __init clear_pmd(pmd_t *pmdp)
> > +{
> > + unsigned int i;
> > + pte_t *ptep;
> > + phys_addr_t pte_phys;
> > + uintptr_t kaslr_offset = get_kaslr_offset();
> > +
> > + for (i = 0; i < PTRS_PER_PMD; i++)
> > + if (!pmd_none(pmdp[i])) {
> > + if (pmd_leaf(pmdp[i])) {
> > + pmd_clear(&pmdp[i]);
> > + } else {
> > + pte_phys = PFN_PHYS(_pmd_pfn(pmdp[i]));
> > + ptep = get_pte_virt(pte_phys + kaslr_offset);
> > + clear_pte(ptep);
> > + pmd_clear(&pmdp[i]);
> > + }
> > + }
> > +}
> > +
> > +static void __init clear_pgd(pgd_t *pgdp)
> > +{
> > + unsigned int i;
> > + pgd_next_t *nextp;
> > + phys_addr_t next_phys;
> > + uintptr_t kaslr_offset = get_kaslr_offset();
> > +
> > + for (i = 0; i < PTRS_PER_PGD; i++)
> > + if (pgd_val(pgdp[i]) != 0) {
> > + if (pgd_leaf(pgd_val(pgdp[i]))) {
> > + set_pgd(&pgdp[i], __pgd(0));
> > + } else {
> > + next_phys = PFN_PHYS(_pgd_pfn(pgdp[i]));
> > + nextp = get_pgd_next_virt(next_phys +
> > + kaslr_offset);
> > + clear_pgd_next_mapping(nextp);
> > + set_pgd(&pgdp[i], __pgd(0));
> > + }
> > + }
> > +}
> > +#endif
> > +
> > /*
> > * setup_vm() is called from head.S with MMU-off.
> > *
> >
>
> If this is only for clearing early page tables, a memset is way easier
> as there is only one page per level to clear.
>

Yes, it's a better way. Thanks.

> Alex

2020-04-07 10:36:32

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 3/8] riscv/kaslr: support KASLR infrastructure

On Tue, Apr 7, 2020 at 1:10 PM Alex Ghiti <[email protected]> wrote:
>
>
>
> On 3/24/20 3:30 AM, Zong Li wrote:
> > This patch support KASLR implementation. It copies kernel image to a
> > proper and random place, and make all harts go to the new destination.
> >
> > After KASLR initialization, secondary harts go to the new destination
> > to wait their stack pointer to be setup by main hart, main hart goes to
> > re-create the early page table and doing relocation by going back to
> > setup_vm again.
> >
> > We separate the randomization process from this patch, so the kernel
> > offset was not randomized yet, it just hardcode a meanless number here.
> >
> > Signed-off-by: Zong Li <[email protected]>
> > ---
> > arch/riscv/Kconfig | 15 +++++++++++
> > arch/riscv/kernel/Makefile | 2 ++
> > arch/riscv/kernel/head.S | 39 +++++++++++++++++++++++++++
> > arch/riscv/kernel/kaslr.c | 55 ++++++++++++++++++++++++++++++++++++++
> > arch/riscv/mm/init.c | 53 +++++++++++++++++++++++++++++++++++-
> > 5 files changed, 163 insertions(+), 1 deletion(-)
> > create mode 100644 arch/riscv/kernel/kaslr.c
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index aea03ac470c8..8f566b40ea1e 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -183,6 +183,21 @@ config RELOCATABLE
> > relocation pass at runtime even if the kernel is loaded at the
> > same address it was linked at.
> >
> > +config RANDOMIZE_BASE
> > + bool "Randomize the address of the kernel image"
> > + depends on MMU
> > + select MODULE_SECTIONS if MODULES
> > + select RELOCATABLE
> > + help
> > + Randomizes the virtual address at which the kernel image is
> > + loaded, as a security feature that deters exploit attempts
> > + relying on knowledge of the location of kernel internals.
> > +
> > + It is the job of previous stage to provide entropy, by passing a
> > + random u64 value in /chosen/kaslr-seed at kernel entry.
> > +
> > + If unsure, say N.
> > +
> > source "arch/riscv/Kconfig.socs"
> >
> > menu "Platform type"
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index d189bd3d8501..8f62732b1135 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -45,4 +45,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
> > obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o
> > obj-$(CONFIG_RISCV_SBI) += sbi.o
> >
> > +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
> > +
> > clean:
> > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> > index cb4a6e2d3793..5191e528d813 100644
> > --- a/arch/riscv/kernel/head.S
> > +++ b/arch/riscv/kernel/head.S
> > @@ -113,9 +113,12 @@ clear_bss_done:
> > la a2, boot_cpu_hartid
> > REG_S a0, (a2)
> >
> > +.align 2
>
> Why do you need this new alignment constraint ?

We need to ensure the target of the trap vector is 4-byte aligned.

>
> > +early_page_table:
> > /* Initialize page tables and relocate to virtual addresses */
> > la sp, init_thread_union + THREAD_SIZE
> > mv a0, s1
> > +
>
> Newline ?

Remove it in the next version. Thanks.

>
> > call setup_vm
> > #ifdef CONFIG_MMU
> > la a0, early_pg_dir
> > @@ -127,6 +130,29 @@ clear_bss_done:
> > sw zero, TASK_TI_CPU(tp)
> > la sp, init_thread_union + THREAD_SIZE
> >
> > +#ifdef CONFIG_RANDOMIZE_BASE
> > + /* KASRL initialization. Try to get a random kernel offset. */
> > + call kaslr_early_init
> > +
> > + /* If return value equals to zero, we don't need to randomize kernel */
> > + beqz a0, 1f
> > +
> > + la a1, early_page_table
> > + add a1, a1, a0
> > + la a0, va_pa_offset
> > + REG_L a0, 0(a0)
> > + sub a1, a1, a0
> > + mv a0, s1
> > +
> > + /*
> > + * Go to new kernel image destination, and disable MMU to re-create
> > + * early page table and do relocation.
> > + */
> > + csrw CSR_TVEC, a1
> > + csrw CSR_SATP, x0
> > +1:
> > +#endif
> > +
> > #ifdef CONFIG_KASAN
> > call kasan_early_init
> > #endif
> > @@ -194,6 +220,19 @@ relocate:
> > la a3, .Lsecondary_park
> > csrw CSR_TVEC, a3
> >
> > +#ifdef CONFIG_RANDOMIZE_BASE
> > + /*
> > + * Wait winning hart to tell secondary harts where is the new
> > + * destination to go.
> > + */
> > +.Lwait_for_next_target:
> > + la a3, secondary_next_target
> > + REG_L a3, 0(a3)
> > + beqz a3, .Lwait_for_next_target
> > + jr a3
> > +.global secondary_random_target
> > +secondary_random_target:
> > +#endif
> > slli a3, a0, LGREG
> > la a1, __cpu_up_stack_pointer
> > la a2, __cpu_up_task_pointer
> > diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> > new file mode 100644
> > index 000000000000..281b5fcca5c8
> > --- /dev/null
> > +++ b/arch/riscv/kernel/kaslr.c
> > @@ -0,0 +1,55 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (C) 2020 SiFive
> > + * Copyright (C) 2020 Zong Li <[email protected]>
> > + */
> > +
> > +#include <linux/libfdt.h>
> > +#include <linux/timex.h>
> > +#include <linux/random.h>
> > +#include <linux/set_memory.h>
> > +#include <asm/cacheflush.h>
> > +
> > +extern char _start[], _end[];
> > +extern void secondary_random_target(void);
> > +extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> > +
> > +uintptr_t secondary_next_target __initdata;
> > +static uintptr_t kaslr_offset __initdata;
> > +
> > +uintptr_t __init kaslr_early_init(void)
> > +{
> > + uintptr_t dest_start, dest_end;
> > + uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> > +
> > + /* Get zero value at second time to avoid doing randomization again. */
> > + if (kaslr_offset)
> > + return 0;
> > +
> > + /* Get the random number for kaslr offset. */
> > + kaslr_offset = 0x10000000;
>
> For clarity, you could use a macro or something like that for this constant.

This is a temporary assignment for this patch. The kaslr_offset is not
randomized yet, so it is just a hardcoded, meaningless number here.
Eventually, kaslr_offset will be assigned a random number; that is
what the next patch does ('riscv/kaslr: randomize the kernel image
offset').

>
> > +
> > + /* Update kernel_virt_addr for get_kaslr_offset. */
> > + kernel_virt_addr += kaslr_offset;
>
> This could be done after you test if kaslr_offset is null below.

Yes, make sense, change it in the next version patch. Thanks.

>
> > +
> > + if (kaslr_offset) {
> > + dest_start = (uintptr_t) (PAGE_OFFSET + kaslr_offset);
> > + dest_end = dest_start + kernel_size;
>
> dest_end = dest_start + kernel_size - 1;

OK, Thanks.

>
> > +
> > + /* Create the new destination mapping for kernel image. */
> > + kaslr_create_page_table(dest_start, dest_end);
> > +
> > + /* Copy kernel image from orignial location. */
> > + memcpy((void *)dest_start, (void *)_start, kernel_size);
> > + flush_icache_range(dest_start, dest_end);
> > +
> > + /* Make secondary harts jump to new kernel image destination. */
> > + WRITE_ONCE(secondary_next_target,
> > + __pa_symbol(secondary_random_target) + kaslr_offset);
>
> Don't you need to sync secondary harts icache with main hart dcache here ?

It seems to me that secondary harts could see secondary_next_target
immediately through cache coherence, just like __cpu_up_stack_pointer
and __cpu_up_task_pointer. Could you give more detail on why we would
need to write secondary_next_target back to memory here? Thanks.

>
> > + } else {
> > + WRITE_ONCE(secondary_next_target,
> > + __pa_symbol(secondary_random_target));
> > + }
> > +
> > + return kaslr_offset;
> > +}
> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> > index 51e263c04fa2..2f5b25f02b6c 100644
> > --- a/arch/riscv/mm/init.c
> > +++ b/arch/riscv/mm/init.c
> > @@ -413,6 +413,41 @@ static void __init clear_pgd(pgd_t *pgdp)
> > }
> > }
> > }
> > +
> > +static void __init clear_page_tables(void)
> > +{
> > + clear_pgd(early_pg_dir);
> > + clear_pgd(trampoline_pg_dir);
> > +}
>
> early page table and trampoline page table consist in one page per
> level, I confirm that a memset to 0 is easier here.

yes, I'll change it. Thanks.

>
> > +
> > +void __init kaslr_create_page_table(uintptr_t start, uintptr_t end)
> > +{
> > + pgd_next_t *nextp;
> > + phys_addr_t next_phys;
> > + uintptr_t pgd_index, va;
> > + phys_addr_t pa = __pa(PAGE_OFFSET) + get_kaslr_offset();
> > + uintptr_t map_size =
> > + best_map_size(__pa(PAGE_OFFSET), MAX_EARLY_MAPPING_SIZE);
> > +
> > + /* Expolit early_pg_dir and early_pmd during using early page table. */
> > + for (va = start; va < end; va += map_size, pa += map_size) {
> > + pgd_index = pgd_index(va);
> > +
> > + if (pgd_val(early_pg_dir[pgd_index]) == 0) {
> > + next_phys = alloc_pgd_next(va);
> > + early_pg_dir[pgd_index] =
> > + pfn_pgd(PFN_DOWN(next_phys), PAGE_TABLE);
> > + nextp = (pgd_next_t *)(__va(next_phys));
> > + memset(nextp, 0, PAGE_SIZE);
> > + } else {
> > + next_phys = PFN_PHYS(_pgd_pfn(early_pg_dir[pgd_index]));
> > + nextp = (pgd_next_t *)(__va(next_phys));
> > + }
> > +
> > + create_pgd_next_mapping(nextp, va, pa, map_size,
> > + PAGE_KERNEL_EXEC);
> > + }
> > +}
> > #endif
>
> I may be missing something here: I don't see where the mappings for the
> new kernel you create here are used between here and setup_vm ?

Early page tables only create the mappings for the original kernel image
(i.e., from vmlinux_start to vmlinux_end), so the mapping for the
destination of the new kernel image isn't created; without it, copying
the kernel image would cause an error.

>
> If I read correctly, if kaslr_early_init returns a random offset, you
> disable mmu and then call setup_vm which will recreate early page tables
> anyway.

Yes, we can exploit the setup_vm implementation to create the page
table for the destination of the new kernel image.

>
> >
> > /*
> > @@ -489,7 +524,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> > uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
> >
> > va_pa_offset = kernel_virt_addr - load_pa;
> > - pfn_base = PFN_DOWN(load_pa);
> > +
> > + /*
> > + * Update pfn_base only if pfn_base is empty. It's avoid to mess up it
> > + * when re-enter this function by KASLR.
> > + */
> > + if (!pfn_base)
> > + pfn_base = PFN_DOWN(load_pa);
> >
> > #ifdef CONFIG_RELOCATABLE
> > /*
> > @@ -513,6 +554,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> > BUG_ON((load_pa % map_size) != 0);
> > BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
> >
> > +#ifdef CONFIG_RANDOMIZE_BASE
> > + /*
> > + * Enter setup_vm twice if there is a legal random destination in KASLR,
> > + * Reach here at second time, Clear page table because PTE entris allow
> > + * writing when it's empty.
> > + */
> > + if (get_kaslr_offset())
> > + clear_page_tables();
> > +#endif
> > +
> > /* Setup early PGD for fixmap */
> > create_pgd_mapping(early_pg_dir, FIXADDR_START,
> > (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> >
>
> Just an idea, maybe worthless, but couldn't we benefit from kexec here ?
> That's quite the same: copy a new kernel from the current kernel in some
> new memory locations and then jump to it. We could pass the computed
> random offset as a very early kernel parameter so that setup_vm would
> only be called once (per kernel).

Actually, I had tried something like you said; with that approach, we
would encounter some difficulties. We need to limit the KASLR
implementation to use local symbols only, including all the functions
which are used in other files, because the kernel is built as PIE, so
global symbols need to be accessed through the GOT. If we want to
access global symbols, we need to do relocation first, but even if we
did relocation first, the content of each GOT entry would be a virtual
address rather than a physical address, which would cause errors while
the MMU is disabled. Maybe we could overcome these problems, but it
seems to me that it would be more difficult.

>
> Alex

2020-04-07 10:55:26

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset

On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>
>
> On 3/24/20 3:30 AM, Zong Li wrote:
> > Entropy is derived from the banner and timer, it is better than nothing
> > but not enough secure, so previous stage may pass entropy via the device
> > tree /chosen/kaslr-seed node.
> >
> > We limit randomization range within 1GB, so we can exploit early page
> > table to map new destination of kernel image. Additionally, the kernel
> > offset need 2M alignment to ensure it's good in PMD page table.
> >
> > We also checks the kernel offset whether it's safe by avoiding to
> > overlaps with dtb, initrd and reserved memory regions.
> >
>
> That maybe changes the way my sv48 patchset will be implemented: I can't
> get user preference (3-level or 4-level) by any means, device-tree or
> kernel parameter.
>
> But I don't see how you could get a random offset without info from the
> device tree anyway (reserved memory regions especially), so maybe I
> could parse dtb for allowing the user to choose. I'll move this
> discussion to the sv48 introduction.

Maybe I'm misunderstanding a little bit here, but I think I do get the
random offset from information obtained by parsing the dtb.

>
> > Signed-off-by: Zong Li <[email protected]>
> > ---
> > arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
> > arch/riscv/mm/init.c | 2 +-
> > 2 files changed, 273 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> > index 281b5fcca5c8..9ec2b608eb7f 100644
> > --- a/arch/riscv/kernel/kaslr.c
> > +++ b/arch/riscv/kernel/kaslr.c
> > @@ -11,23 +11,293 @@
> > #include <asm/cacheflush.h>
> >
> > extern char _start[], _end[];
> > +extern void *dtb_early_va;
> > +extern phys_addr_t dtb_early_pa;
> > extern void secondary_random_target(void);
> > extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> >
> > uintptr_t secondary_next_target __initdata;
> > static uintptr_t kaslr_offset __initdata;
> >
> > +static const __init u32 *get_reg_address(int root_cells,
> > + const u32 *value, u64 *result)
> > +{
> > + int cell;
> > + *result = 0;
> > +
> > + for (cell = root_cells; cell > 0; --cell)
> > + *result = (*result << 32) + fdt32_to_cpu(*value++);
> > +
> > + return value;
> > +}
> > +
> > +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
> > + int *size_cell)
> > +{
> > + int node = fdt_path_offset(dtb_early_va, path);
> > + fdt64_t *prop;
> > +
> > + if (node < 0)
> > + return -EINVAL;
> > +
> > + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
> > + if (!prop)
> > + return -EINVAL;
> > + *addr_cell = fdt32_to_cpu(*prop);
> > +
> > + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
> > + if (!prop)
> > + return -EINVAL;
> > + *size_cell = fdt32_to_cpu(*prop);
> > +
> > + return node;
> > +}
> > +
> > +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
> > + uintptr_t *mem_size)
> > +{
> > + int node, root, addr_cells, size_cells;
> > + u64 base, size;
> > +
> > + /* Get root node's address cells and size cells. */
> > + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
> > + if (root < 0)
> > + return;
> > +
> > + /* Get memory base address and size. */
> > + fdt_for_each_subnode(node, dtb_early_va, root) {
> > + const char *dev_type;
> > + const u32 *reg;
> > +
> > + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
> > + if (!dev_type)
> > + continue;
> > +
> > + if (!strcmp(dev_type, "memory")) {
> > + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> > + if (!reg)
> > + return;
> > +
> > + reg = get_reg_address(addr_cells, reg, &base);
> > + reg = get_reg_address(size_cells, reg, &size);
> > +
> > + *mem_start = base;
> > + *mem_size = size;
> > +
> > + break;
> > + }
> > + }
> > +}
> > +
> > +/* Return a default seed if there is no HW generator. */
> > +static u64 kaslr_default_seed = ULL(-1);
> > +static __init u64 kaslr_get_seed(void)
> > +{
> > + int node, len;
> > + fdt64_t *prop;
> > + u64 ret;
> > +
> > + node = fdt_path_offset(dtb_early_va, "/chosen");
> > + if (node < 0)
> > + return kaslr_default_seed++;
> > +
> > + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
> > + if (!prop || len != sizeof(u64))
> > + return kaslr_default_seed++;
> > +
> > + ret = fdt64_to_cpu(*prop);
> > +
> > + /* Re-write to zero for checking whether get seed at second time */
> > + *prop = 0;
> > +
> > + return ret;
> > +}
> > +
> > +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
> > + uintptr_t e2)
> > +{
> > + return e1 >= s2 && e2 >= s1;
> > +}
>
> Inline this function or use a macro maybe.

Yes, sure. Thanks.

>
> > +
> > +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
> > + uintptr_t end_addr)
> > +{
> > + int node, rsv_mem, addr_cells, size_cells;
> > +
> > + /* Get the reserved-memory node. */
> > + rsv_mem = get_node_addr_size_cells("/reserved-memory",
> > + &addr_cells,
> > + &size_cells);
> > + if (rsv_mem < 0)
> > + return false;
> > +
> > + /* Get memory base address and size. */
> > + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
> > + uint64_t base, size;
> > + const uint32_t *reg;
> > +
> > + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> > + if (!reg)
> > + return 0;
> > +
> > + reg = get_reg_address(addr_cells, reg, &base);
> > + reg = get_reg_address(size_cells, reg, &size);
> > +
> > + if (is_overlap(start_addr, end_addr, base, base + size))
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +
> > +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
> > +{
> > + int node;
> > + uintptr_t initrd_start, initrd_end;
> > + fdt64_t *prop;
> > +
> > + node = fdt_path_offset(dtb_early_va, "/chosen");
> > + if (node < 0)
> > + return false;
> > +
> > + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
> > + if (!prop)
> > + return false;
> > +
> > + initrd_start = fdt64_to_cpu(*prop);
> > +
> > + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
> > + if (!prop)
> > + return false;
> > +
> > + initrd_end = fdt64_to_cpu(*prop);
> > +
> > + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
> > +}
> > +
> > +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
> > +{
> > + uintptr_t dtb_start = dtb_early_pa;
> > + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
> > +
> > + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
> > +}
> > +
> > +static __init bool has_regions_overlapping(uintptr_t start_addr,
> > + uintptr_t end_addr)
> > +{
> > + if (is_overlap_dtb(start_addr, end_addr))
> > + return true;
> > +
> > + if (is_overlap_initrd(start_addr, end_addr))
> > + return true;
> > +
> > + if (is_overlap_reserved_mem(start_addr, end_addr))
> > + return true;
> > +
> > + return false;
> > +}
> > +
> > +static inline __init unsigned long get_legal_offset(int random_index,
> > + int max_index,
> > + uintptr_t mem_start,
> > + uintptr_t kernel_size)
> > +{
> > + uintptr_t start_addr, end_addr;
> > + int idx, stop_idx;
> > +
> > + idx = stop_idx = random_index;
> > +
> > + do {
> > + start_addr = mem_start + idx * SZ_2M + kernel_size;
> > + end_addr = start_addr + kernel_size;
> > +
> > + /* Check overlap to other regions. */
> > + if (!has_regions_overlapping(start_addr, end_addr))
> > + return idx * SZ_2M + kernel_size;
> > +
> > + if (idx-- < 0)
> > + idx = max_index;
>
> Isn't the fallback to max_index a security breach ? Because at some
> point, the kernel will be loaded at this specific address.

The max_index is the maximum safe index for the destination of the new
kernel image. Could you explain in more detail here?

>
> > +
> > + } while (idx != stop_idx);
> > +
> > + return 0;
> > +}
> > +
> > +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
> > +{
> > + size_t i;
> > + uintptr_t *ptr = (uintptr_t *) area;
> > +
> > + for (i = 0; i < size / sizeof(hash); i++) {
> > + /* Rotate by odd number of bits and XOR. */
> > + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
> > + hash ^= ptr[i];
> > + }
> > +
> > + return hash;
> > +}
> > +
> > +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
> > +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> > +{
> > + uintptr_t mem_start = 0, mem_size= 0, random_size;
> > + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
> > + int index;
> > + u64 random = 0;
> > + cycles_t time_base;
> > +
> > + /* Attempt to create a simple but unpredictable starting entropy */
> > + random = rotate_xor(random, linux_banner, strlen(linux_banner));
> > +
> > + /*
> > + * If there is no HW random number generator, use timer to get a random
> > + * number. This is better than nothing but not enough secure.
> > + */
> > + time_base = get_cycles() << 32;
> > + time_base ^= get_cycles();
> > + random = rotate_xor(random, &time_base, sizeof(time_base));
> > +
> > + if (seed)
> > + random = rotate_xor(random, &seed, sizeof(seed));
> > +
> > + kaslr_get_mem_info(&mem_start, &mem_size);
> > + if (!mem_size)
> > + return 0;
> > +
> > + if (mem_start < MEM_RESERVE_START) {
> > + mem_size -= MEM_RESERVE_START - mem_start;
> > + mem_start = MEM_RESERVE_START;
> > + }
> > +
> > + /*
> > + * Limit randomization range within 1G, so we can exploit
> > + * early_pmd/early_pte during early page table phase.
> > + */
> > + random_size = min_t(u64,
> > + mem_size - (kernel_size_align * 2),
> > + SZ_1G - (kernel_size_align * 2));
>
> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
> PGDIR_SIZE macro here.

OK, change it in the next version. Thanks.

>
> > +
> > + /* The index of 2M block in whole avaliable region */
> > + index = random % (random_size / SZ_2M);
> > +
> > + return get_legal_offset(index, random_size / SZ_2M,
> > + mem_start, kernel_size_align);
> > +}
> > +
> > uintptr_t __init kaslr_early_init(void)
> > {
> > + u64 seed;
> > uintptr_t dest_start, dest_end;
> > uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> >
> > /* Get zero value at second time to avoid doing randomization again. */
> > - if (kaslr_offset)
> > + seed = kaslr_get_seed();
> > + if (!seed)
> > return 0;
> >
> > /* Get the random number for kaslr offset. */
> > - kaslr_offset = 0x10000000;
> > + kaslr_offset = get_random_offset(seed, kernel_size);
> >
> > /* Update kernel_virt_addr for get_kaslr_offset. */
> > kernel_virt_addr += kaslr_offset;
> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> > index 2f5b25f02b6c..34c6ecf2c599 100644
> > --- a/arch/riscv/mm/init.c
> > +++ b/arch/riscv/mm/init.c
> > @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
> > }
> > #endif /* CONFIG_BLK_DEV_INITRD */
> >
> > -static phys_addr_t dtb_early_pa __initdata;
> > +phys_addr_t dtb_early_pa __initdata;
> >
> > void __init setup_bootmem(void)
> > {
> >
>
> Alex

2020-04-07 11:19:40

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 6/8] riscv/kaslr: clear the original kernel image

On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>
> On 3/24/20 3:30 AM, Zong Li wrote:
> > After completing final page table, we can clear original kernel image
> > and remove executable permission.
> >
> > Signed-off-by: Zong Li <[email protected]>
> > ---
> > arch/riscv/include/asm/kaslr.h | 12 ++++++++++++
> > arch/riscv/kernel/kaslr.c | 12 ++++++++++++
> > arch/riscv/mm/init.c | 6 ++++++
> > 3 files changed, 30 insertions(+)
> > create mode 100644 arch/riscv/include/asm/kaslr.h
> >
> > diff --git a/arch/riscv/include/asm/kaslr.h b/arch/riscv/include/asm/kaslr.h
> > new file mode 100644
> > index 000000000000..b165fe71dd4a
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/kaslr.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (C) 2020 SiFive
> > + * Copyright (C) 2020 Zong Li <[email protected]>
> > + */
> > +
> > +#ifndef _ASM_RISCV_KASLR_H
> > +#define _ASM_RISCV_KASLR_H
> > +
> > +void __init kaslr_late_init(void);
> > +
> > +#endif /* _ASM_RISCV_KASLR_H */
> > diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> > index 59001d6fdfc3..0bd30831c455 100644
> > --- a/arch/riscv/kernel/kaslr.c
> > +++ b/arch/riscv/kernel/kaslr.c
> > @@ -356,6 +356,18 @@ static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> > return get_legal_offset(random, kernel_size_align);
> > }
> >
> > +void __init kaslr_late_init(void)
> > +{
> > + uintptr_t kernel_size;
> > +
> > + /* Clear original kernel image. */
> > + if (kaslr_offset) {
> > + kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>
> kernel_size = (uintptr_t) _end - (uintptr_t) _start + 1;

OK, change it in the next version. Thanks.

>
> > + memset((void *)PAGE_OFFSET, 0, kernel_size);
>
> I have been thinking again about our discussion regarding PAGE_OFFSET:
> PAGE_OFFSET actually points to the address where the kernel was loaded,
> not the beginning of memory, that's a bit weird.
>
> Just saying that here, because it took me a few seconds to remember that
> and understand what you were doing here.

In the non-KASLR case, we load the kernel to the address PAGE_OFFSET
points to, so we clear the old kernel image through PAGE_OFFSET here.
Certainly, we could use a symbol to record the start address of the
old kernel image instead of PAGE_OFFSET here. I don't see other
architectures changing PAGE_OFFSET after copying the kernel to the new
location in KASLR. If you think PAGE_OFFSET needs to be changed, we
need to find another way so that the page table can still create the
mappings for the whole memory and the memblock/buddy system can still
see the whole memory after the kernel moves.

>
> > + set_memory_nx(PAGE_OFFSET, kaslr_offset >> PAGE_SHIFT);
>
> Again, I certainly missed something but when do you use old kernel
> mappings ?

We use the old kernel mappings while KASLR calculates the random
offset; at that moment, the kernel is still running at the old kernel
location.

>
> > + }
> > +}
> > +
> > uintptr_t __init kaslr_early_init(void)
> > {
> > u64 seed;
> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> > index 34c6ecf2c599..08e2ce170533 100644
> > --- a/arch/riscv/mm/init.c
> > +++ b/arch/riscv/mm/init.c
> > @@ -15,6 +15,7 @@
> > #include <linux/set_memory.h>
> > #ifdef CONFIG_RELOCATABLE
> > #include <linux/elf.h>
> > +#include <asm/kaslr.h>
> > #endif
> >
> > #include <asm/fixmap.h>
> > @@ -649,6 +650,11 @@ static void __init setup_vm_final(void)
> > /* Move to swapper page table */
> > csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> > local_flush_tlb_all();
> > +
> > +#ifdef CONFIG_RANDOMIZE_BASE
> > + /* Clear orignial kernel image and set the right permission. */
> > + kaslr_late_init();
> > +#endif
> > }
> >
> > void free_initmem(void)
> >
>
> Alex

2020-04-09 05:53:23

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset



On 4/7/20 6:53 AM, Zong Li wrote:
> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>>
>>
>> On 3/24/20 3:30 AM, Zong Li wrote:
>>> Entropy is derived from the banner and timer, it is better than nothing
>>> but not enough secure, so previous stage may pass entropy via the device
>>> tree /chosen/kaslr-seed node.
>>>
>>> We limit randomization range within 1GB, so we can exploit early page
>>> table to map new destination of kernel image. Additionally, the kernel
>>> offset need 2M alignment to ensure it's good in PMD page table.
>>>
>>> We also checks the kernel offset whether it's safe by avoiding to
>>> overlaps with dtb, initrd and reserved memory regions.
>>>
>>
>> That maybe changes the way my sv48 patchset will be implemented: I can't
>> get user preference (3-level or 4-level) by any means, device-tree or
>> kernel parameter.
>>
>> But I don't see how you could get a random offset without info from the
>> device tree anyway (reserved memory regions especially), so maybe I
>> could parse dtb for allowing the user to choose. I'll move this
>> discussion to the sv48 introduction.
>
> Maybe I'm a little bit misunderstanding here, but I think I got the
> random offset through some information by parsing dtb.
>

I was just saying that I may use the dtb too in the sv48 patchset to
make it possible for users to choose sv39 even if sv48 is supported by
the hardware (which is not the case in my current patchset).

>>
>>> Signed-off-by: Zong Li <[email protected]>
>>> ---
>>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
>>> arch/riscv/mm/init.c | 2 +-
>>> 2 files changed, 273 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
>>> index 281b5fcca5c8..9ec2b608eb7f 100644
>>> --- a/arch/riscv/kernel/kaslr.c
>>> +++ b/arch/riscv/kernel/kaslr.c
>>> @@ -11,23 +11,293 @@
>>> #include <asm/cacheflush.h>
>>>
>>> extern char _start[], _end[];
>>> +extern void *dtb_early_va;
>>> +extern phys_addr_t dtb_early_pa;
>>> extern void secondary_random_target(void);
>>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
>>>
>>> uintptr_t secondary_next_target __initdata;
>>> static uintptr_t kaslr_offset __initdata;
>>>
>>> +static const __init u32 *get_reg_address(int root_cells,
>>> + const u32 *value, u64 *result)
>>> +{
>>> + int cell;
>>> + *result = 0;
>>> +
>>> + for (cell = root_cells; cell > 0; --cell)
>>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
>>> +
>>> + return value;
>>> +}
>>> +
>>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
>>> + int *size_cell)
>>> +{
>>> + int node = fdt_path_offset(dtb_early_va, path);
>>> + fdt64_t *prop;
>>> +
>>> + if (node < 0)
>>> + return -EINVAL;
>>> +
>>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
>>> + if (!prop)
>>> + return -EINVAL;
>>> + *addr_cell = fdt32_to_cpu(*prop);
>>> +
>>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
>>> + if (!prop)
>>> + return -EINVAL;
>>> + *size_cell = fdt32_to_cpu(*prop);
>>> +
>>> + return node;
>>> +}
>>> +
>>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
>>> + uintptr_t *mem_size)
>>> +{
>>> + int node, root, addr_cells, size_cells;
>>> + u64 base, size;
>>> +
>>> + /* Get root node's address cells and size cells. */
>>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
>>> + if (root < 0)
>>> + return;
>>> +
>>> + /* Get memory base address and size. */
>>> + fdt_for_each_subnode(node, dtb_early_va, root) {
>>> + const char *dev_type;
>>> + const u32 *reg;
>>> +
>>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
>>> + if (!dev_type)
>>> + continue;
>>> +
>>> + if (!strcmp(dev_type, "memory")) {
>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>> + if (!reg)
>>> + return;
>>> +
>>> + reg = get_reg_address(addr_cells, reg, &base);
>>> + reg = get_reg_address(size_cells, reg, &size);
>>> +
>>> + *mem_start = base;
>>> + *mem_size = size;
>>> +
>>> + break;
>>> + }
>>> + }
>>> +}
>>> +
>>> +/* Return a default seed if there is no HW generator. */
>>> +static u64 kaslr_default_seed = ULL(-1);
>>> +static __init u64 kaslr_get_seed(void)
>>> +{
>>> + int node, len;
>>> + fdt64_t *prop;
>>> + u64 ret;
>>> +
>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>> + if (node < 0)
>>> + return kaslr_default_seed++;
>>> +
>>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
>>> + if (!prop || len != sizeof(u64))
>>> + return kaslr_default_seed++;
>>> +
>>> + ret = fdt64_to_cpu(*prop);
>>> +
>>> + /* Re-write to zero for checking whether get seed at second time */
>>> + *prop = 0;
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
>>> + uintptr_t e2)
>>> +{
>>> + return e1 >= s2 && e2 >= s1;
>>> +}
>>
>> Inline this function or use a macro maybe.
>
> Yes, sure. Thanks.
>
>>
>>> +
>>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
>>> + uintptr_t end_addr)
>>> +{
>>> + int node, rsv_mem, addr_cells, size_cells;
>>> +
>>> + /* Get the reserved-memory node. */
>>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
>>> + &addr_cells,
>>> + &size_cells);
>>> + if (rsv_mem < 0)
>>> + return false;
>>> +
>>> + /* Get memory base address and size. */
>>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
>>> + uint64_t base, size;
>>> + const uint32_t *reg;
>>> +
>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>> + if (!reg)
>>> + return 0;
>>> +
>>> + reg = get_reg_address(addr_cells, reg, &base);
>>> + reg = get_reg_address(size_cells, reg, &size);
>>> +
>>> + if (is_overlap(start_addr, end_addr, base, base + size))
>>> + return true;
>>> + }
>>> +
>>> + return false;
>>> +}
>>> +
>>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
>>> +{
>>> + int node;
>>> + uintptr_t initrd_start, initrd_end;
>>> + fdt64_t *prop;
>>> +
>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>> + if (node < 0)
>>> + return false;
>>> +
>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
>>> + if (!prop)
>>> + return false;
>>> +
>>> + initrd_start = fdt64_to_cpu(*prop);
>>> +
>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
>>> + if (!prop)
>>> + return false;
>>> +
>>> + initrd_end = fdt64_to_cpu(*prop);
>>> +
>>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
>>> +}
>>> +
>>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
>>> +{
>>> + uintptr_t dtb_start = dtb_early_pa;
>>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
>>> +
>>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
>>> +}
>>> +
>>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
>>> + uintptr_t end_addr)
>>> +{
>>> + if (is_overlap_dtb(start_addr, end_addr))
>>> + return true;
>>> +
>>> + if (is_overlap_initrd(start_addr, end_addr))
>>> + return true;
>>> +
>>> + if (is_overlap_reserved_mem(start_addr, end_addr))
>>> + return true;
>>> +
>>> + return false;
>>> +}
>>> +
>>> +static inline __init unsigned long get_legal_offset(int random_index,
>>> + int max_index,
>>> + uintptr_t mem_start,
>>> + uintptr_t kernel_size)
>>> +{
>>> + uintptr_t start_addr, end_addr;
>>> + int idx, stop_idx;
>>> +
>>> + idx = stop_idx = random_index;
>>> +
>>> + do {
>>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
>>> + end_addr = start_addr + kernel_size;
>>> +
>>> + /* Check overlap to other regions. */
>>> + if (!has_regions_overlapping(start_addr, end_addr))
>>> + return idx * SZ_2M + kernel_size;
>>> +
>>> + if (idx-- < 0)
>>> + idx = max_index;
>>
>> Isn't the fallback to max_index a security breach ? Because at some
>> point, the kernel will be loaded at this specific address.
>
> The max_index is the maximum safe index for destination of new kernel
> image. Could you give more explain here?
>

But max_index is not random at all. I really don't know if that's a
problem, I just found it intriguing that the kernel could be loaded at
some specific location. Would it be more secure, instead of picking
max_index as a fallback when reaching 0, to pick another random number
between random_index and max_index?

Alex

>>
>>> +
>>> + } while (idx != stop_idx);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
>>> +{
>>> + size_t i;
>>> + uintptr_t *ptr = (uintptr_t *) area;
>>> +
>>> + for (i = 0; i < size / sizeof(hash); i++) {
>>> + /* Rotate by odd number of bits and XOR. */
>>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
>>> + hash ^= ptr[i];
>>> + }
>>> +
>>> + return hash;
>>> +}
>>> +
>>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
>>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
>>> +{
>>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
>>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
>>> + int index;
>>> + u64 random = 0;
>>> + cycles_t time_base;
>>> +
>>> + /* Attempt to create a simple but unpredictable starting entropy */
>>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
>>> +
>>> + /*
>>> + * If there is no HW random number generator, use timer to get a random
>>> + * number. This is better than nothing but not enough secure.
>>> + */
>>> + time_base = get_cycles() << 32;
>>> + time_base ^= get_cycles();
>>> + random = rotate_xor(random, &time_base, sizeof(time_base));
>>> +
>>> + if (seed)
>>> + random = rotate_xor(random, &seed, sizeof(seed));
>>> +
>>> + kaslr_get_mem_info(&mem_start, &mem_size);
>>> + if (!mem_size)
>>> + return 0;
>>> +
>>> + if (mem_start < MEM_RESERVE_START) {
>>> + mem_size -= MEM_RESERVE_START - mem_start;
>>> + mem_start = MEM_RESERVE_START;
>>> + }
>>> +
>>> + /*
>>> + * Limit randomization range within 1G, so we can exploit
>>> + * early_pmd/early_pte during early page table phase.
>>> + */
>>> + random_size = min_t(u64,
>>> + mem_size - (kernel_size_align * 2),
>>> + SZ_1G - (kernel_size_align * 2));
>>
>> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
>> PGDIR_SIZE macro here.
>
> OK, change it in the next version. Thanks.
>
>>
>>> +
>>> + /* The index of 2M block in whole avaliable region */
>>> + index = random % (random_size / SZ_2M);
>>> +
>>> + return get_legal_offset(index, random_size / SZ_2M,
>>> + mem_start, kernel_size_align);
>>> +}
>>> +
>>> uintptr_t __init kaslr_early_init(void)
>>> {
>>> + u64 seed;
>>> uintptr_t dest_start, dest_end;
>>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>>>
>>> /* Get zero value at second time to avoid doing randomization again. */
>>> - if (kaslr_offset)
>>> + seed = kaslr_get_seed();
>>> + if (!seed)
>>> return 0;
>>>
>>> /* Get the random number for kaslr offset. */
>>> - kaslr_offset = 0x10000000;
>>> + kaslr_offset = get_random_offset(seed, kernel_size);
>>>
>>> /* Update kernel_virt_addr for get_kaslr_offset. */
>>> kernel_virt_addr += kaslr_offset;
>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>> index 2f5b25f02b6c..34c6ecf2c599 100644
>>> --- a/arch/riscv/mm/init.c
>>> +++ b/arch/riscv/mm/init.c
>>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
>>> }
>>> #endif /* CONFIG_BLK_DEV_INITRD */
>>>
>>> -static phys_addr_t dtb_early_pa __initdata;
>>> +phys_addr_t dtb_early_pa __initdata;
>>>
>>> void __init setup_bootmem(void)
>>> {
>>>
>>
>> Alex

2020-04-09 05:54:03

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 3/8] riscv/kaslr: support KASLR infrastructure

Hi Zong,

On 4/7/20 6:34 AM, Zong Li wrote:
> On Tue, Apr 7, 2020 at 1:10 PM Alex Ghiti <[email protected]> wrote:
>>
>>
>>
>> On 3/24/20 3:30 AM, Zong Li wrote:
>>> This patch support KASLR implementation. It copies kernel image to a
>>> proper and random place, and make all harts go to the new destination.
>>>
>>> After KASLR initialization, secondary harts go to the new destination
>>> to wait their stack pointer to be setup by main hart, main hart goes to
>>> re-create the early page table and doing relocation by going back to
>>> setup_vm again.
>>>
>>> We separate the randomization process from this patch, so the kernel
>>> offset was not randomized yet, it just hardcode a meanless number here.
>>>
>>> Signed-off-by: Zong Li <[email protected]>
>>> ---
>>> arch/riscv/Kconfig | 15 +++++++++++
>>> arch/riscv/kernel/Makefile | 2 ++
>>> arch/riscv/kernel/head.S | 39 +++++++++++++++++++++++++++
>>> arch/riscv/kernel/kaslr.c | 55 ++++++++++++++++++++++++++++++++++++++
>>> arch/riscv/mm/init.c | 53 +++++++++++++++++++++++++++++++++++-
>>> 5 files changed, 163 insertions(+), 1 deletion(-)
>>> create mode 100644 arch/riscv/kernel/kaslr.c
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index aea03ac470c8..8f566b40ea1e 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -183,6 +183,21 @@ config RELOCATABLE
>>> relocation pass at runtime even if the kernel is loaded at the
>>> same address it was linked at.
>>>
>>> +config RANDOMIZE_BASE
>>> + bool "Randomize the address of the kernel image"
>>> + depends on MMU
>>> + select MODULE_SECTIONS if MODULES
>>> + select RELOCATABLE
>>> + help
>>> + Randomizes the virtual address at which the kernel image is
>>> + loaded, as a security feature that deters exploit attempts
>>> + relying on knowledge of the location of kernel internals.
>>> +
>>> + It is the job of previous stage to provide entropy, by passing a
>>> + random u64 value in /chosen/kaslr-seed at kernel entry.
>>> +
>>> + If unsure, say N.
>>> +
>>> source "arch/riscv/Kconfig.socs"
>>>
>>> menu "Platform type"
>>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
>>> index d189bd3d8501..8f62732b1135 100644
>>> --- a/arch/riscv/kernel/Makefile
>>> +++ b/arch/riscv/kernel/Makefile
>>> @@ -45,4 +45,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
>>> obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o
>>> obj-$(CONFIG_RISCV_SBI) += sbi.o
>>>
>>> +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
>>> +
>>> clean:
>>> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
>>> index cb4a6e2d3793..5191e528d813 100644
>>> --- a/arch/riscv/kernel/head.S
>>> +++ b/arch/riscv/kernel/head.S
>>> @@ -113,9 +113,12 @@ clear_bss_done:
>>> la a2, boot_cpu_hartid
>>> REG_S a0, (a2)
>>>
>>> +.align 2
>>
>> Why do you need this new alignment constraint ?
>
> We need to ensure the target of the trap vector is 4-byte aligned.

Ok thanks.

>
>>
>>> +early_page_table:
>>> /* Initialize page tables and relocate to virtual addresses */
>>> la sp, init_thread_union + THREAD_SIZE
>>> mv a0, s1
>>> +
>>
>> Newline ?
>
> Remove it in the next version. Thanks.
>
>>
>>> call setup_vm
>>> #ifdef CONFIG_MMU
>>> la a0, early_pg_dir
>>> @@ -127,6 +130,29 @@ clear_bss_done:
>>> sw zero, TASK_TI_CPU(tp)
>>> la sp, init_thread_union + THREAD_SIZE
>>>
>>> +#ifdef CONFIG_RANDOMIZE_BASE
>>> + /* KASRL initialization. Try to get a random kernel offset. */
>>> + call kaslr_early_init
>>> +
>>> + /* If return value equals to zero, we don't need to randomize kernel */
>>> + beqz a0, 1f
>>> +
>>> + la a1, early_page_table
>>> + add a1, a1, a0
>>> + la a0, va_pa_offset
>>> + REG_L a0, 0(a0)
>>> + sub a1, a1, a0
>>> + mv a0, s1
>>> +
>>> + /*
>>> + * Go to new kernel image destination, and disable MMU to re-create
>>> + * early page table and do relocation.
>>> + */
>>> + csrw CSR_TVEC, a1
>>> + csrw CSR_SATP, x0
>>> +1:
>>> +#endif
>>> +
>>> #ifdef CONFIG_KASAN
>>> call kasan_early_init
>>> #endif
>>> @@ -194,6 +220,19 @@ relocate:
>>> la a3, .Lsecondary_park
>>> csrw CSR_TVEC, a3
>>>
>>> +#ifdef CONFIG_RANDOMIZE_BASE
>>> + /*
>>> + * Wait winning hart to tell secondary harts where is the new
>>> + * destination to go.
>>> + */
>>> +.Lwait_for_next_target:
>>> + la a3, secondary_next_target
>>> + REG_L a3, 0(a3)
>>> + beqz a3, .Lwait_for_next_target
>>> + jr a3
>>> +.global secondary_random_target
>>> +secondary_random_target:
>>> +#endif
>>> slli a3, a0, LGREG
>>> la a1, __cpu_up_stack_pointer
>>> la a2, __cpu_up_task_pointer
>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
>>> new file mode 100644
>>> index 000000000000..281b5fcca5c8
>>> --- /dev/null
>>> +++ b/arch/riscv/kernel/kaslr.c
>>> @@ -0,0 +1,55 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +/*
>>> + * Copyright (C) 2020 SiFive
>>> + * Copyright (C) 2020 Zong Li <[email protected]>
>>> + */
>>> +
>>> +#include <linux/libfdt.h>
>>> +#include <linux/timex.h>
>>> +#include <linux/random.h>
>>> +#include <linux/set_memory.h>
>>> +#include <asm/cacheflush.h>
>>> +
>>> +extern char _start[], _end[];
>>> +extern void secondary_random_target(void);
>>> +extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
>>> +
>>> +uintptr_t secondary_next_target __initdata;
>>> +static uintptr_t kaslr_offset __initdata;
>>> +
>>> +uintptr_t __init kaslr_early_init(void)
>>> +{
>>> + uintptr_t dest_start, dest_end;
>>> + uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>>> +
>>> + /* Get zero value at second time to avoid doing randomization again. */
>>> + if (kaslr_offset)
>>> + return 0;
>>> +
>>> + /* Get the random number for kaslr offset. */
>>> + kaslr_offset = 0x10000000;
>>
>> For clarity, you could use a macro or something like that for this constant.
>
> This is a temporary assignment for this patch. The kaslr_offset is not
> randomized yet, so it is just a hard-coded, meaningless number here.
> Eventually, kaslr_offset should be assigned a random number; that is
> what the next patch does ('riscv/kaslr: randomize the kernel image
> offset').

Yes, I just don't like random constants, even temporary. I was just
thinking of something like that:

#define KASLR_RANDOM_OFFSET 0x10000000

But it's up to you of course.

>
>>
>>> +
>>> + /* Update kernel_virt_addr for get_kaslr_offset. */
>>> + kernel_virt_addr += kaslr_offset;
>>
>> This could be done after you test if kaslr_offset is null below.
>
> Yes, make sense, change it in the next version patch. Thanks.
>
>>
>>> +
>>> + if (kaslr_offset) {
>>> + dest_start = (uintptr_t) (PAGE_OFFSET + kaslr_offset);
>>> + dest_end = dest_start + kernel_size;
>>
>> dest_end = dest_start + kernel_size - 1;
>
> OK, Thanks.
>
>>
>>> +
>>> + /* Create the new destination mapping for kernel image. */
>>> + kaslr_create_page_table(dest_start, dest_end);
>>> +
>>> + /* Copy kernel image from orignial location. */
>>> + memcpy((void *)dest_start, (void *)_start, kernel_size);
>>> + flush_icache_range(dest_start, dest_end); > +
>>> + /* Make secondary harts jump to new kernel image destination. */
>>> + WRITE_ONCE(secondary_next_target,
>>> + __pa_symbol(secondary_random_target) + kaslr_offset);
>>
>> Don't you need to sync secondary harts icache with main hart dcache here ?
>
> It seems to me that secondary harts could see secondary_next_target
> immediately through cache coherence, just like __cpu_up_stack_pointer
> and __cpu_up_task_pointer. Could you give more detail here on why we
> need to write secondary_next_target back to memory? Thanks.

I may be mistaken here, but flush_icache_range uses sfence.i instruction
that guarantees that following instruction fetches will see previously
written data. But this works for the local hart: what if other harts
already have a match in their instruction cache ? The ISA spec states:

"FENCE.I does not ensure that other RISC-V harts’ instruction fetches
will observe the local hart’s stores in a multiprocessor system. To make
a store to instruction memory visible to all RISC-V harts, the writing
hart has to execute a data FENCE before requesting that all remote
RISC-V harts execute a FENCE.I"

>
>>
>>> + } else {
>>> + WRITE_ONCE(secondary_next_target,
>>> + __pa_symbol(secondary_random_target));
>>> + }
>>> +
>>> + return kaslr_offset;
>>> +}
>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>> index 51e263c04fa2..2f5b25f02b6c 100644
>>> --- a/arch/riscv/mm/init.c
>>> +++ b/arch/riscv/mm/init.c
>>> @@ -413,6 +413,41 @@ static void __init clear_pgd(pgd_t *pgdp)
>>> }
>>> }
>>> }
>>> +
>>> +static void __init clear_page_tables(void)
>>> +{
>>> + clear_pgd(early_pg_dir);
>>> + clear_pgd(trampoline_pg_dir);
>>> +}
>>
>> early page table and trampoline page table consist in one page per
>> level, I confirm that a memset to 0 is easier here.
>
> yes, I'll change it. Thanks.
>
>>
>>> +
>>> +void __init kaslr_create_page_table(uintptr_t start, uintptr_t end)
>>> +{
>>> + pgd_next_t *nextp;
>>> + phys_addr_t next_phys;
>>> + uintptr_t pgd_index, va;
>>> + phys_addr_t pa = __pa(PAGE_OFFSET) + get_kaslr_offset();
>>> + uintptr_t map_size =
>>> + best_map_size(__pa(PAGE_OFFSET), MAX_EARLY_MAPPING_SIZE);
>>> +
>>> + /* Expolit early_pg_dir and early_pmd during using early page table. */
>>> + for (va = start; va < end; va += map_size, pa += map_size) {
>>> + pgd_index = pgd_index(va);
>>> +
>>> + if (pgd_val(early_pg_dir[pgd_index]) == 0) {
>>> + next_phys = alloc_pgd_next(va);
>>> + early_pg_dir[pgd_index] =
>>> + pfn_pgd(PFN_DOWN(next_phys), PAGE_TABLE);
>>> + nextp = (pgd_next_t *)(__va(next_phys));
>>> + memset(nextp, 0, PAGE_SIZE);
>>> + } else {
>>> + next_phys = PFN_PHYS(_pgd_pfn(early_pg_dir[pgd_index]));
>>> + nextp = (pgd_next_t *)(__va(next_phys));
>>> + }
>>> +
>>> + create_pgd_next_mapping(nextp, va, pa, map_size,
>>> + PAGE_KERNEL_EXEC);
>>> + }
>>> +}
>>> #endif
>>
>> I may be missing something here: I don't see where the mappings for the
>> new kernel you create here are used between here and setup_vm ?
>
> Early page tables only create the mappings for the original kernel image
> (i.e., from vmlinux_start to vmlinux_end), so the mapping of the
> destination of the new kernel image isn't created, which would cause an
> error when copying the kernel image.

Oh right, setup_vm creates a mapping that only covers the kernel and not
a zone that spans an entire PGD: then you have to create mapping for the
destination.

Thanks,

>
>>
>> If I read correctly, if kaslr_early_init returns a random offset, you
>> disable mmu and then call setup_vm which will recreate early page tables
>> anyway.
>
> Yes, we can exploit the setup_vm implementation to create the page
> table for the destination of the new kernel image.
>
>>
>>>
>>> /*
>>> @@ -489,7 +524,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>>> uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
>>>
>>> va_pa_offset = kernel_virt_addr - load_pa;
>>> - pfn_base = PFN_DOWN(load_pa);
>>> +
>>> + /*
>>> + * Update pfn_base only if pfn_base is empty. It's avoid to mess up it
>>> + * when re-enter this function by KASLR.
>>> + */
>>> + if (!pfn_base)
>>> + pfn_base = PFN_DOWN(load_pa);
>>>
>>> #ifdef CONFIG_RELOCATABLE
>>> /*
>>> @@ -513,6 +554,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>>> BUG_ON((load_pa % map_size) != 0);
>>> BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
>>>
>>> +#ifdef CONFIG_RANDOMIZE_BASE
>>> + /*
>>> + * Enter setup_vm twice if there is a legal random destination in KASLR,
>>> + * Reach here at second time, Clear page table because PTE entris allow
>>> + * writing when it's empty.
>>> + */
>>> + if (get_kaslr_offset())
>>> + clear_page_tables();
>>> +#endif
>>> +
>>> /* Setup early PGD for fixmap */
>>> create_pgd_mapping(early_pg_dir, FIXADDR_START,
>>> (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>>>
>>
>> Just an idea, maybe worthless, but couldn't we benefit from kexec here ?
>> That's quite the same: copy a new kernel from the current kernel in some
>> new memory locations and then jump to it. We could pass the computed
>> random offset as a very early kernel parameter so that setup_vm would
>> only be called once (per kernel).
>
> Actually, I had tried something like you said; with that approach, we
> would encounter some difficulties. We need to limit the KASLR
> implementation to using local symbols only, including all the functions
> used in other files, because the kernel is built as PIE, so global
> symbols need to be accessed through the GOT. If we want to access global
> symbols, we need to do relocation first, but even if we did relocation
> first, the content of each GOT entry would be a virtual address, not a
> physical address, which would cause errors while the MMU is disabled.

The first time we enter setup_vm, relocations are done based on current
kernel_virt_addr so the GOT is already filled with virtual addresses
when MMU is disabled and it works since init.c is compiled with -fno-pie
option. So I'm not sure it would work differently from what you already do.

Alex

> Maybe we
> could overcome these problems, but it seems to me that it would be
> more difficult.
>
>>
>> Alex

2020-04-09 05:56:03

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 6/8] riscv/kaslr: clear the original kernel image



On 4/7/20 7:18 AM, Zong Li wrote:
> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>>
>> On 3/24/20 3:30 AM, Zong Li wrote:
>>> After completing final page table, we can clear original kernel image
>>> and remove executable permission.
>>>
>>> Signed-off-by: Zong Li <[email protected]>
>>> ---
>>> arch/riscv/include/asm/kaslr.h | 12 ++++++++++++
>>> arch/riscv/kernel/kaslr.c | 12 ++++++++++++
>>> arch/riscv/mm/init.c | 6 ++++++
>>> 3 files changed, 30 insertions(+)
>>> create mode 100644 arch/riscv/include/asm/kaslr.h
>>>
>>> diff --git a/arch/riscv/include/asm/kaslr.h b/arch/riscv/include/asm/kaslr.h
>>> new file mode 100644
>>> index 000000000000..b165fe71dd4a
>>> --- /dev/null
>>> +++ b/arch/riscv/include/asm/kaslr.h
>>> @@ -0,0 +1,12 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +/*
>>> + * Copyright (C) 2020 SiFive
>>> + * Copyright (C) 2020 Zong Li <[email protected]>
>>> + */
>>> +
>>> +#ifndef _ASM_RISCV_KASLR_H
>>> +#define _ASM_RISCV_KASLR_H
>>> +
>>> +void __init kaslr_late_init(void);
>>> +
>>> +#endif /* _ASM_RISCV_KASLR_H */
>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
>>> index 59001d6fdfc3..0bd30831c455 100644
>>> --- a/arch/riscv/kernel/kaslr.c
>>> +++ b/arch/riscv/kernel/kaslr.c
>>> @@ -356,6 +356,18 @@ static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
>>> return get_legal_offset(random, kernel_size_align);
>>> }
>>>
>>> +void __init kaslr_late_init(void)
>>> +{
>>> + uintptr_t kernel_size;
>>> +
>>> + /* Clear original kernel image. */
>>> + if (kaslr_offset) {
>>> + kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>>
>> kernel_size = (uintptr_t) _end - (uintptr_t) _start + 1;
>
> OK, change it in the next version. Thanks.
>
>>
>>> + memset((void *)PAGE_OFFSET, 0, kernel_size);
>>
>> I have been thinking again about our discussion regarding PAGE_OFFSET:
>> PAGE_OFFSET actually points to the address where the kernel was loaded,
>> not the beginning of memory, that's a bit weird.
>>
>> Just saying that here, because it took me a few seconds to remember that
>> and understand what you were doing here.
>
> In the non-KASLR case, we load the kernel at the address PAGE_OFFSET
> points to, so we clear the old kernel image through PAGE_OFFSET here.
> Certainly, we could use a symbol to record the start address of the old
> kernel image instead of PAGE_OFFSET here. I don't see other
> architectures changing PAGE_OFFSET after copying the kernel to the new
> location in kaslr. If you think PAGE_OFFSET needs to be changed, we need
> to provide another way so that the page table can create the mappings
> for the whole memory and the memblock/buddy system can see the whole
> memory after the kernel moves.
> >>
>>> + set_memory_nx(PAGE_OFFSET, kaslr_offset >> PAGE_SHIFT);
>>
>> Again, I certainly missed something but when do you use old kernel
>> mappings ?
>
> We use the old kernel mappings while KASLR calculates the random offset;
> at that moment, the kernel is still running at the old kernel location.

Yes but haven't you already cleared the page table from the mappings for
the old kernel in clear_page_tables called in setup_vm of the new kernel ?

Alex

>
>>
>>> + }
>>> +}
>>> +
>>> uintptr_t __init kaslr_early_init(void)
>>> {
>>> u64 seed;
>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>> index 34c6ecf2c599..08e2ce170533 100644
>>> --- a/arch/riscv/mm/init.c
>>> +++ b/arch/riscv/mm/init.c
>>> @@ -15,6 +15,7 @@
>>> #include <linux/set_memory.h>
>>> #ifdef CONFIG_RELOCATABLE
>>> #include <linux/elf.h>
>>> +#include <asm/kaslr.h>
>>> #endif
>>>
>>> #include <asm/fixmap.h>
>>> @@ -649,6 +650,11 @@ static void __init setup_vm_final(void)
>>> /* Move to swapper page table */
>>> csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
>>> local_flush_tlb_all();
>>> +
>>> +#ifdef CONFIG_RANDOMIZE_BASE
>>> + /* Clear orignial kernel image and set the right permission. */
>>> + kaslr_late_init();
>>> +#endif
>>> }
>>>
>>> void free_initmem(void)
>>>
>>
>> Alex

2020-04-09 08:16:32

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 6/8] riscv/kaslr: clear the original kernel image

On 4/9/20 1:53 AM, Alex Ghiti wrote:
>
>
> On 4/7/20 7:18 AM, Zong Li wrote:
>> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>>>
>>> On 3/24/20 3:30 AM, Zong Li wrote:
>>>> After completing final page table, we can clear original kernel image
>>>> and remove executable permission.
>>>>
>>>> Signed-off-by: Zong Li <[email protected]>
>>>> ---
>>>>    arch/riscv/include/asm/kaslr.h | 12 ++++++++++++
>>>>    arch/riscv/kernel/kaslr.c      | 12 ++++++++++++
>>>>    arch/riscv/mm/init.c           |  6 ++++++
>>>>    3 files changed, 30 insertions(+)
>>>>    create mode 100644 arch/riscv/include/asm/kaslr.h
>>>>
>>>> diff --git a/arch/riscv/include/asm/kaslr.h
>>>> b/arch/riscv/include/asm/kaslr.h
>>>> new file mode 100644
>>>> index 000000000000..b165fe71dd4a
>>>> --- /dev/null
>>>> +++ b/arch/riscv/include/asm/kaslr.h
>>>> @@ -0,0 +1,12 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>> +/*
>>>> + * Copyright (C) 2020 SiFive
>>>> + * Copyright (C) 2020 Zong Li <[email protected]>
>>>> + */
>>>> +
>>>> +#ifndef _ASM_RISCV_KASLR_H
>>>> +#define _ASM_RISCV_KASLR_H
>>>> +
>>>> +void __init kaslr_late_init(void);
>>>> +
>>>> +#endif /* _ASM_RISCV_KASLR_H */
>>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
>>>> index 59001d6fdfc3..0bd30831c455 100644
>>>> --- a/arch/riscv/kernel/kaslr.c
>>>> +++ b/arch/riscv/kernel/kaslr.c
>>>> @@ -356,6 +356,18 @@ static __init uintptr_t get_random_offset(u64
>>>> seed, uintptr_t kernel_size)
>>>>        return get_legal_offset(random, kernel_size_align);
>>>>    }
>>>>
>>>> +void __init kaslr_late_init(void)
>>>> +{
>>>> +     uintptr_t kernel_size;
>>>> +
>>>> +     /* Clear original kernel image. */
>>>> +     if (kaslr_offset) {
>>>> +             kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>>>
>>> kernel_size = (uintptr_t) _end - (uintptr_t) _start + 1;
>>
>> OK, change it in the next version. Thanks.
>>
>>>
>>>> +             memset((void *)PAGE_OFFSET, 0, kernel_size);
>>>
>>> I have been thinking again about our discussion regarding PAGE_OFFSET:
>>> PAGE_OFFSET actually points to the address where the kernel was loaded,
>>> not the beginning of memory, that's a bit weird.
>>>
>>> Just saying that here, because it took me a few seconds to remember that
>>> and understand what you were doing here.
>>
>> In non-kaslr case, we load the kernel to PAGE_OFFSET which points to,
>> so we clear the old kernel image through PAGE_OFFSET here. Certainly,
>> we could use a symbol to record the start address of the old kernel
>> image instead of PAGE_OFFSET here. I don't see other architectures
>> changing PAGE_OFFSET after copying the kernel to the new location in
>> kaslr. If you think the PAGE_OFFSET needs to be changed, we need to
>> give another way to make the page table could create the mappings for
>> the whole memory and memblock/buddy system could see the whole memory
>> after the kernel moves.
>>  >>
>>>> +             set_memory_nx(PAGE_OFFSET, kaslr_offset >> PAGE_SHIFT);
>>>
>>> Again, I certainly missed something but when do you use old kernel
>>> mappings ?
>>
>> We use old kernel mappings when KASLR calculates the random offset, at
>> that moment, kernel is running on old kernel location.
>
> Yes but haven't you already cleared the page table from the mappings for
> the old kernel in clear_page_tables called in setup_vm of the new kernel ?
>
> Alex
>

I had a doubt so I read set_memory_nx implementation again and I was
indeed completely wrong: set_memory_nx tackles init_mm and then
swapper_pg_dir. So you just remove executability for the old kernel
zone, that's ok I think.

Sorry for the noise !

Alex

>>
>>>
>>>> +     }
>>>> +}
>>>> +
>>>>    uintptr_t __init kaslr_early_init(void)
>>>>    {
>>>>        u64 seed;
>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>>> index 34c6ecf2c599..08e2ce170533 100644
>>>> --- a/arch/riscv/mm/init.c
>>>> +++ b/arch/riscv/mm/init.c
>>>> @@ -15,6 +15,7 @@
>>>>    #include <linux/set_memory.h>
>>>>    #ifdef CONFIG_RELOCATABLE
>>>>    #include <linux/elf.h>
>>>> +#include <asm/kaslr.h>
>>>>    #endif
>>>>
>>>>    #include <asm/fixmap.h>
>>>> @@ -649,6 +650,11 @@ static void __init setup_vm_final(void)
>>>>        /* Move to swapper page table */
>>>>        csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) |
>>>> SATP_MODE);
>>>>        local_flush_tlb_all();
>>>> +
>>>> +#ifdef CONFIG_RANDOMIZE_BASE
>>>> +     /* Clear orignial kernel image and set the right permission. */
>>>> +     kaslr_late_init();
>>>> +#endif
>>>>    }
>>>>
>>>>    void free_initmem(void)
>>>>
>>>
>>> Alex

2020-04-09 10:33:40

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset

On Thu, Apr 9, 2020 at 1:51 PM Alex Ghiti <[email protected]> wrote:
>
>
>
> On 4/7/20 6:53 AM, Zong Li wrote:
> > On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
> >>
> >>
> >> On 3/24/20 3:30 AM, Zong Li wrote:
> >>> Entropy is derived from the banner and timer, it is better than nothing
> >>> but not enough secure, so previous stage may pass entropy via the device
> >>> tree /chosen/kaslr-seed node.
> >>>
> >>> We limit randomization range within 1GB, so we can exploit early page
> >>> table to map new destination of kernel image. Additionally, the kernel
> >>> offset need 2M alignment to ensure it's good in PMD page table.
> >>>
> >>> We also checks the kernel offset whether it's safe by avoiding to
> >>> overlaps with dtb, initrd and reserved memory regions.
> >>>
> >>
> >> That maybe changes the way my sv48 patchset will be implemented: I can't
> >> get user preference (3-level or 4-level) by any means, device-tree or
> >> kernel parameter.
> >>
> >> But I don't see how you could get a random offset without info from the
> >> device tree anyway (reserved memory regions especially), so maybe I
> >> could parse dtb for allowing the user to choose. I'll move this
> >> discussion to the sv48 introduction.
> >
> > Maybe I'm misunderstanding a little here, but I think I get the
> > random offset from information obtained by parsing the dtb.
> >
>
> I was just saying that I may use the dtb too in sv48 patchset to make it
> possible for users to choose sv39 even if sv48 is supported by hardware
> (which is not the case in my current patchset).
>
> >>
> >>> Signed-off-by: Zong Li <[email protected]>
> >>> ---
> >>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
> >>> arch/riscv/mm/init.c | 2 +-
> >>> 2 files changed, 273 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> >>> index 281b5fcca5c8..9ec2b608eb7f 100644
> >>> --- a/arch/riscv/kernel/kaslr.c
> >>> +++ b/arch/riscv/kernel/kaslr.c
> >>> @@ -11,23 +11,293 @@
> >>> #include <asm/cacheflush.h>
> >>>
> >>> extern char _start[], _end[];
> >>> +extern void *dtb_early_va;
> >>> +extern phys_addr_t dtb_early_pa;
> >>> extern void secondary_random_target(void);
> >>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> >>>
> >>> uintptr_t secondary_next_target __initdata;
> >>> static uintptr_t kaslr_offset __initdata;
> >>>
> >>> +static const __init u32 *get_reg_address(int root_cells,
> >>> + const u32 *value, u64 *result)
> >>> +{
> >>> + int cell;
> >>> + *result = 0;
> >>> +
> >>> + for (cell = root_cells; cell > 0; --cell)
> >>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
> >>> +
> >>> + return value;
> >>> +}
> >>> +
> >>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
> >>> + int *size_cell)
> >>> +{
> >>> + int node = fdt_path_offset(dtb_early_va, path);
> >>> + fdt64_t *prop;
> >>> +
> >>> + if (node < 0)
> >>> + return -EINVAL;
> >>> +
> >>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
> >>> + if (!prop)
> >>> + return -EINVAL;
> >>> + *addr_cell = fdt32_to_cpu(*prop);
> >>> +
> >>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
> >>> + if (!prop)
> >>> + return -EINVAL;
> >>> + *size_cell = fdt32_to_cpu(*prop);
> >>> +
> >>> + return node;
> >>> +}
> >>> +
> >>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
> >>> + uintptr_t *mem_size)
> >>> +{
> >>> + int node, root, addr_cells, size_cells;
> >>> + u64 base, size;
> >>> +
> >>> + /* Get root node's address cells and size cells. */
> >>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
> >>> + if (root < 0)
> >>> + return;
> >>> +
> >>> + /* Get memory base address and size. */
> >>> + fdt_for_each_subnode(node, dtb_early_va, root) {
> >>> + const char *dev_type;
> >>> + const u32 *reg;
> >>> +
> >>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
> >>> + if (!dev_type)
> >>> + continue;
> >>> +
> >>> + if (!strcmp(dev_type, "memory")) {
> >>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>> + if (!reg)
> >>> + return;
> >>> +
> >>> + reg = get_reg_address(addr_cells, reg, &base);
> >>> + reg = get_reg_address(size_cells, reg, &size);
> >>> +
> >>> + *mem_start = base;
> >>> + *mem_size = size;
> >>> +
> >>> + break;
> >>> + }
> >>> + }
> >>> +}
> >>> +
> >>> +/* Return a default seed if there is no HW generator. */
> >>> +static u64 kaslr_default_seed = ULL(-1);
> >>> +static __init u64 kaslr_get_seed(void)
> >>> +{
> >>> + int node, len;
> >>> + fdt64_t *prop;
> >>> + u64 ret;
> >>> +
> >>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>> + if (node < 0)
> >>> + return kaslr_default_seed++;
> >>> +
> >>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
> >>> + if (!prop || len != sizeof(u64))
> >>> + return kaslr_default_seed++;
> >>> +
> >>> + ret = fdt64_to_cpu(*prop);
> >>> +
> >>> + /* Re-write to zero for checking whether get seed at second time */
> >>> + *prop = 0;
> >>> +
> >>> + return ret;
> >>> +}
> >>> +
> >>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
> >>> + uintptr_t e2)
> >>> +{
> >>> + return e1 >= s2 && e2 >= s1;
> >>> +}
> >>
> >> Inline this function or use a macro maybe.
> >
> > Yes, sure. Thanks.
> >
> >>
> >>> +
> >>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
> >>> + uintptr_t end_addr)
> >>> +{
> >>> + int node, rsv_mem, addr_cells, size_cells;
> >>> +
> >>> + /* Get the reserved-memory node. */
> >>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
> >>> + &addr_cells,
> >>> + &size_cells);
> >>> + if (rsv_mem < 0)
> >>> + return false;
> >>> +
> >>> + /* Get memory base address and size. */
> >>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
> >>> + uint64_t base, size;
> >>> + const uint32_t *reg;
> >>> +
> >>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>> + if (!reg)
> >>> + return 0;
> >>> +
> >>> + reg = get_reg_address(addr_cells, reg, &base);
> >>> + reg = get_reg_address(size_cells, reg, &size);
> >>> +
> >>> + if (is_overlap(start_addr, end_addr, base, base + size))
> >>> + return true;
> >>> + }
> >>> +
> >>> + return false;
> >>> +}
> >>> +
> >>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
> >>> +{
> >>> + int node;
> >>> + uintptr_t initrd_start, initrd_end;
> >>> + fdt64_t *prop;
> >>> +
> >>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>> + if (node < 0)
> >>> + return false;
> >>> +
> >>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
> >>> + if (!prop)
> >>> + return false;
> >>> +
> >>> + initrd_start = fdt64_to_cpu(*prop);
> >>> +
> >>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
> >>> + if (!prop)
> >>> + return false;
> >>> +
> >>> + initrd_end = fdt64_to_cpu(*prop);
> >>> +
> >>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
> >>> +}
> >>> +
> >>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
> >>> +{
> >>> + uintptr_t dtb_start = dtb_early_pa;
> >>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
> >>> +
> >>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
> >>> +}
> >>> +
> >>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
> >>> + uintptr_t end_addr)
> >>> +{
> >>> + if (is_overlap_dtb(start_addr, end_addr))
> >>> + return true;
> >>> +
> >>> + if (is_overlap_initrd(start_addr, end_addr))
> >>> + return true;
> >>> +
> >>> + if (is_overlap_reserved_mem(start_addr, end_addr))
> >>> + return true;
> >>> +
> >>> + return false;
> >>> +}
> >>> +
> >>> +static inline __init unsigned long get_legal_offset(int random_index,
> >>> + int max_index,
> >>> + uintptr_t mem_start,
> >>> + uintptr_t kernel_size)
> >>> +{
> >>> + uintptr_t start_addr, end_addr;
> >>> + int idx, stop_idx;
> >>> +
> >>> + idx = stop_idx = random_index;
> >>> +
> >>> + do {
> >>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
> >>> + end_addr = start_addr + kernel_size;
> >>> +
> >>> + /* Check overlap to other regions. */
> >>> + if (!has_regions_overlapping(start_addr, end_addr))
> >>> + return idx * SZ_2M + kernel_size;
> >>> +
> >>> + if (idx-- < 0)
> >>> + idx = max_index;
> >>
> >> Isn't the fallback to max_index a security breach ? Because at some
> >> point, the kernel will be loaded at this specific address.
> >
> > The max_index is the maximum safe index for destination of new kernel
> > image. Could you give more explain here?
> >
>
> But max_index is not random at all. I really don't know if that's a
> problem, I just found intriguing the fact the kernel could be loaded at
> some specific location. Would it be more secure, instead of picking
> max_index as fallback when reaching 0, to pick another random number
> between random_index and max_index ?

OK, I get your point. The original idea here is that we get a
random index first, then decrease the index and retry to find a good
place if the current one overlaps with other regions. It is a bit like
a ring buffer: the end of the index traversal is not zero but
random_index - 1. We might consider it continuous, so we don't know
where the end point is, because the start point is random — whether we
stop at zero or at random_index - 1.

Picking another random number is more secure when an overlap occurs,
but I am a little worried that it could take a very long time to retry
many times in the worst case — for example, when there is only one
index that the kernel image could fit in (besides the original
location). At the same time, we don't need to wait for the index to be
decreased to zero, because it seems to me that stopping at zero or at
random_index - 1 is equivalent; so if we decide to re-calculate a new
random number, maybe we could remove the index decrementing here.

>
> Alex
>
> >>
> >>> +
> >>> + } while (idx != stop_idx);
> >>> +
> >>> + return 0;
> >>> +}
> >>> +
> >>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
> >>> +{
> >>> + size_t i;
> >>> + uintptr_t *ptr = (uintptr_t *) area;
> >>> +
> >>> + for (i = 0; i < size / sizeof(hash); i++) {
> >>> + /* Rotate by odd number of bits and XOR. */
> >>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
> >>> + hash ^= ptr[i];
> >>> + }
> >>> +
> >>> + return hash;
> >>> +}
> >>> +
> >>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
> >>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> >>> +{
> >>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
> >>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
> >>> + int index;
> >>> + u64 random = 0;
> >>> + cycles_t time_base;
> >>> +
> >>> + /* Attempt to create a simple but unpredictable starting entropy */
> >>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
> >>> +
> >>> + /*
> >>> + * If there is no HW random number generator, use timer to get a random
> >>> + * number. This is better than nothing but not enough secure.
> >>> + */
> >>> + time_base = get_cycles() << 32;
> >>> + time_base ^= get_cycles();
> >>> + random = rotate_xor(random, &time_base, sizeof(time_base));
> >>> +
> >>> + if (seed)
> >>> + random = rotate_xor(random, &seed, sizeof(seed));
> >>> +
> >>> + kaslr_get_mem_info(&mem_start, &mem_size);
> >>> + if (!mem_size)
> >>> + return 0;
> >>> +
> >>> + if (mem_start < MEM_RESERVE_START) {
> >>> + mem_size -= MEM_RESERVE_START - mem_start;
> >>> + mem_start = MEM_RESERVE_START;
> >>> + }
> >>> +
> >>> + /*
> >>> + * Limit randomization range within 1G, so we can exploit
> >>> + * early_pmd/early_pte during early page table phase.
> >>> + */
> >>> + random_size = min_t(u64,
> >>> + mem_size - (kernel_size_align * 2),
> >>> + SZ_1G - (kernel_size_align * 2));
> >>
> >> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
> >> PGDIR_SIZE macro here.
> >
> > OK, change it in the next version. Thanks.
> >
> >>
> >>> +
> >>> + /* The index of 2M block in whole avaliable region */
> >>> + index = random % (random_size / SZ_2M);
> >>> +
> >>> + return get_legal_offset(index, random_size / SZ_2M,
> >>> + mem_start, kernel_size_align);
> >>> +}
> >>> +
> >>> uintptr_t __init kaslr_early_init(void)
> >>> {
> >>> + u64 seed;
> >>> uintptr_t dest_start, dest_end;
> >>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> >>>
> >>> /* Get zero value at second time to avoid doing randomization again. */
> >>> - if (kaslr_offset)
> >>> + seed = kaslr_get_seed();
> >>> + if (!seed)
> >>> return 0;
> >>>
> >>> /* Get the random number for kaslr offset. */
> >>> - kaslr_offset = 0x10000000;
> >>> + kaslr_offset = get_random_offset(seed, kernel_size);
> >>>
> >>> /* Update kernel_virt_addr for get_kaslr_offset. */
> >>> kernel_virt_addr += kaslr_offset;
> >>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> >>> index 2f5b25f02b6c..34c6ecf2c599 100644
> >>> --- a/arch/riscv/mm/init.c
> >>> +++ b/arch/riscv/mm/init.c
> >>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
> >>> }
> >>> #endif /* CONFIG_BLK_DEV_INITRD */
> >>>
> >>> -static phys_addr_t dtb_early_pa __initdata;
> >>> +phys_addr_t dtb_early_pa __initdata;
> >>>
> >>> void __init setup_bootmem(void)
> >>> {
> >>>
> >>
> >> Alex

2020-04-09 11:09:19

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 3/8] riscv/kaslr: support KASLR infrastructure

On Thu, Apr 9, 2020 at 1:53 PM Alex Ghiti <[email protected]> wrote:
>
> Hi Zong,
>
> On 4/7/20 6:34 AM, Zong Li wrote:
> > On Tue, Apr 7, 2020 at 1:10 PM Alex Ghiti <[email protected]> wrote:
> >>
> >>
> >>
> >> On 3/24/20 3:30 AM, Zong Li wrote:
> >>> This patch support KASLR implementation. It copies kernel image to a
> >>> proper and random place, and make all harts go to the new destination.
> >>>
> >>> After KASLR initialization, secondary harts go to the new destination
> >>> to wait their stack pointer to be setup by main hart, main hart goes to
> >>> re-create the early page table and doing relocation by going back to
> >>> setup_vm again.
> >>>
> >>> We separate the randomization process from this patch, so the kernel
> >>> offset was not randomized yet, it just hardcode a meanless number here.
> >>>
> >>> Signed-off-by: Zong Li <[email protected]>
> >>> ---
> >>> arch/riscv/Kconfig | 15 +++++++++++
> >>> arch/riscv/kernel/Makefile | 2 ++
> >>> arch/riscv/kernel/head.S | 39 +++++++++++++++++++++++++++
> >>> arch/riscv/kernel/kaslr.c | 55 ++++++++++++++++++++++++++++++++++++++
> >>> arch/riscv/mm/init.c | 53 +++++++++++++++++++++++++++++++++++-
> >>> 5 files changed, 163 insertions(+), 1 deletion(-)
> >>> create mode 100644 arch/riscv/kernel/kaslr.c
> >>>
> >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> >>> index aea03ac470c8..8f566b40ea1e 100644
> >>> --- a/arch/riscv/Kconfig
> >>> +++ b/arch/riscv/Kconfig
> >>> @@ -183,6 +183,21 @@ config RELOCATABLE
> >>> relocation pass at runtime even if the kernel is loaded at the
> >>> same address it was linked at.
> >>>
> >>> +config RANDOMIZE_BASE
> >>> + bool "Randomize the address of the kernel image"
> >>> + depends on MMU
> >>> + select MODULE_SECTIONS if MODULES
> >>> + select RELOCATABLE
> >>> + help
> >>> + Randomizes the virtual address at which the kernel image is
> >>> + loaded, as a security feature that deters exploit attempts
> >>> + relying on knowledge of the location of kernel internals.
> >>> +
> >>> + It is the job of previous stage to provide entropy, by passing a
> >>> + random u64 value in /chosen/kaslr-seed at kernel entry.
> >>> +
> >>> + If unsure, say N.
> >>> +
> >>> source "arch/riscv/Kconfig.socs"
> >>>
> >>> menu "Platform type"
> >>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> >>> index d189bd3d8501..8f62732b1135 100644
> >>> --- a/arch/riscv/kernel/Makefile
> >>> +++ b/arch/riscv/kernel/Makefile
> >>> @@ -45,4 +45,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
> >>> obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o
> >>> obj-$(CONFIG_RISCV_SBI) += sbi.o
> >>>
> >>> +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
> >>> +
> >>> clean:
> >>> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> >>> index cb4a6e2d3793..5191e528d813 100644
> >>> --- a/arch/riscv/kernel/head.S
> >>> +++ b/arch/riscv/kernel/head.S
> >>> @@ -113,9 +113,12 @@ clear_bss_done:
> >>> la a2, boot_cpu_hartid
> >>> REG_S a0, (a2)
> >>>
> >>> +.align 2
> >>
> >> Why do you need this new alignment constraint ?
> >
> > We need to ensure the target of the trap vector is 4-byte alignment.
>
> Ok thanks.
>
> >
> >>
> >>> +early_page_table:
> >>> /* Initialize page tables and relocate to virtual addresses */
> >>> la sp, init_thread_union + THREAD_SIZE
> >>> mv a0, s1
> >>> +
> >>
> >> Newline ?
> >
> > Remove it in the next version. Thanks.
> >
> >>
> >>> call setup_vm
> >>> #ifdef CONFIG_MMU
> >>> la a0, early_pg_dir
> >>> @@ -127,6 +130,29 @@ clear_bss_done:
> >>> sw zero, TASK_TI_CPU(tp)
> >>> la sp, init_thread_union + THREAD_SIZE
> >>>
> >>> +#ifdef CONFIG_RANDOMIZE_BASE
> >>> + /* KASRL initialization. Try to get a random kernel offset. */
> >>> + call kaslr_early_init
> >>> +
> >>> + /* If return value equals to zero, we don't need to randomize kernel */
> >>> + beqz a0, 1f
> >>> +
> >>> + la a1, early_page_table
> >>> + add a1, a1, a0
> >>> + la a0, va_pa_offset
> >>> + REG_L a0, 0(a0)
> >>> + sub a1, a1, a0
> >>> + mv a0, s1
> >>> +
> >>> + /*
> >>> + * Go to new kernel image destination, and disable MMU to re-create
> >>> + * early page table and do relocation.
> >>> + */
> >>> + csrw CSR_TVEC, a1
> >>> + csrw CSR_SATP, x0
> >>> +1:
> >>> +#endif
> >>> +
> >>> #ifdef CONFIG_KASAN
> >>> call kasan_early_init
> >>> #endif
> >>> @@ -194,6 +220,19 @@ relocate:
> >>> la a3, .Lsecondary_park
> >>> csrw CSR_TVEC, a3
> >>>
> >>> +#ifdef CONFIG_RANDOMIZE_BASE
> >>> + /*
> >>> + * Wait winning hart to tell secondary harts where is the new
> >>> + * destination to go.
> >>> + */
> >>> +.Lwait_for_next_target:
> >>> + la a3, secondary_next_target
> >>> + REG_L a3, 0(a3)
> >>> + beqz a3, .Lwait_for_next_target
> >>> + jr a3
> >>> +.global secondary_random_target
> >>> +secondary_random_target:
> >>> +#endif
> >>> slli a3, a0, LGREG
> >>> la a1, __cpu_up_stack_pointer
> >>> la a2, __cpu_up_task_pointer
> >>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> >>> new file mode 100644
> >>> index 000000000000..281b5fcca5c8
> >>> --- /dev/null
> >>> +++ b/arch/riscv/kernel/kaslr.c
> >>> @@ -0,0 +1,55 @@
> >>> +// SPDX-License-Identifier: GPL-2.0-only
> >>> +/*
> >>> + * Copyright (C) 2020 SiFive
> >>> + * Copyright (C) 2020 Zong Li <[email protected]>
> >>> + */
> >>> +
> >>> +#include <linux/libfdt.h>
> >>> +#include <linux/timex.h>
> >>> +#include <linux/random.h>
> >>> +#include <linux/set_memory.h>
> >>> +#include <asm/cacheflush.h>
> >>> +
> >>> +extern char _start[], _end[];
> >>> +extern void secondary_random_target(void);
> >>> +extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> >>> +
> >>> +uintptr_t secondary_next_target __initdata;
> >>> +static uintptr_t kaslr_offset __initdata;
> >>> +
> >>> +uintptr_t __init kaslr_early_init(void)
> >>> +{
> >>> + uintptr_t dest_start, dest_end;
> >>> + uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> >>> +
> >>> + /* Get zero value at second time to avoid doing randomization again. */
> >>> + if (kaslr_offset)
> >>> + return 0;
> >>> +
> >>> + /* Get the random number for kaslr offset. */
> >>> + kaslr_offset = 0x10000000;
> >>
> >> For clarity, you could use a macro or something like that for this constant.
> >
> > This is a temporary assignment for this patch. The kaslr_offset is not
> > randomized yet, so it is just a hardcode meaningless number here.
> > Eventually, kalser_offset should be assigned a random number, that is
> > what the next patch does ('riscv/kaslr: randomize the kernel image
> > offset').
>
> Yes, I just don't like random constants, even temporary. I was just
> thinking of something like that:
>
> #define KASLR_RANDOM_OFFSET 0x10000000
>
> But it's up to you of course.

It is OK with me to change it. Or maybe I could add some comments here
instead — does that sound good to you?


>
> >
> >>
> >>> +
> >>> + /* Update kernel_virt_addr for get_kaslr_offset. */
> >>> + kernel_virt_addr += kaslr_offset;
> >>
> >> This could be done after you test if kaslr_offset is null below.
> >
> > Yes, make sense, change it in the next version patch. Thanks.
> >
> >>
> >>> +
> >>> + if (kaslr_offset) {
> >>> + dest_start = (uintptr_t) (PAGE_OFFSET + kaslr_offset);
> >>> + dest_end = dest_start + kernel_size;
> >>
> >> dest_end = dest_start + kernel_size - 1;
> >
> > OK, Thanks.
> >
> >>
> >>> +
> >>> + /* Create the new destination mapping for kernel image. */
> >>> + kaslr_create_page_table(dest_start, dest_end);
> >>> +
> >>> + /* Copy kernel image from orignial location. */
> >>> + memcpy((void *)dest_start, (void *)_start, kernel_size);
> >>> + flush_icache_range(dest_start, dest_end); > +
> >>> + /* Make secondary harts jump to new kernel image destination. */
> >>> + WRITE_ONCE(secondary_next_target,
> >>> + __pa_symbol(secondary_random_target) + kaslr_offset);
> >>
> >> Don't you need to sync secondary harts icache with main hart dcache here ?
> >
> > It seems to me that secondary harts could see secondary_next_target
> > immediately through cache coherence, just like __cpu_up_stack_pointer
> > and __cpu_up_task_pointer. Could you give more detail here or why we
> > need to write secondary_next_target back to memory? Thanks.
>
> I may be mistaken here, but flush_icache_range uses sfence.i instruction
> that guarantees that following instruction fetches will see previously
> written data. But this works for the local hart: what if other harts
> already have a match in their instruction cache ? The ISA spec states:
>
> "FENCE.I does not ensure that other RISC-V harts’ instruction fetches
> will observe the local hart’s stores in a multiprocessor system. To make
> a store to instruction memory visible to all RISC-V harts, the writing
> hart has to execute a data FENCE before requesting that all remote
> RISC-V harts execute a FENCE.I"
>
> >
> >>
> >>> + } else {
> >>> + WRITE_ONCE(secondary_next_target,
> >>> + __pa_symbol(secondary_random_target));
> >>> + }
> >>> +
> >>> + return kaslr_offset;
> >>> +}
> >>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> >>> index 51e263c04fa2..2f5b25f02b6c 100644
> >>> --- a/arch/riscv/mm/init.c
> >>> +++ b/arch/riscv/mm/init.c
> >>> @@ -413,6 +413,41 @@ static void __init clear_pgd(pgd_t *pgdp)
> >>> }
> >>> }
> >>> }
> >>> +
> >>> +static void __init clear_page_tables(void)
> >>> +{
> >>> + clear_pgd(early_pg_dir);
> >>> + clear_pgd(trampoline_pg_dir);
> >>> +}
> >>
> >> early page table and trampoline page table consist in one page per
> >> level, I confirm that a memset to 0 is easier here.
> >
> > yes, I'll change it. Thanks.
> >
> >>
> >>> +
> >>> +void __init kaslr_create_page_table(uintptr_t start, uintptr_t end)
> >>> +{
> >>> + pgd_next_t *nextp;
> >>> + phys_addr_t next_phys;
> >>> + uintptr_t pgd_index, va;
> >>> + phys_addr_t pa = __pa(PAGE_OFFSET) + get_kaslr_offset();
> >>> + uintptr_t map_size =
> >>> + best_map_size(__pa(PAGE_OFFSET), MAX_EARLY_MAPPING_SIZE);
> >>> +
> >>> + /* Expolit early_pg_dir and early_pmd during using early page table. */
> >>> + for (va = start; va < end; va += map_size, pa += map_size) {
> >>> + pgd_index = pgd_index(va);
> >>> +
> >>> + if (pgd_val(early_pg_dir[pgd_index]) == 0) {
> >>> + next_phys = alloc_pgd_next(va);
> >>> + early_pg_dir[pgd_index] =
> >>> + pfn_pgd(PFN_DOWN(next_phys), PAGE_TABLE);
> >>> + nextp = (pgd_next_t *)(__va(next_phys));
> >>> + memset(nextp, 0, PAGE_SIZE);
> >>> + } else {
> >>> + next_phys = PFN_PHYS(_pgd_pfn(early_pg_dir[pgd_index]));
> >>> + nextp = (pgd_next_t *)(__va(next_phys));
> >>> + }
> >>> +
> >>> + create_pgd_next_mapping(nextp, va, pa, map_size,
> >>> + PAGE_KERNEL_EXEC);
> >>> + }
> >>> +}
> >>> #endif
> >>
> >> I may be missing something here: I don't see where the mappings for the
> >> new kernel you create here are used between here and setup_vm ?
> >
> > Early page tables only create the mappings for original kernel image
> > (i.e., from vmlinux_start to vmlinux_end), so the mapping of the
> > destination of the new kernel image isn't be created, it would cause
> > error when copying kernel image.
>
> Oh right, setup_vm creates a mapping that only covers the kernel and not
> a zone that spans an entire PGD: then you have to create mapping for the
> destination.
>
> Thanks,
>
> >
> >>
> >> If I read correctly, if kaslr_early_init returns a random offset, you
> >> disable mmu and then call setup_vm which will recreate early page tables
> >> anyway.
> >
> > Yes, we can exploit the setup_vm implementation to create the page
> > table for the destination of the new kernel image.
> >
> >>
> >>>
> >>> /*
> >>> @@ -489,7 +524,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >>> uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
> >>>
> >>> va_pa_offset = kernel_virt_addr - load_pa;
> >>> - pfn_base = PFN_DOWN(load_pa);
> >>> +
> >>> + /*
> >>> + * Update pfn_base only if pfn_base is empty. It's avoid to mess up it
> >>> + * when re-enter this function by KASLR.
> >>> + */
> >>> + if (!pfn_base)
> >>> + pfn_base = PFN_DOWN(load_pa);
> >>>
> >>> #ifdef CONFIG_RELOCATABLE
> >>> /*
> >>> @@ -513,6 +554,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >>> BUG_ON((load_pa % map_size) != 0);
> >>> BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
> >>>
> >>> +#ifdef CONFIG_RANDOMIZE_BASE
> >>> + /*
> >>> + * Enter setup_vm twice if there is a legal random destination in KASLR,
> >>> + * Reach here at second time, Clear page table because PTE entris allow
> >>> + * writing when it's empty.
> >>> + */
> >>> + if (get_kaslr_offset())
> >>> + clear_page_tables();
> >>> +#endif
> >>> +
> >>> /* Setup early PGD for fixmap */
> >>> create_pgd_mapping(early_pg_dir, FIXADDR_START,
> >>> (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> >>>
> >>
> >> Just an idea, maybe worthless, but couldn't we benefit from kexec here ?
> >> That's quite the same: copy a new kernel from the current kernel in some
> >> new memory locations and then jump to it. We could pass the computed
> >> random offset as a very early kernel parameter so that setup_vm would
> >> only be called once (per kernel).
> >
> > Actually, I had tried something like you said, if that, we would
> > encounter some difficulties. We need to limit kaslr implementation to
> > use local symbols only, including all the functions which were used in
> > other files, because the kernel is built as pie, so the global symbols
> > need to be accessed by got table. If we want to access global symbols,
> > we need to do relocation first, but even if we did relocation first,
> > the content of each got table entry would be virtual address not
> > physical address, it would cause error during MMU disabled.
>
> The first time we enter setup_vm, relocations are done based on current
> kernel_virt_addr so the GOT is already filled with virtual addresses
> when MMU is disabled and it works since init.c is compiled with -fno-pie
> option. So I'm not sure it would work differently from what you already do.
>

Yes, we would need to apply -fno-pie to kaslr.c, but that might not be
enough, because we also leverage other code in the Linux source, such
as libfdt to parse the dtb, so -fno-pie would need to be applied to
all of those files as well. Moreover, the relocation function would
need to be extracted from setup_vm, because relocation and the
calculation of the random offset have to be finished before setup_vm.
So in the end, it is easier for me to do this with the MMU enabled.

> Alex
>
> > Maybe we
> > could overcome these problems, but it seems to me that it would be
> > more difficult.
> >
> >>
> >> Alex

2020-04-10 16:00:20

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset

Hi Zong,

On 4/9/20 6:31 AM, Zong Li wrote:
> On Thu, Apr 9, 2020 at 1:51 PM Alex Ghiti <[email protected]> wrote:
>>
>>
>>
>> On 4/7/20 6:53 AM, Zong Li wrote:
>>> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>>>>
>>>>
>>>> On 3/24/20 3:30 AM, Zong Li wrote:
>>>>> Entropy is derived from the banner and timer, it is better than nothing
>>>>> but not enough secure, so previous stage may pass entropy via the device
>>>>> tree /chosen/kaslr-seed node.
>>>>>
>>>>> We limit randomization range within 1GB, so we can exploit early page
>>>>> table to map new destination of kernel image. Additionally, the kernel
>>>>> offset need 2M alignment to ensure it's good in PMD page table.
>>>>>
>>>>> We also checks the kernel offset whether it's safe by avoiding to
>>>>> overlaps with dtb, initrd and reserved memory regions.
>>>>>
>>>>
>>>> That maybe changes the way my sv48 patchset will be implemented: I can't
>>>> get user preference (3-level or 4-level) by any means, device-tree or
>>>> kernel parameter.
>>>>
>>>> But I don't see how you could get a random offset without info from the
>>>> device tree anyway (reserved memory regions especially), so maybe I
>>>> could parse dtb for allowing the user to choose. I'll move this
>>>> discussion to the sv48 introduction.
>>>
>>> Maybe I'm a little bit misunderstanding here, but I think I got the
>>> random offset through some information by parsing dtb.
>>>
>>
>> I was just saying that I may use the dtb too in sv48 patchset to make it
>> possible for users to choose sv39 even if sv48 is supported by hardware
>> (which is not the case in my current patchset).
>>
>>>>
>>>>> Signed-off-by: Zong Li <[email protected]>
>>>>> ---
>>>>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
>>>>> arch/riscv/mm/init.c | 2 +-
>>>>> 2 files changed, 273 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
>>>>> index 281b5fcca5c8..9ec2b608eb7f 100644
>>>>> --- a/arch/riscv/kernel/kaslr.c
>>>>> +++ b/arch/riscv/kernel/kaslr.c
>>>>> @@ -11,23 +11,293 @@
>>>>> #include <asm/cacheflush.h>
>>>>>
>>>>> extern char _start[], _end[];
>>>>> +extern void *dtb_early_va;
>>>>> +extern phys_addr_t dtb_early_pa;
>>>>> extern void secondary_random_target(void);
>>>>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
>>>>>
>>>>> uintptr_t secondary_next_target __initdata;
>>>>> static uintptr_t kaslr_offset __initdata;
>>>>>
>>>>> +static const __init u32 *get_reg_address(int root_cells,
>>>>> + const u32 *value, u64 *result)
>>>>> +{
>>>>> + int cell;
>>>>> + *result = 0;
>>>>> +
>>>>> + for (cell = root_cells; cell > 0; --cell)
>>>>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
>>>>> +
>>>>> + return value;
>>>>> +}
>>>>> +
>>>>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
>>>>> + int *size_cell)
>>>>> +{
>>>>> + int node = fdt_path_offset(dtb_early_va, path);
>>>>> + fdt64_t *prop;
>>>>> +
>>>>> + if (node < 0)
>>>>> + return -EINVAL;
>>>>> +
>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
>>>>> + if (!prop)
>>>>> + return -EINVAL;
>>>>> + *addr_cell = fdt32_to_cpu(*prop);
>>>>> +
>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
>>>>> + if (!prop)
>>>>> + return -EINVAL;
>>>>> + *size_cell = fdt32_to_cpu(*prop);
>>>>> +
>>>>> + return node;
>>>>> +}
>>>>> +
>>>>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
>>>>> + uintptr_t *mem_size)
>>>>> +{
>>>>> + int node, root, addr_cells, size_cells;
>>>>> + u64 base, size;
>>>>> +
>>>>> + /* Get root node's address cells and size cells. */
>>>>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
>>>>> + if (root < 0)
>>>>> + return;
>>>>> +
>>>>> + /* Get memory base address and size. */
>>>>> + fdt_for_each_subnode(node, dtb_early_va, root) {
>>>>> + const char *dev_type;
>>>>> + const u32 *reg;
>>>>> +
>>>>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
>>>>> + if (!dev_type)
>>>>> + continue;
>>>>> +
>>>>> + if (!strcmp(dev_type, "memory")) {
>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>>>> + if (!reg)
>>>>> + return;
>>>>> +
>>>>> + reg = get_reg_address(addr_cells, reg, &base);
>>>>> + reg = get_reg_address(size_cells, reg, &size);
>>>>> +
>>>>> + *mem_start = base;
>>>>> + *mem_size = size;
>>>>> +
>>>>> + break;
>>>>> + }
>>>>> + }
>>>>> +}
>>>>> +
>>>>> +/* Return a default seed if there is no HW generator. */
>>>>> +static u64 kaslr_default_seed = ULL(-1);
>>>>> +static __init u64 kaslr_get_seed(void)
>>>>> +{
>>>>> + int node, len;
>>>>> + fdt64_t *prop;
>>>>> + u64 ret;
>>>>> +
>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>>>> + if (node < 0)
>>>>> + return kaslr_default_seed++;
>>>>> +
>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
>>>>> + if (!prop || len != sizeof(u64))
>>>>> + return kaslr_default_seed++;
>>>>> +
>>>>> + ret = fdt64_to_cpu(*prop);
>>>>> +
>>>>> + /* Re-write to zero for checking whether get seed at second time */
>>>>> + *prop = 0;
>>>>> +
>>>>> + return ret;
>>>>> +}
>>>>> +
>>>>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
>>>>> + uintptr_t e2)
>>>>> +{
>>>>> + return e1 >= s2 && e2 >= s1;
>>>>> +}
>>>>
>>>> Inline this function or use a macro maybe.
>>>
>>> Yes, sure. Thanks.
>>>
>>>>
>>>>> +
>>>>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
>>>>> + uintptr_t end_addr)
>>>>> +{
>>>>> + int node, rsv_mem, addr_cells, size_cells;
>>>>> +
>>>>> + /* Get the reserved-memory node. */
>>>>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
>>>>> + &addr_cells,
>>>>> + &size_cells);
>>>>> + if (rsv_mem < 0)
>>>>> + return false;
>>>>> +
>>>>> + /* Get memory base address and size. */
>>>>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
>>>>> + uint64_t base, size;
>>>>> + const uint32_t *reg;
>>>>> +
>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>>>> + if (!reg)
>>>>> + return 0;
>>>>> +
>>>>> + reg = get_reg_address(addr_cells, reg, &base);
>>>>> + reg = get_reg_address(size_cells, reg, &size);
>>>>> +
>>>>> + if (is_overlap(start_addr, end_addr, base, base + size))
>>>>> + return true;
>>>>> + }
>>>>> +
>>>>> + return false;
>>>>> +}
>>>>> +
>>>>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
>>>>> +{
>>>>> + int node;
>>>>> + uintptr_t initrd_start, initrd_end;
>>>>> + fdt64_t *prop;
>>>>> +
>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>>>> + if (node < 0)
>>>>> + return false;
>>>>> +
>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
>>>>> + if (!prop)
>>>>> + return false;
>>>>> +
>>>>> + initrd_start = fdt64_to_cpu(*prop);
>>>>> +
>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
>>>>> + if (!prop)
>>>>> + return false;
>>>>> +
>>>>> + initrd_end = fdt64_to_cpu(*prop);
>>>>> +
>>>>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
>>>>> +}
>>>>> +
>>>>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
>>>>> +{
>>>>> + uintptr_t dtb_start = dtb_early_pa;
>>>>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
>>>>> +
>>>>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
>>>>> +}
>>>>> +
>>>>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
>>>>> + uintptr_t end_addr)
>>>>> +{
>>>>> + if (is_overlap_dtb(start_addr, end_addr))
>>>>> + return true;
>>>>> +
>>>>> + if (is_overlap_initrd(start_addr, end_addr))
>>>>> + return true;
>>>>> +
>>>>> + if (is_overlap_reserved_mem(start_addr, end_addr))
>>>>> + return true;
>>>>> +
>>>>> + return false;
>>>>> +}
>>>>> +
>>>>> +static inline __init unsigned long get_legal_offset(int random_index,
>>>>> + int max_index,
>>>>> + uintptr_t mem_start,
>>>>> + uintptr_t kernel_size)
>>>>> +{
>>>>> + uintptr_t start_addr, end_addr;
>>>>> + int idx, stop_idx;
>>>>> +
>>>>> + idx = stop_idx = random_index;
>>>>> +
>>>>> + do {
>>>>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
>>>>> + end_addr = start_addr + kernel_size;
>>>>> +
>>>>> + /* Check overlap to other regions. */
>>>>> + if (!has_regions_overlapping(start_addr, end_addr))
>>>>> + return idx * SZ_2M + kernel_size;
>>>>> +
>>>>> + if (idx-- < 0)
>>>>> + idx = max_index;
>>>>
>>>> Isn't the fallback to max_index a security breach ? Because at some
>>>> point, the kernel will be loaded at this specific address.
>>>
>>> The max_index is the maximum safe index for destination of new kernel
>>> image. Could you give more explain here?
>>>
>>
>> But max_index is not random at all. I really don't know if that's a
>> problem, I just found intriguing the fact the kernel could be loaded at
>> some specific location. Would it be more secure, instead of picking
>> max_index as fallback when reaching 0, to pick another random number
>> between random_index and max_index ?
>
> ok, I can get your point. The original idea here is that we get a
> random index first, then we decrease the index to retry to find a good
> place if there are overlapping with other regions. A bit like the ring
> buffer, the end of index traversing is not zero, but the random_index
> - 1, we might consider it as continuity, so we don't know where is the
> end point because the start point is random, whether we stop at zero
> or random_index - 1.
>
> Pick another random number is more secure when occurring overlapping,
> but I a little bit worry that it would take very long time to retry
> many times in the worst case. for example, there is just only one
> index could fit kernel image in (except for original location). In the
> meantime, we don't need to wait the index being decreased to zero,
> because it seems to me that they are the same to stop at zero or
> random_index - 1, so if we decide to re-calculate a new random number,
> maybe we could remove the index decreasing here.

But you're right that it could take some time before converging to a
"good" index. Maybe we could restrict the index range to indexes that we
know for sure will be good?

Alex

>
>>
>> Alex
>>
>>>>
>>>>> +
>>>>> + } while (idx != stop_idx);
>>>>> +
>>>>> + return 0;
>>>>> +}
>>>>> +
>>>>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
>>>>> +{
>>>>> + size_t i;
>>>>> + uintptr_t *ptr = (uintptr_t *) area;
>>>>> +
>>>>> + for (i = 0; i < size / sizeof(hash); i++) {
>>>>> + /* Rotate by odd number of bits and XOR. */
>>>>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
>>>>> + hash ^= ptr[i];
>>>>> + }
>>>>> +
>>>>> + return hash;
>>>>> +}
>>>>> +
>>>>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
>>>>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
>>>>> +{
>>>>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
>>>>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
>>>>> + int index;
>>>>> + u64 random = 0;
>>>>> + cycles_t time_base;
>>>>> +
>>>>> + /* Attempt to create a simple but unpredictable starting entropy */
>>>>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
>>>>> +
>>>>> + /*
>>>>> + * If there is no HW random number generator, use timer to get a random
>>>>> + * number. This is better than nothing but not enough secure.
>>>>> + */
>>>>> + time_base = get_cycles() << 32;
>>>>> + time_base ^= get_cycles();
>>>>> + random = rotate_xor(random, &time_base, sizeof(time_base));
>>>>> +
>>>>> + if (seed)
>>>>> + random = rotate_xor(random, &seed, sizeof(seed));
>>>>> +
>>>>> + kaslr_get_mem_info(&mem_start, &mem_size);
>>>>> + if (!mem_size)
>>>>> + return 0;
>>>>> +
>>>>> + if (mem_start < MEM_RESERVE_START) {
>>>>> + mem_size -= MEM_RESERVE_START - mem_start;
>>>>> + mem_start = MEM_RESERVE_START;
>>>>> + }
>>>>> +
>>>>> + /*
>>>>> + * Limit randomization range within 1G, so we can exploit
>>>>> + * early_pmd/early_pte during early page table phase.
>>>>> + */
>>>>> + random_size = min_t(u64,
>>>>> + mem_size - (kernel_size_align * 2),
>>>>> + SZ_1G - (kernel_size_align * 2));
>>>>
>>>> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
>>>> PGDIR_SIZE macro here.
>>>
>>> OK, change it in the next version. Thanks.
>>>
>>>>
>>>>> +
>>>>> + /* The index of 2M block in whole avaliable region */
>>>>> + index = random % (random_size / SZ_2M);
>>>>> +
>>>>> + return get_legal_offset(index, random_size / SZ_2M,
>>>>> + mem_start, kernel_size_align);
>>>>> +}
>>>>> +
>>>>> uintptr_t __init kaslr_early_init(void)
>>>>> {
>>>>> + u64 seed;
>>>>> uintptr_t dest_start, dest_end;
>>>>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>>>>>
>>>>> /* Get zero value at second time to avoid doing randomization again. */
>>>>> - if (kaslr_offset)
>>>>> + seed = kaslr_get_seed();
>>>>> + if (!seed)
>>>>> return 0;
>>>>>
>>>>> /* Get the random number for kaslr offset. */
>>>>> - kaslr_offset = 0x10000000;
>>>>> + kaslr_offset = get_random_offset(seed, kernel_size);
>>>>>
>>>>> /* Update kernel_virt_addr for get_kaslr_offset. */
>>>>> kernel_virt_addr += kaslr_offset;
>>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>>>> index 2f5b25f02b6c..34c6ecf2c599 100644
>>>>> --- a/arch/riscv/mm/init.c
>>>>> +++ b/arch/riscv/mm/init.c
>>>>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
>>>>> }
>>>>> #endif /* CONFIG_BLK_DEV_INITRD */
>>>>>
>>>>> -static phys_addr_t dtb_early_pa __initdata;
>>>>> +phys_addr_t dtb_early_pa __initdata;
>>>>>
>>>>> void __init setup_bootmem(void)
>>>>> {
>>>>>
>>>>
>>>> Alex

2020-04-11 08:21:54

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset

On Fri, Apr 10, 2020 at 11:58 PM Alex Ghiti <[email protected]> wrote:
>
> Hi Zong,
>
> On 4/9/20 6:31 AM, Zong Li wrote:
> > On Thu, Apr 9, 2020 at 1:51 PM Alex Ghiti <[email protected]> wrote:
> >>
> >>
> >>
> >> On 4/7/20 6:53 AM, Zong Li wrote:
> >>> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
> >>>>
> >>>>
> >>>> On 3/24/20 3:30 AM, Zong Li wrote:
> >>>>> Entropy is derived from the banner and timer, it is better than nothing
> >>>>> but not enough secure, so previous stage may pass entropy via the device
> >>>>> tree /chosen/kaslr-seed node.
> >>>>>
> >>>>> We limit randomization range within 1GB, so we can exploit early page
> >>>>> table to map new destination of kernel image. Additionally, the kernel
> >>>>> offset need 2M alignment to ensure it's good in PMD page table.
> >>>>>
> >>>>> We also checks the kernel offset whether it's safe by avoiding to
> >>>>> overlaps with dtb, initrd and reserved memory regions.
> >>>>>
> >>>>
> >>>> That maybe changes the way my sv48 patchset will be implemented: I can't
> >>>> get user preference (3-level or 4-level) by any means, device-tree or
> >>>> kernel parameter.
> >>>>
> >>>> But I don't see how you could get a random offset without info from the
> >>>> device tree anyway (reserved memory regions especially), so maybe I
> >>>> could parse dtb for allowing the user to choose. I'll move this
> >>>> discussion to the sv48 introduction.
> >>>
> >>> Maybe I'm a little bit misunderstanding here, but I think I got the
> >>> random offset through some information by parsing dtb.
> >>>
> >>
> >> I was just saying that I may use the dtb too in sv48 patchset to make it
> >> possible for users to choose sv39 even if sv48 is supported by hardware
> >> (which is not the case in my current patchset).
> >>
> >>>>
> >>>>> Signed-off-by: Zong Li <[email protected]>
> >>>>> ---
> >>>>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
> >>>>> arch/riscv/mm/init.c | 2 +-
> >>>>> 2 files changed, 273 insertions(+), 3 deletions(-)
> >>>>>
> >>>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> >>>>> index 281b5fcca5c8..9ec2b608eb7f 100644
> >>>>> --- a/arch/riscv/kernel/kaslr.c
> >>>>> +++ b/arch/riscv/kernel/kaslr.c
> >>>>> @@ -11,23 +11,293 @@
> >>>>> #include <asm/cacheflush.h>
> >>>>>
> >>>>> extern char _start[], _end[];
> >>>>> +extern void *dtb_early_va;
> >>>>> +extern phys_addr_t dtb_early_pa;
> >>>>> extern void secondary_random_target(void);
> >>>>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> >>>>>
> >>>>> uintptr_t secondary_next_target __initdata;
> >>>>> static uintptr_t kaslr_offset __initdata;
> >>>>>
> >>>>> +static const __init u32 *get_reg_address(int root_cells,
> >>>>> + const u32 *value, u64 *result)
> >>>>> +{
> >>>>> + int cell;
> >>>>> + *result = 0;
> >>>>> +
> >>>>> + for (cell = root_cells; cell > 0; --cell)
> >>>>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
> >>>>> +
> >>>>> + return value;
> >>>>> +}
> >>>>> +
> >>>>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
> >>>>> + int *size_cell)
> >>>>> +{
> >>>>> + int node = fdt_path_offset(dtb_early_va, path);
> >>>>> + fdt64_t *prop;
> >>>>> +
> >>>>> + if (node < 0)
> >>>>> + return -EINVAL;
> >>>>> +
> >>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
> >>>>> + if (!prop)
> >>>>> + return -EINVAL;
> >>>>> + *addr_cell = fdt32_to_cpu(*prop);
> >>>>> +
> >>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
> >>>>> + if (!prop)
> >>>>> + return -EINVAL;
> >>>>> + *size_cell = fdt32_to_cpu(*prop);
> >>>>> +
> >>>>> + return node;
> >>>>> +}
> >>>>> +
> >>>>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
> >>>>> + uintptr_t *mem_size)
> >>>>> +{
> >>>>> + int node, root, addr_cells, size_cells;
> >>>>> + u64 base, size;
> >>>>> +
> >>>>> + /* Get root node's address cells and size cells. */
> >>>>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
> >>>>> + if (root < 0)
> >>>>> + return;
> >>>>> +
> >>>>> + /* Get memory base address and size. */
> >>>>> + fdt_for_each_subnode(node, dtb_early_va, root) {
> >>>>> + const char *dev_type;
> >>>>> + const u32 *reg;
> >>>>> +
> >>>>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
> >>>>> + if (!dev_type)
> >>>>> + continue;
> >>>>> +
> >>>>> + if (!strcmp(dev_type, "memory")) {
> >>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>>>> + if (!reg)
> >>>>> + return;
> >>>>> +
> >>>>> + reg = get_reg_address(addr_cells, reg, &base);
> >>>>> + reg = get_reg_address(size_cells, reg, &size);
> >>>>> +
> >>>>> + *mem_start = base;
> >>>>> + *mem_size = size;
> >>>>> +
> >>>>> + break;
> >>>>> + }
> >>>>> + }
> >>>>> +}
> >>>>> +
> >>>>> +/* Return a default seed if there is no HW generator. */
> >>>>> +static u64 kaslr_default_seed = ULL(-1);
> >>>>> +static __init u64 kaslr_get_seed(void)
> >>>>> +{
> >>>>> + int node, len;
> >>>>> + fdt64_t *prop;
> >>>>> + u64 ret;
> >>>>> +
> >>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>>>> + if (node < 0)
> >>>>> + return kaslr_default_seed++;
> >>>>> +
> >>>>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
> >>>>> + if (!prop || len != sizeof(u64))
> >>>>> + return kaslr_default_seed++;
> >>>>> +
> >>>>> + ret = fdt64_to_cpu(*prop);
> >>>>> +
> >>>>> + /* Re-write to zero for checking whether get seed at second time */
> >>>>> + *prop = 0;
> >>>>> +
> >>>>> + return ret;
> >>>>> +}
> >>>>> +
> >>>>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
> >>>>> + uintptr_t e2)
> >>>>> +{
> >>>>> + return e1 >= s2 && e2 >= s1;
> >>>>> +}
> >>>>
> >>>> Inline this function or use a macro maybe.
> >>>
> >>> Yes, sure. Thanks.
> >>>
> >>>>
> >>>>> +
> >>>>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
> >>>>> + uintptr_t end_addr)
> >>>>> +{
> >>>>> + int node, rsv_mem, addr_cells, size_cells;
> >>>>> +
> >>>>> + /* Get the reserved-memory node. */
> >>>>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
> >>>>> + &addr_cells,
> >>>>> + &size_cells);
> >>>>> + if (rsv_mem < 0)
> >>>>> + return false;
> >>>>> +
> >>>>> + /* Get memory base address and size. */
> >>>>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
> >>>>> + uint64_t base, size;
> >>>>> + const uint32_t *reg;
> >>>>> +
> >>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>>>> + if (!reg)
> >>>>> + return 0;
> >>>>> +
> >>>>> + reg = get_reg_address(addr_cells, reg, &base);
> >>>>> + reg = get_reg_address(size_cells, reg, &size);
> >>>>> +
> >>>>> + if (is_overlap(start_addr, end_addr, base, base + size))
> >>>>> + return true;
> >>>>> + }
> >>>>> +
> >>>>> + return false;
> >>>>> +}
> >>>>> +
> >>>>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
> >>>>> +{
> >>>>> + int node;
> >>>>> + uintptr_t initrd_start, initrd_end;
> >>>>> + fdt64_t *prop;
> >>>>> +
> >>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>>>> + if (node < 0)
> >>>>> + return false;
> >>>>> +
> >>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
> >>>>> + if (!prop)
> >>>>> + return false;
> >>>>> +
> >>>>> + initrd_start = fdt64_to_cpu(*prop);
> >>>>> +
> >>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
> >>>>> + if (!prop)
> >>>>> + return false;
> >>>>> +
> >>>>> + initrd_end = fdt64_to_cpu(*prop);
> >>>>> +
> >>>>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
> >>>>> +}
> >>>>> +
> >>>>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
> >>>>> +{
> >>>>> + uintptr_t dtb_start = dtb_early_pa;
> >>>>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
> >>>>> +
> >>>>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
> >>>>> +}
> >>>>> +
> >>>>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
> >>>>> + uintptr_t end_addr)
> >>>>> +{
> >>>>> + if (is_overlap_dtb(start_addr, end_addr))
> >>>>> + return true;
> >>>>> +
> >>>>> + if (is_overlap_initrd(start_addr, end_addr))
> >>>>> + return true;
> >>>>> +
> >>>>> + if (is_overlap_reserved_mem(start_addr, end_addr))
> >>>>> + return true;
> >>>>> +
> >>>>> + return false;
> >>>>> +}
> >>>>> +
> >>>>> +static inline __init unsigned long get_legal_offset(int random_index,
> >>>>> + int max_index,
> >>>>> + uintptr_t mem_start,
> >>>>> + uintptr_t kernel_size)
> >>>>> +{
> >>>>> + uintptr_t start_addr, end_addr;
> >>>>> + int idx, stop_idx;
> >>>>> +
> >>>>> + idx = stop_idx = random_index;
> >>>>> +
> >>>>> + do {
> >>>>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
> >>>>> + end_addr = start_addr + kernel_size;
> >>>>> +
> >>>>> + /* Check overlap to other regions. */
> >>>>> + if (!has_regions_overlapping(start_addr, end_addr))
> >>>>> + return idx * SZ_2M + kernel_size;
> >>>>> +
> >>>>> + if (idx-- < 0)
> >>>>> + idx = max_index;
> >>>>
> >>>> Isn't the fallback to max_index a security breach ? Because at some
> >>>> point, the kernel will be loaded at this specific address.
> >>>
> >>> The max_index is the maximum safe index for destination of new kernel
> >>> image. Could you give more explain here?
> >>>
> >>
> >> But max_index is not random at all. I really don't know if that's a
> >> problem, I just found intriguing the fact the kernel could be loaded at
> >> some specific location. Would it be more secure, instead of picking
> >> max_index as fallback when reaching 0, to pick another random number
> >> between random_index and max_index ?
> >
> > ok, I can get your point. The original idea here is that we get a
> > random index first, then we decrease the index to retry to find a good
> > place if there are overlapping with other regions. A bit like the ring
> > buffer, the end of index traversing is not zero, but the random_index
> > - 1, we might consider it as continuity, so we don't know where is the
> > end point because the start point is random, whether we stop at zero
> > or random_index - 1.
> >
> > Pick another random number is more secure when occurring overlapping,
> > but I a little bit worry that it would take very long time to retry
> > many times in the worst case. for example, there is just only one
> > index could fit kernel image in (except for original location). In the
> > meantime, we don't need to wait the index being decreased to zero,
> > because it seems to me that they are the same to stop at zero or
> > random_index - 1, so if we decide to re-calculate a new random number,
> > maybe we could remove the index decreasing here.
>
> But you're right that it could take some time before converging to a
> "good" index. Maybe we could restrict the index range to indexes that we
> know for sure will be good ?
>

Yes, it would be good for ensuring that we only need to get the random
number just once, but there are some points that need to be discussed. The
first one is that we couldn't dynamically allocate a memory space at
that moment, because the memblock is not ready, so we might need to
declare a big enough array statically to collect all good indexes.
Maybe CONFIG_MAXPHYSMEM_2GB and CONFIG_MAXPHYSMEM_128GB could be used
to decide the number of elements of this array. The second one is that
we always need to take the time to traverse the whole memory and check
for overlaps for all indexes no matter what the cases are. I'm not
sure whether it is good because this way increases the time and space
cost, but it would be more secure. Do you have any idea?


> Alex
>
> >
> >>
> >> Alex
> >>
> >>>>
> >>>>> +
> >>>>> + } while (idx != stop_idx);
> >>>>> +
> >>>>> + return 0;
> >>>>> +}
> >>>>> +
> >>>>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
> >>>>> +{
> >>>>> + size_t i;
> >>>>> + uintptr_t *ptr = (uintptr_t *) area;
> >>>>> +
> >>>>> + for (i = 0; i < size / sizeof(hash); i++) {
> >>>>> + /* Rotate by odd number of bits and XOR. */
> >>>>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
> >>>>> + hash ^= ptr[i];
> >>>>> + }
> >>>>> +
> >>>>> + return hash;
> >>>>> +}
> >>>>> +
> >>>>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
> >>>>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> >>>>> +{
> >>>>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
> >>>>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
> >>>>> + int index;
> >>>>> + u64 random = 0;
> >>>>> + cycles_t time_base;
> >>>>> +
> >>>>> + /* Attempt to create a simple but unpredictable starting entropy */
> >>>>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
> >>>>> +
> >>>>> + /*
> >>>>> + * If there is no HW random number generator, use timer to get a random
> >>>>> + * number. This is better than nothing but not enough secure.
> >>>>> + */
> >>>>> + time_base = get_cycles() << 32;
> >>>>> + time_base ^= get_cycles();
> >>>>> + random = rotate_xor(random, &time_base, sizeof(time_base));
> >>>>> +
> >>>>> + if (seed)
> >>>>> + random = rotate_xor(random, &seed, sizeof(seed));
> >>>>> +
> >>>>> + kaslr_get_mem_info(&mem_start, &mem_size);
> >>>>> + if (!mem_size)
> >>>>> + return 0;
> >>>>> +
> >>>>> + if (mem_start < MEM_RESERVE_START) {
> >>>>> + mem_size -= MEM_RESERVE_START - mem_start;
> >>>>> + mem_start = MEM_RESERVE_START;
> >>>>> + }
> >>>>> +
> >>>>> + /*
> >>>>> + * Limit randomization range within 1G, so we can exploit
> >>>>> + * early_pmd/early_pte during early page table phase.
> >>>>> + */
> >>>>> + random_size = min_t(u64,
> >>>>> + mem_size - (kernel_size_align * 2),
> >>>>> + SZ_1G - (kernel_size_align * 2));
> >>>>
> >>>> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
> >>>> PGDIR_SIZE macro here.
> >>>
> >>> OK, change it in the next version. Thanks.
> >>>
> >>>>
> >>>>> +
> >>>>> + /* The index of 2M block in whole avaliable region */
> >>>>> + index = random % (random_size / SZ_2M);
> >>>>> +
> >>>>> + return get_legal_offset(index, random_size / SZ_2M,
> >>>>> + mem_start, kernel_size_align);
> >>>>> +}
> >>>>> +
> >>>>> uintptr_t __init kaslr_early_init(void)
> >>>>> {
> >>>>> + u64 seed;
> >>>>> uintptr_t dest_start, dest_end;
> >>>>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> >>>>>
> >>>>> /* Get zero value at second time to avoid doing randomization again. */
> >>>>> - if (kaslr_offset)
> >>>>> + seed = kaslr_get_seed();
> >>>>> + if (!seed)
> >>>>> return 0;
> >>>>>
> >>>>> /* Get the random number for kaslr offset. */
> >>>>> - kaslr_offset = 0x10000000;
> >>>>> + kaslr_offset = get_random_offset(seed, kernel_size);
> >>>>>
> >>>>> /* Update kernel_virt_addr for get_kaslr_offset. */
> >>>>> kernel_virt_addr += kaslr_offset;
> >>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> >>>>> index 2f5b25f02b6c..34c6ecf2c599 100644
> >>>>> --- a/arch/riscv/mm/init.c
> >>>>> +++ b/arch/riscv/mm/init.c
> >>>>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
> >>>>> }
> >>>>> #endif /* CONFIG_BLK_DEV_INITRD */
> >>>>>
> >>>>> -static phys_addr_t dtb_early_pa __initdata;
> >>>>> +phys_addr_t dtb_early_pa __initdata;
> >>>>>
> >>>>> void __init setup_bootmem(void)
> >>>>> {
> >>>>>
> >>>>
> >>>> Alex

2020-04-12 06:55:00

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset



On 4/11/20 4:20 AM, Zong Li wrote:
> On Fri, Apr 10, 2020 at 11:58 PM Alex Ghiti <[email protected]> wrote:
>>
>> Hi Zong,
>>
>> On 4/9/20 6:31 AM, Zong Li wrote:
>>> On Thu, Apr 9, 2020 at 1:51 PM Alex Ghiti <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>> On 4/7/20 6:53 AM, Zong Li wrote:
>>>>> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>>>>>>
>>>>>>
>>>>>> On 3/24/20 3:30 AM, Zong Li wrote:
>>>>>>> Entropy is derived from the banner and timer, it is better than nothing
>>>>>>> but not enough secure, so previous stage may pass entropy via the device
>>>>>>> tree /chosen/kaslr-seed node.
>>>>>>>
>>>>>>> We limit randomization range within 1GB, so we can exploit early page
>>>>>>> table to map new destination of kernel image. Additionally, the kernel
>>>>>>> offset need 2M alignment to ensure it's good in PMD page table.
>>>>>>>
>>>>>>> We also checks the kernel offset whether it's safe by avoiding to
>>>>>>> overlaps with dtb, initrd and reserved memory regions.
>>>>>>>
>>>>>>
>>>>>> That maybe changes the way my sv48 patchset will be implemented: I can't
>>>>>> get user preference (3-level or 4-level) by any means, device-tree or
>>>>>> kernel parameter.
>>>>>>
>>>>>> But I don't see how you could get a random offset without info from the
>>>>>> device tree anyway (reserved memory regions especially), so maybe I
>>>>>> could parse dtb for allowing the user to choose. I'll move this
>>>>>> discussion to the sv48 introduction.
>>>>>
>>>>> Maybe I'm a little bit misunderstanding here, but I think I got the
>>>>> random offset through some information by parsing dtb.
>>>>>
>>>>
>>>> I was just saying that I may use the dtb too in sv48 patchset to make it
>>>> possible for users to choose sv39 even if sv48 is supported by hardware
>>>> (which is not the case in my current patchset).
>>>>
>>>>>>
>>>>>>> Signed-off-by: Zong Li <[email protected]>
>>>>>>> ---
>>>>>>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
>>>>>>> arch/riscv/mm/init.c | 2 +-
>>>>>>> 2 files changed, 273 insertions(+), 3 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
>>>>>>> index 281b5fcca5c8..9ec2b608eb7f 100644
>>>>>>> --- a/arch/riscv/kernel/kaslr.c
>>>>>>> +++ b/arch/riscv/kernel/kaslr.c
>>>>>>> @@ -11,23 +11,293 @@
>>>>>>> #include <asm/cacheflush.h>
>>>>>>>
>>>>>>> extern char _start[], _end[];
>>>>>>> +extern void *dtb_early_va;
>>>>>>> +extern phys_addr_t dtb_early_pa;
>>>>>>> extern void secondary_random_target(void);
>>>>>>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
>>>>>>>
>>>>>>> uintptr_t secondary_next_target __initdata;
>>>>>>> static uintptr_t kaslr_offset __initdata;
>>>>>>>
>>>>>>> +static const __init u32 *get_reg_address(int root_cells,
>>>>>>> + const u32 *value, u64 *result)
>>>>>>> +{
>>>>>>> + int cell;
>>>>>>> + *result = 0;
>>>>>>> +
>>>>>>> + for (cell = root_cells; cell > 0; --cell)
>>>>>>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
>>>>>>> +
>>>>>>> + return value;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
>>>>>>> + int *size_cell)
>>>>>>> +{
>>>>>>> + int node = fdt_path_offset(dtb_early_va, path);
>>>>>>> + fdt64_t *prop;
>>>>>>> +
>>>>>>> + if (node < 0)
>>>>>>> + return -EINVAL;
>>>>>>> +
>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
>>>>>>> + if (!prop)
>>>>>>> + return -EINVAL;
>>>>>>> + *addr_cell = fdt32_to_cpu(*prop);
>>>>>>> +
>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
>>>>>>> + if (!prop)
>>>>>>> + return -EINVAL;
>>>>>>> + *size_cell = fdt32_to_cpu(*prop);
>>>>>>> +
>>>>>>> + return node;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
>>>>>>> + uintptr_t *mem_size)
>>>>>>> +{
>>>>>>> + int node, root, addr_cells, size_cells;
>>>>>>> + u64 base, size;
>>>>>>> +
>>>>>>> + /* Get root node's address cells and size cells. */
>>>>>>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
>>>>>>> + if (root < 0)
>>>>>>> + return;
>>>>>>> +
>>>>>>> + /* Get memory base address and size. */
>>>>>>> + fdt_for_each_subnode(node, dtb_early_va, root) {
>>>>>>> + const char *dev_type;
>>>>>>> + const u32 *reg;
>>>>>>> +
>>>>>>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
>>>>>>> + if (!dev_type)
>>>>>>> + continue;
>>>>>>> +
>>>>>>> + if (!strcmp(dev_type, "memory")) {
>>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>>>>>> + if (!reg)
>>>>>>> + return;
>>>>>>> +
>>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
>>>>>>> + reg = get_reg_address(size_cells, reg, &size);
>>>>>>> +
>>>>>>> + *mem_start = base;
>>>>>>> + *mem_size = size;
>>>>>>> +
>>>>>>> + break;
>>>>>>> + }
>>>>>>> + }
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* Return a default seed if there is no HW generator. */
>>>>>>> +static u64 kaslr_default_seed = ULL(-1);
>>>>>>> +static __init u64 kaslr_get_seed(void)
>>>>>>> +{
>>>>>>> + int node, len;
>>>>>>> + fdt64_t *prop;
>>>>>>> + u64 ret;
>>>>>>> +
>>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>>>>>> + if (node < 0)
>>>>>>> + return kaslr_default_seed++;
>>>>>>> +
>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
>>>>>>> + if (!prop || len != sizeof(u64))
>>>>>>> + return kaslr_default_seed++;
>>>>>>> +
>>>>>>> + ret = fdt64_to_cpu(*prop);
>>>>>>> +
>>>>>>> + /* Re-write to zero for checking whether get seed at second time */
>>>>>>> + *prop = 0;
>>>>>>> +
>>>>>>> + return ret;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
>>>>>>> + uintptr_t e2)
>>>>>>> +{
>>>>>>> + return e1 >= s2 && e2 >= s1;
>>>>>>> +}
>>>>>>
>>>>>> Inline this function or use a macro maybe.
>>>>>
>>>>> Yes, sure. Thanks.
>>>>>
>>>>>>
>>>>>>> +
>>>>>>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
>>>>>>> + uintptr_t end_addr)
>>>>>>> +{
>>>>>>> + int node, rsv_mem, addr_cells, size_cells;
>>>>>>> +
>>>>>>> + /* Get the reserved-memory node. */
>>>>>>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
>>>>>>> + &addr_cells,
>>>>>>> + &size_cells);
>>>>>>> + if (rsv_mem < 0)
>>>>>>> + return false;
>>>>>>> +
>>>>>>> + /* Get memory base address and size. */
>>>>>>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
>>>>>>> + uint64_t base, size;
>>>>>>> + const uint32_t *reg;
>>>>>>> +
>>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>>>>>> + if (!reg)
>>>>>>> + return 0;
>>>>>>> +
>>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
>>>>>>> + reg = get_reg_address(size_cells, reg, &size);
>>>>>>> +
>>>>>>> + if (is_overlap(start_addr, end_addr, base, base + size))
>>>>>>> + return true;
>>>>>>> + }
>>>>>>> +
>>>>>>> + return false;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
>>>>>>> +{
>>>>>>> + int node;
>>>>>>> + uintptr_t initrd_start, initrd_end;
>>>>>>> + fdt64_t *prop;
>>>>>>> +
>>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>>>>>> + if (node < 0)
>>>>>>> + return false;
>>>>>>> +
>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
>>>>>>> + if (!prop)
>>>>>>> + return false;
>>>>>>> +
>>>>>>> + initrd_start = fdt64_to_cpu(*prop);
>>>>>>> +
>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
>>>>>>> + if (!prop)
>>>>>>> + return false;
>>>>>>> +
>>>>>>> + initrd_end = fdt64_to_cpu(*prop);
>>>>>>> +
>>>>>>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
>>>>>>> +{
>>>>>>> + uintptr_t dtb_start = dtb_early_pa;
>>>>>>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
>>>>>>> +
>>>>>>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
>>>>>>> + uintptr_t end_addr)
>>>>>>> +{
>>>>>>> + if (is_overlap_dtb(start_addr, end_addr))
>>>>>>> + return true;
>>>>>>> +
>>>>>>> + if (is_overlap_initrd(start_addr, end_addr))
>>>>>>> + return true;
>>>>>>> +
>>>>>>> + if (is_overlap_reserved_mem(start_addr, end_addr))
>>>>>>> + return true;
>>>>>>> +
>>>>>>> + return false;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static inline __init unsigned long get_legal_offset(int random_index,
>>>>>>> + int max_index,
>>>>>>> + uintptr_t mem_start,
>>>>>>> + uintptr_t kernel_size)
>>>>>>> +{
>>>>>>> + uintptr_t start_addr, end_addr;
>>>>>>> + int idx, stop_idx;
>>>>>>> +
>>>>>>> + idx = stop_idx = random_index;
>>>>>>> +
>>>>>>> + do {
>>>>>>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
>>>>>>> + end_addr = start_addr + kernel_size;
>>>>>>> +
>>>>>>> + /* Check overlap to other regions. */
>>>>>>> + if (!has_regions_overlapping(start_addr, end_addr))
>>>>>>> + return idx * SZ_2M + kernel_size;
>>>>>>> +
>>>>>>> + if (idx-- < 0)
>>>>>>> + idx = max_index;
>>>>>>
>>>>>> Isn't the fallback to max_index a security breach ? Because at some
>>>>>> point, the kernel will be loaded at this specific address.
>>>>>
>>>>> The max_index is the maximum safe index for destination of new kernel
>>>>> image. Could you give more explain here?
>>>>>
>>>>
>>>> But max_index is not random at all. I really don't know if that's a
>>>> problem, I just found intriguing the fact the kernel could be loaded at
>>>> some specific location. Would it be more secure, instead of picking
>>>> max_index as fallback when reaching 0, to pick another random number
>>>> between random_index and max_index ?
>>>
>>> ok, I can get your point. The original idea here is that we get a
>>> random index first, then we decrease the index to retry to find a good
>>> place if there are overlapping with other regions. A bit like the ring
>>> buffer, the end of index traversing is not zero, but the random_index
>>> - 1, we might consider it as continuity, so we don't know where is the
>>> end point because the start point is random, whether we stop at zero
>>> or random_index - 1.
>>>
>>> Pick another random number is more secure when occurring overlapping,
>>> but I a little bit worry that it would take very long time to retry
>>> many times in the worst case. for example, there is just only one
>>> index could fit kernel image in (except for original location). In the
>>> meantime, we don't need to wait the index being decreased to zero,
>>> because it seems to me that they are the same to stop at zero or
>>> random_index - 1, so if we decide to re-calculate a new random number,
>>> maybe we could remove the index decreasing here.
>>
>> But you're right that it could take some time before converging to a
>> "good" index. Maybe we could restrict the index range to indexes that we
>> know for sure will be good ?
>>
>
> Yes, it would be good for ensuring that we only need to get the random
> number just once, but there are some points need to be discussed. The
> first one is that we couldn't dynamically allocate a memory space at
> that moment, because the memblock is not ready, so we might need to
> declare a enough big array at static time to collect all good indexes.
> Maybe CONFIG_MAXPHYSMEM_2GB and CONFIG_MAXPHYSMEM_128GB could be used
> to decide the number of elements of this array. The second one is that
> we always need to take the time to traverse the whole memory and check
> the overlapping for all indexes no matter what the cases are. I'm not
> sure whether it is good because this way increases the time and space
> cost, but it would be more secure. Do you have any idea?
>

What about simply finding the biggest range of contiguous non-reserved
memory and getting an index from there?

>
>> Alex
>>
>>>
>>>>
>>>> Alex
>>>>
>>>>>>
>>>>>>> +
>>>>>>> + } while (idx != stop_idx);
>>>>>>> +
>>>>>>> + return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
>>>>>>> +{
>>>>>>> + size_t i;
>>>>>>> + uintptr_t *ptr = (uintptr_t *) area;
>>>>>>> +
>>>>>>> + for (i = 0; i < size / sizeof(hash); i++) {
>>>>>>> + /* Rotate by odd number of bits and XOR. */
>>>>>>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
>>>>>>> + hash ^= ptr[i];
>>>>>>> + }
>>>>>>> +
>>>>>>> + return hash;
>>>>>>> +}
>>>>>>> +
>>>>>>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
>>>>>>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
>>>>>>> +{
>>>>>>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
>>>>>>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
>>>>>>> + int index;
>>>>>>> + u64 random = 0;
>>>>>>> + cycles_t time_base;
>>>>>>> +
>>>>>>> + /* Attempt to create a simple but unpredictable starting entropy */
>>>>>>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
>>>>>>> +
>>>>>>> + /*
>>>>>>> + * If there is no HW random number generator, use timer to get a random
>>>>>>> + * number. This is better than nothing but not enough secure.
>>>>>>> + */
>>>>>>> + time_base = get_cycles() << 32;
>>>>>>> + time_base ^= get_cycles();
>>>>>>> + random = rotate_xor(random, &time_base, sizeof(time_base));
>>>>>>> +
>>>>>>> + if (seed)
>>>>>>> + random = rotate_xor(random, &seed, sizeof(seed));
>>>>>>> +
>>>>>>> + kaslr_get_mem_info(&mem_start, &mem_size);
>>>>>>> + if (!mem_size)
>>>>>>> + return 0;
>>>>>>> +
>>>>>>> + if (mem_start < MEM_RESERVE_START) {
>>>>>>> + mem_size -= MEM_RESERVE_START - mem_start;
>>>>>>> + mem_start = MEM_RESERVE_START;
>>>>>>> + }
>>>>>>> +
>>>>>>> + /*
>>>>>>> + * Limit randomization range within 1G, so we can exploit
>>>>>>> + * early_pmd/early_pte during early page table phase.
>>>>>>> + */
>>>>>>> + random_size = min_t(u64,
>>>>>>> + mem_size - (kernel_size_align * 2),
>>>>>>> + SZ_1G - (kernel_size_align * 2));
>>>>>>
>>>>>> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
>>>>>> PGDIR_SIZE macro here.
>>>>>
>>>>> OK, change it in the next version. Thanks.
>>>>>
>>>>>>
>>>>>>> +
>>>>>>> + /* The index of 2M block in whole avaliable region */
>>>>>>> + index = random % (random_size / SZ_2M);
>>>>>>> +
>>>>>>> + return get_legal_offset(index, random_size / SZ_2M,
>>>>>>> + mem_start, kernel_size_align);
>>>>>>> +}
>>>>>>> +
>>>>>>> uintptr_t __init kaslr_early_init(void)
>>>>>>> {
>>>>>>> + u64 seed;
>>>>>>> uintptr_t dest_start, dest_end;
>>>>>>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>>>>>>>
>>>>>>> /* Get zero value at second time to avoid doing randomization again. */
>>>>>>> - if (kaslr_offset)
>>>>>>> + seed = kaslr_get_seed();
>>>>>>> + if (!seed)
>>>>>>> return 0;
>>>>>>>
>>>>>>> /* Get the random number for kaslr offset. */
>>>>>>> - kaslr_offset = 0x10000000;
>>>>>>> + kaslr_offset = get_random_offset(seed, kernel_size);
>>>>>>>
>>>>>>> /* Update kernel_virt_addr for get_kaslr_offset. */
>>>>>>> kernel_virt_addr += kaslr_offset;
>>>>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>>>>>> index 2f5b25f02b6c..34c6ecf2c599 100644
>>>>>>> --- a/arch/riscv/mm/init.c
>>>>>>> +++ b/arch/riscv/mm/init.c
>>>>>>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
>>>>>>> }
>>>>>>> #endif /* CONFIG_BLK_DEV_INITRD */
>>>>>>>
>>>>>>> -static phys_addr_t dtb_early_pa __initdata;
>>>>>>> +phys_addr_t dtb_early_pa __initdata;
>>>>>>>
>>>>>>> void __init setup_bootmem(void)
>>>>>>> {
>>>>>>>
>>>>>>
>>>>>> Alex

2020-04-14 13:57:16

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset

On Sun, Apr 12, 2020 at 2:53 PM Alex Ghiti <[email protected]> wrote:
>
>
>
> On 4/11/20 4:20 AM, Zong Li wrote:
> > On Fri, Apr 10, 2020 at 11:58 PM Alex Ghiti <[email protected]> wrote:
> >>
> >> Hi Zong,
> >>
> >> On 4/9/20 6:31 AM, Zong Li wrote:
> >>> On Thu, Apr 9, 2020 at 1:51 PM Alex Ghiti <[email protected]> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 4/7/20 6:53 AM, Zong Li wrote:
> >>>>> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 3/24/20 3:30 AM, Zong Li wrote:
> >>>>>>> Entropy is derived from the banner and timer, it is better than nothing
> >>>>>>> but not enough secure, so previous stage may pass entropy via the device
> >>>>>>> tree /chosen/kaslr-seed node.
> >>>>>>>
> >>>>>>> We limit randomization range within 1GB, so we can exploit early page
> >>>>>>> table to map new destination of kernel image. Additionally, the kernel
> >>>>>>> offset need 2M alignment to ensure it's good in PMD page table.
> >>>>>>>
> >>>>>>> We also checks the kernel offset whether it's safe by avoiding to
> >>>>>>> overlaps with dtb, initrd and reserved memory regions.
> >>>>>>>
> >>>>>>
> >>>>>> That maybe changes the way my sv48 patchset will be implemented: I can't
> >>>>>> get user preference (3-level or 4-level) by any means, device-tree or
> >>>>>> kernel parameter.
> >>>>>>
> >>>>>> But I don't see how you could get a random offset without info from the
> >>>>>> device tree anyway (reserved memory regions especially), so maybe I
> >>>>>> could parse dtb for allowing the user to choose. I'll move this
> >>>>>> discussion to the sv48 introduction.
> >>>>>
> >>>>> Maybe I'm a little bit misunderstanding here, but I think I got the
> >>>>> random offset through some information by parsing dtb.
> >>>>>
> >>>>
> >>>> I was just saying that I may use the dtb too in sv48 patchset to make it
> >>>> possible for users to choose sv39 even if sv48 is supported by hardware
> >>>> (which is not the case in my current patchset).
> >>>>
> >>>>>>
> >>>>>>> Signed-off-by: Zong Li <[email protected]>
> >>>>>>> ---
> >>>>>>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
> >>>>>>> arch/riscv/mm/init.c | 2 +-
> >>>>>>> 2 files changed, 273 insertions(+), 3 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> >>>>>>> index 281b5fcca5c8..9ec2b608eb7f 100644
> >>>>>>> --- a/arch/riscv/kernel/kaslr.c
> >>>>>>> +++ b/arch/riscv/kernel/kaslr.c
> >>>>>>> @@ -11,23 +11,293 @@
> >>>>>>> #include <asm/cacheflush.h>
> >>>>>>>
> >>>>>>> extern char _start[], _end[];
> >>>>>>> +extern void *dtb_early_va;
> >>>>>>> +extern phys_addr_t dtb_early_pa;
> >>>>>>> extern void secondary_random_target(void);
> >>>>>>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> >>>>>>>
> >>>>>>> uintptr_t secondary_next_target __initdata;
> >>>>>>> static uintptr_t kaslr_offset __initdata;
> >>>>>>>
> >>>>>>> +static const __init u32 *get_reg_address(int root_cells,
> >>>>>>> + const u32 *value, u64 *result)
> >>>>>>> +{
> >>>>>>> + int cell;
> >>>>>>> + *result = 0;
> >>>>>>> +
> >>>>>>> + for (cell = root_cells; cell > 0; --cell)
> >>>>>>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
> >>>>>>> +
> >>>>>>> + return value;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
> >>>>>>> + int *size_cell)
> >>>>>>> +{
> >>>>>>> + int node = fdt_path_offset(dtb_early_va, path);
> >>>>>>> + fdt64_t *prop;
> >>>>>>> +
> >>>>>>> + if (node < 0)
> >>>>>>> + return -EINVAL;
> >>>>>>> +
> >>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
> >>>>>>> + if (!prop)
> >>>>>>> + return -EINVAL;
> >>>>>>> + *addr_cell = fdt32_to_cpu(*prop);
> >>>>>>> +
> >>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
> >>>>>>> + if (!prop)
> >>>>>>> + return -EINVAL;
> >>>>>>> + *size_cell = fdt32_to_cpu(*prop);
> >>>>>>> +
> >>>>>>> + return node;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
> >>>>>>> + uintptr_t *mem_size)
> >>>>>>> +{
> >>>>>>> + int node, root, addr_cells, size_cells;
> >>>>>>> + u64 base, size;
> >>>>>>> +
> >>>>>>> + /* Get root node's address cells and size cells. */
> >>>>>>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
> >>>>>>> + if (root < 0)
> >>>>>>> + return;
> >>>>>>> +
> >>>>>>> + /* Get memory base address and size. */
> >>>>>>> + fdt_for_each_subnode(node, dtb_early_va, root) {
> >>>>>>> + const char *dev_type;
> >>>>>>> + const u32 *reg;
> >>>>>>> +
> >>>>>>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
> >>>>>>> + if (!dev_type)
> >>>>>>> + continue;
> >>>>>>> +
> >>>>>>> + if (!strcmp(dev_type, "memory")) {
> >>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>>>>>> + if (!reg)
> >>>>>>> + return;
> >>>>>>> +
> >>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
> >>>>>>> + reg = get_reg_address(size_cells, reg, &size);
> >>>>>>> +
> >>>>>>> + *mem_start = base;
> >>>>>>> + *mem_size = size;
> >>>>>>> +
> >>>>>>> + break;
> >>>>>>> + }
> >>>>>>> + }
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +/* Return a default seed if there is no HW generator. */
> >>>>>>> +static u64 kaslr_default_seed = ULL(-1);
> >>>>>>> +static __init u64 kaslr_get_seed(void)
> >>>>>>> +{
> >>>>>>> + int node, len;
> >>>>>>> + fdt64_t *prop;
> >>>>>>> + u64 ret;
> >>>>>>> +
> >>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>>>>>> + if (node < 0)
> >>>>>>> + return kaslr_default_seed++;
> >>>>>>> +
> >>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
> >>>>>>> + if (!prop || len != sizeof(u64))
> >>>>>>> + return kaslr_default_seed++;
> >>>>>>> +
> >>>>>>> + ret = fdt64_to_cpu(*prop);
> >>>>>>> +
> >>>>>>> + /* Re-write to zero for checking whether get seed at second time */
> >>>>>>> + *prop = 0;
> >>>>>>> +
> >>>>>>> + return ret;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
> >>>>>>> + uintptr_t e2)
> >>>>>>> +{
> >>>>>>> + return e1 >= s2 && e2 >= s1;
> >>>>>>> +}
> >>>>>>
> >>>>>> Inline this function or use a macro maybe.
> >>>>>
> >>>>> Yes, sure. Thanks.
> >>>>>
> >>>>>>
> >>>>>>> +
> >>>>>>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
> >>>>>>> + uintptr_t end_addr)
> >>>>>>> +{
> >>>>>>> + int node, rsv_mem, addr_cells, size_cells;
> >>>>>>> +
> >>>>>>> + /* Get the reserved-memory node. */
> >>>>>>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
> >>>>>>> + &addr_cells,
> >>>>>>> + &size_cells);
> >>>>>>> + if (rsv_mem < 0)
> >>>>>>> + return false;
> >>>>>>> +
> >>>>>>> + /* Get memory base address and size. */
> >>>>>>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
> >>>>>>> + uint64_t base, size;
> >>>>>>> + const uint32_t *reg;
> >>>>>>> +
> >>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>>>>>> + if (!reg)
> >>>>>>> + return 0;
> >>>>>>> +
> >>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
> >>>>>>> + reg = get_reg_address(size_cells, reg, &size);
> >>>>>>> +
> >>>>>>> + if (is_overlap(start_addr, end_addr, base, base + size))
> >>>>>>> + return true;
> >>>>>>> + }
> >>>>>>> +
> >>>>>>> + return false;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
> >>>>>>> +{
> >>>>>>> + int node;
> >>>>>>> + uintptr_t initrd_start, initrd_end;
> >>>>>>> + fdt64_t *prop;
> >>>>>>> +
> >>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>>>>>> + if (node < 0)
> >>>>>>> + return false;
> >>>>>>> +
> >>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
> >>>>>>> + if (!prop)
> >>>>>>> + return false;
> >>>>>>> +
> >>>>>>> + initrd_start = fdt64_to_cpu(*prop);
> >>>>>>> +
> >>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
> >>>>>>> + if (!prop)
> >>>>>>> + return false;
> >>>>>>> +
> >>>>>>> + initrd_end = fdt64_to_cpu(*prop);
> >>>>>>> +
> >>>>>>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
> >>>>>>> +{
> >>>>>>> + uintptr_t dtb_start = dtb_early_pa;
> >>>>>>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
> >>>>>>> +
> >>>>>>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
> >>>>>>> + uintptr_t end_addr)
> >>>>>>> +{
> >>>>>>> + if (is_overlap_dtb(start_addr, end_addr))
> >>>>>>> + return true;
> >>>>>>> +
> >>>>>>> + if (is_overlap_initrd(start_addr, end_addr))
> >>>>>>> + return true;
> >>>>>>> +
> >>>>>>> + if (is_overlap_reserved_mem(start_addr, end_addr))
> >>>>>>> + return true;
> >>>>>>> +
> >>>>>>> + return false;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static inline __init unsigned long get_legal_offset(int random_index,
> >>>>>>> + int max_index,
> >>>>>>> + uintptr_t mem_start,
> >>>>>>> + uintptr_t kernel_size)
> >>>>>>> +{
> >>>>>>> + uintptr_t start_addr, end_addr;
> >>>>>>> + int idx, stop_idx;
> >>>>>>> +
> >>>>>>> + idx = stop_idx = random_index;
> >>>>>>> +
> >>>>>>> + do {
> >>>>>>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
> >>>>>>> + end_addr = start_addr + kernel_size;
> >>>>>>> +
> >>>>>>> + /* Check overlap to other regions. */
> >>>>>>> + if (!has_regions_overlapping(start_addr, end_addr))
> >>>>>>> + return idx * SZ_2M + kernel_size;
> >>>>>>> +
> >>>>>>> + if (idx-- < 0)
> >>>>>>> + idx = max_index;
> >>>>>>
> >>>>>> Isn't the fallback to max_index a security breach ? Because at some
> >>>>>> point, the kernel will be loaded at this specific address.
> >>>>>
> >>>>> The max_index is the maximum safe index for destination of new kernel
> >>>>> image. Could you give more explain here?
> >>>>>
> >>>>
> >>>> But max_index is not random at all. I really don't know if that's a
> >>>> problem, I just found intriguing the fact the kernel could be loaded at
> >>>> some specific location. Would it be more secure, instead of picking
> >>>> max_index as fallback when reaching 0, to pick another random number
> >>>> between random_index and max_index ?
> >>>
> >>> ok, I can get your point. The original idea here is that we get a
> >>> random index first, then we decrease the index to retry to find a good
> >>> place if there are overlapping with other regions. A bit like the ring
> >>> buffer, the end of index traversing is not zero, but the random_index
> >>> - 1, we might consider it as continuity, so we don't know where is the
> >>> end point because the start point is random, whether we stop at zero
> >>> or random_index - 1.
> >>>
> >>> Pick another random number is more secure when occurring overlapping,
> >>> but I a little bit worry that it would take very long time to retry
> >>> many times in the worst case. for example, there is just only one
> >>> index could fit kernel image in (except for original location). In the
> >>> meantime, we don't need to wait the index being decreased to zero,
> >>> because it seems to me that they are the same to stop at zero or
> >>> random_index - 1, so if we decide to re-calculate a new random number,
> >>> maybe we could remove the index decreasing here.
> >>
> >> But you're right that it could take some time before converging to a
> >> "good" index. Maybe we could restrict the index range to indexes that we
> >> know for sure will be good ?
> >>
> >
> > Yes, it would be good for ensuring that we only need to get the random
> > number just once, but there are some points need to be discussed. The
> > first one is that we couldn't dynamically allocate a memory space at
> > that moment, because the memblock is not ready, so we might need to
> > declare a enough big array at static time to collect all good indexes.
> > Maybe CONFIG_MAXPHYSMEM_2GB and CONFIG_MAXPHYSMEM_128GB could be used
> > to decide the number of elements of this array. The second one is that
> > we always need to take the time to traverse the whole memory and check
> > the overlapping for all indexes no matter what the cases are. I'm not
> > sure whether it is good because this way increases the time and space
> > cost, but it would be more secure. Do you have any idea?
> >
>
> What about simply finding the biggest range of contiguous non-reserved
> memory and getting an index from there ?

This needs something like what was mentioned above: we need a big enough
array to collect the indexes of the biggest range, and to check whether
all of those indexes are safe, and it would limit and reduce the random
range we could use. In the original way, the value of max_index won't be
the end of the index traversal; it would continue to decrease the index
to find a good place until the index becomes random_index again, so the
kernel won't be loaded at the specific location that max_index points
to. It seems to me that the concern you mentioned doesn't apply here.

>
> >
> >> Alex
> >>
> >>>
> >>>>
> >>>> Alex
> >>>>
> >>>>>>
> >>>>>>> +
> >>>>>>> + } while (idx != stop_idx);
> >>>>>>> +
> >>>>>>> + return 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
> >>>>>>> +{
> >>>>>>> + size_t i;
> >>>>>>> + uintptr_t *ptr = (uintptr_t *) area;
> >>>>>>> +
> >>>>>>> + for (i = 0; i < size / sizeof(hash); i++) {
> >>>>>>> + /* Rotate by odd number of bits and XOR. */
> >>>>>>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
> >>>>>>> + hash ^= ptr[i];
> >>>>>>> + }
> >>>>>>> +
> >>>>>>> + return hash;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
> >>>>>>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> >>>>>>> +{
> >>>>>>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
> >>>>>>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
> >>>>>>> + int index;
> >>>>>>> + u64 random = 0;
> >>>>>>> + cycles_t time_base;
> >>>>>>> +
> >>>>>>> + /* Attempt to create a simple but unpredictable starting entropy */
> >>>>>>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
> >>>>>>> +
> >>>>>>> + /*
> >>>>>>> + * If there is no HW random number generator, use timer to get a random
> >>>>>>> + * number. This is better than nothing but not enough secure.
> >>>>>>> + */
> >>>>>>> + time_base = get_cycles() << 32;
> >>>>>>> + time_base ^= get_cycles();
> >>>>>>> + random = rotate_xor(random, &time_base, sizeof(time_base));
> >>>>>>> +
> >>>>>>> + if (seed)
> >>>>>>> + random = rotate_xor(random, &seed, sizeof(seed));
> >>>>>>> +
> >>>>>>> + kaslr_get_mem_info(&mem_start, &mem_size);
> >>>>>>> + if (!mem_size)
> >>>>>>> + return 0;
> >>>>>>> +
> >>>>>>> + if (mem_start < MEM_RESERVE_START) {
> >>>>>>> + mem_size -= MEM_RESERVE_START - mem_start;
> >>>>>>> + mem_start = MEM_RESERVE_START;
> >>>>>>> + }
> >>>>>>> +
> >>>>>>> + /*
> >>>>>>> + * Limit randomization range within 1G, so we can exploit
> >>>>>>> + * early_pmd/early_pte during early page table phase.
> >>>>>>> + */
> >>>>>>> + random_size = min_t(u64,
> >>>>>>> + mem_size - (kernel_size_align * 2),
> >>>>>>> + SZ_1G - (kernel_size_align * 2));
> >>>>>>
> >>>>>> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
> >>>>>> PGDIR_SIZE macro here.
> >>>>>
> >>>>> OK, change it in the next version. Thanks.
> >>>>>
> >>>>>>
> >>>>>>> +
> >>>>>>> + /* The index of 2M block in whole avaliable region */
> >>>>>>> + index = random % (random_size / SZ_2M);
> >>>>>>> +
> >>>>>>> + return get_legal_offset(index, random_size / SZ_2M,
> >>>>>>> + mem_start, kernel_size_align);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> uintptr_t __init kaslr_early_init(void)
> >>>>>>> {
> >>>>>>> + u64 seed;
> >>>>>>> uintptr_t dest_start, dest_end;
> >>>>>>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> >>>>>>>
> >>>>>>> /* Get zero value at second time to avoid doing randomization again. */
> >>>>>>> - if (kaslr_offset)
> >>>>>>> + seed = kaslr_get_seed();
> >>>>>>> + if (!seed)
> >>>>>>> return 0;
> >>>>>>>
> >>>>>>> /* Get the random number for kaslr offset. */
> >>>>>>> - kaslr_offset = 0x10000000;
> >>>>>>> + kaslr_offset = get_random_offset(seed, kernel_size);
> >>>>>>>
> >>>>>>> /* Update kernel_virt_addr for get_kaslr_offset. */
> >>>>>>> kernel_virt_addr += kaslr_offset;
> >>>>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> >>>>>>> index 2f5b25f02b6c..34c6ecf2c599 100644
> >>>>>>> --- a/arch/riscv/mm/init.c
> >>>>>>> +++ b/arch/riscv/mm/init.c
> >>>>>>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
> >>>>>>> }
> >>>>>>> #endif /* CONFIG_BLK_DEV_INITRD */
> >>>>>>>
> >>>>>>> -static phys_addr_t dtb_early_pa __initdata;
> >>>>>>> +phys_addr_t dtb_early_pa __initdata;
> >>>>>>>
> >>>>>>> void __init setup_bootmem(void)
> >>>>>>> {
> >>>>>>>
> >>>>>>
> >>>>>> Alex

2020-04-14 14:41:08

by Zong Li

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset

On Tue, Apr 14, 2020 at 1:43 PM Alex Ghiti <[email protected]> wrote:
>
>
>
> On 4/13/20 10:46 PM, Zong Li wrote:
> > On Sun, Apr 12, 2020 at 2:53 PM Alex Ghiti <[email protected]> wrote:
> >>
> >>
> >>
> >> On 4/11/20 4:20 AM, Zong Li wrote:
> >>> On Fri, Apr 10, 2020 at 11:58 PM Alex Ghiti <[email protected]> wrote:
> >>>>
> >>>> Hi Zong,
> >>>>
> >>>> On 4/9/20 6:31 AM, Zong Li wrote:
> >>>>> On Thu, Apr 9, 2020 at 1:51 PM Alex Ghiti <[email protected]> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 4/7/20 6:53 AM, Zong Li wrote:
> >>>>>>> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 3/24/20 3:30 AM, Zong Li wrote:
> >>>>>>>>> Entropy is derived from the banner and timer, it is better than nothing
> >>>>>>>>> but not enough secure, so previous stage may pass entropy via the device
> >>>>>>>>> tree /chosen/kaslr-seed node.
> >>>>>>>>>
> >>>>>>>>> We limit randomization range within 1GB, so we can exploit early page
> >>>>>>>>> table to map new destination of kernel image. Additionally, the kernel
> >>>>>>>>> offset need 2M alignment to ensure it's good in PMD page table.
> >>>>>>>>>
> >>>>>>>>> We also checks the kernel offset whether it's safe by avoiding to
> >>>>>>>>> overlaps with dtb, initrd and reserved memory regions.
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> That maybe changes the way my sv48 patchset will be implemented: I can't
> >>>>>>>> get user preference (3-level or 4-level) by any means, device-tree or
> >>>>>>>> kernel parameter.
> >>>>>>>>
> >>>>>>>> But I don't see how you could get a random offset without info from the
> >>>>>>>> device tree anyway (reserved memory regions especially), so maybe I
> >>>>>>>> could parse dtb for allowing the user to choose. I'll move this
> >>>>>>>> discussion to the sv48 introduction.
> >>>>>>>
> >>>>>>> Maybe I'm a little bit misunderstanding here, but I think I got the
> >>>>>>> random offset through some information by parsing dtb.
> >>>>>>>
> >>>>>>
> >>>>>> I was just saying that I may use the dtb too in sv48 patchset to make it
> >>>>>> possible for users to choose sv39 even if sv48 is supported by hardware
> >>>>>> (which is not the case in my current patchset).
> >>>>>>
> >>>>>>>>
> >>>>>>>>> Signed-off-by: Zong Li <[email protected]>
> >>>>>>>>> ---
> >>>>>>>>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
> >>>>>>>>> arch/riscv/mm/init.c | 2 +-
> >>>>>>>>> 2 files changed, 273 insertions(+), 3 deletions(-)
> >>>>>>>>>
> >>>>>>>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
> >>>>>>>>> index 281b5fcca5c8..9ec2b608eb7f 100644
> >>>>>>>>> --- a/arch/riscv/kernel/kaslr.c
> >>>>>>>>> +++ b/arch/riscv/kernel/kaslr.c
> >>>>>>>>> @@ -11,23 +11,293 @@
> >>>>>>>>> #include <asm/cacheflush.h>
> >>>>>>>>>
> >>>>>>>>> extern char _start[], _end[];
> >>>>>>>>> +extern void *dtb_early_va;
> >>>>>>>>> +extern phys_addr_t dtb_early_pa;
> >>>>>>>>> extern void secondary_random_target(void);
> >>>>>>>>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
> >>>>>>>>>
> >>>>>>>>> uintptr_t secondary_next_target __initdata;
> >>>>>>>>> static uintptr_t kaslr_offset __initdata;
> >>>>>>>>>
> >>>>>>>>> +static const __init u32 *get_reg_address(int root_cells,
> >>>>>>>>> + const u32 *value, u64 *result)
> >>>>>>>>> +{
> >>>>>>>>> + int cell;
> >>>>>>>>> + *result = 0;
> >>>>>>>>> +
> >>>>>>>>> + for (cell = root_cells; cell > 0; --cell)
> >>>>>>>>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
> >>>>>>>>> +
> >>>>>>>>> + return value;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
> >>>>>>>>> + int *size_cell)
> >>>>>>>>> +{
> >>>>>>>>> + int node = fdt_path_offset(dtb_early_va, path);
> >>>>>>>>> + fdt64_t *prop;
> >>>>>>>>> +
> >>>>>>>>> + if (node < 0)
> >>>>>>>>> + return -EINVAL;
> >>>>>>>>> +
> >>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
> >>>>>>>>> + if (!prop)
> >>>>>>>>> + return -EINVAL;
> >>>>>>>>> + *addr_cell = fdt32_to_cpu(*prop);
> >>>>>>>>> +
> >>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
> >>>>>>>>> + if (!prop)
> >>>>>>>>> + return -EINVAL;
> >>>>>>>>> + *size_cell = fdt32_to_cpu(*prop);
> >>>>>>>>> +
> >>>>>>>>> + return node;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
> >>>>>>>>> + uintptr_t *mem_size)
> >>>>>>>>> +{
> >>>>>>>>> + int node, root, addr_cells, size_cells;
> >>>>>>>>> + u64 base, size;
> >>>>>>>>> +
> >>>>>>>>> + /* Get root node's address cells and size cells. */
> >>>>>>>>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
> >>>>>>>>> + if (root < 0)
> >>>>>>>>> + return;
> >>>>>>>>> +
> >>>>>>>>> + /* Get memory base address and size. */
> >>>>>>>>> + fdt_for_each_subnode(node, dtb_early_va, root) {
> >>>>>>>>> + const char *dev_type;
> >>>>>>>>> + const u32 *reg;
> >>>>>>>>> +
> >>>>>>>>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
> >>>>>>>>> + if (!dev_type)
> >>>>>>>>> + continue;
> >>>>>>>>> +
> >>>>>>>>> + if (!strcmp(dev_type, "memory")) {
> >>>>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>>>>>>>> + if (!reg)
> >>>>>>>>> + return;
> >>>>>>>>> +
> >>>>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
> >>>>>>>>> + reg = get_reg_address(size_cells, reg, &size);
> >>>>>>>>> +
> >>>>>>>>> + *mem_start = base;
> >>>>>>>>> + *mem_size = size;
> >>>>>>>>> +
> >>>>>>>>> + break;
> >>>>>>>>> + }
> >>>>>>>>> + }
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +/* Return a default seed if there is no HW generator. */
> >>>>>>>>> +static u64 kaslr_default_seed = ULL(-1);
> >>>>>>>>> +static __init u64 kaslr_get_seed(void)
> >>>>>>>>> +{
> >>>>>>>>> + int node, len;
> >>>>>>>>> + fdt64_t *prop;
> >>>>>>>>> + u64 ret;
> >>>>>>>>> +
> >>>>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>>>>>>>> + if (node < 0)
> >>>>>>>>> + return kaslr_default_seed++;
> >>>>>>>>> +
> >>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
> >>>>>>>>> + if (!prop || len != sizeof(u64))
> >>>>>>>>> + return kaslr_default_seed++;
> >>>>>>>>> +
> >>>>>>>>> + ret = fdt64_to_cpu(*prop);
> >>>>>>>>> +
> >>>>>>>>> + /* Re-write to zero for checking whether get seed at second time */
> >>>>>>>>> + *prop = 0;
> >>>>>>>>> +
> >>>>>>>>> + return ret;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
> >>>>>>>>> + uintptr_t e2)
> >>>>>>>>> +{
> >>>>>>>>> + return e1 >= s2 && e2 >= s1;
> >>>>>>>>> +}
> >>>>>>>>
> >>>>>>>> Inline this function or use a macro maybe.
> >>>>>>>
> >>>>>>> Yes, sure. Thanks.
> >>>>>>>
> >>>>>>>>
> >>>>>>>>> +
> >>>>>>>>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
> >>>>>>>>> + uintptr_t end_addr)
> >>>>>>>>> +{
> >>>>>>>>> + int node, rsv_mem, addr_cells, size_cells;
> >>>>>>>>> +
> >>>>>>>>> + /* Get the reserved-memory node. */
> >>>>>>>>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
> >>>>>>>>> + &addr_cells,
> >>>>>>>>> + &size_cells);
> >>>>>>>>> + if (rsv_mem < 0)
> >>>>>>>>> + return false;
> >>>>>>>>> +
> >>>>>>>>> + /* Get memory base address and size. */
> >>>>>>>>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
> >>>>>>>>> + uint64_t base, size;
> >>>>>>>>> + const uint32_t *reg;
> >>>>>>>>> +
> >>>>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
> >>>>>>>>> + if (!reg)
> >>>>>>>>> + return 0;
> >>>>>>>>> +
> >>>>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
> >>>>>>>>> + reg = get_reg_address(size_cells, reg, &size);
> >>>>>>>>> +
> >>>>>>>>> + if (is_overlap(start_addr, end_addr, base, base + size))
> >>>>>>>>> + return true;
> >>>>>>>>> + }
> >>>>>>>>> +
> >>>>>>>>> + return false;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
> >>>>>>>>> +{
> >>>>>>>>> + int node;
> >>>>>>>>> + uintptr_t initrd_start, initrd_end;
> >>>>>>>>> + fdt64_t *prop;
> >>>>>>>>> +
> >>>>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
> >>>>>>>>> + if (node < 0)
> >>>>>>>>> + return false;
> >>>>>>>>> +
> >>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
> >>>>>>>>> + if (!prop)
> >>>>>>>>> + return false;
> >>>>>>>>> +
> >>>>>>>>> + initrd_start = fdt64_to_cpu(*prop);
> >>>>>>>>> +
> >>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
> >>>>>>>>> + if (!prop)
> >>>>>>>>> + return false;
> >>>>>>>>> +
> >>>>>>>>> + initrd_end = fdt64_to_cpu(*prop);
> >>>>>>>>> +
> >>>>>>>>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
> >>>>>>>>> +{
> >>>>>>>>> + uintptr_t dtb_start = dtb_early_pa;
> >>>>>>>>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
> >>>>>>>>> +
> >>>>>>>>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
> >>>>>>>>> + uintptr_t end_addr)
> >>>>>>>>> +{
> >>>>>>>>> + if (is_overlap_dtb(start_addr, end_addr))
> >>>>>>>>> + return true;
> >>>>>>>>> +
> >>>>>>>>> + if (is_overlap_initrd(start_addr, end_addr))
> >>>>>>>>> + return true;
> >>>>>>>>> +
> >>>>>>>>> + if (is_overlap_reserved_mem(start_addr, end_addr))
> >>>>>>>>> + return true;
> >>>>>>>>> +
> >>>>>>>>> + return false;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static inline __init unsigned long get_legal_offset(int random_index,
> >>>>>>>>> + int max_index,
> >>>>>>>>> + uintptr_t mem_start,
> >>>>>>>>> + uintptr_t kernel_size)
> >>>>>>>>> +{
> >>>>>>>>> + uintptr_t start_addr, end_addr;
> >>>>>>>>> + int idx, stop_idx;
> >>>>>>>>> +
> >>>>>>>>> + idx = stop_idx = random_index;
> >>>>>>>>> +
> >>>>>>>>> + do {
> >>>>>>>>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
> >>>>>>>>> + end_addr = start_addr + kernel_size;
> >>>>>>>>> +
> >>>>>>>>> + /* Check overlap to other regions. */
> >>>>>>>>> + if (!has_regions_overlapping(start_addr, end_addr))
> >>>>>>>>> + return idx * SZ_2M + kernel_size;
> >>>>>>>>> +
> >>>>>>>>> + if (idx-- < 0)
> >>>>>>>>> + idx = max_index;
> >>>>>>>>
> >>>>>>>> Isn't the fallback to max_index a security breach ? Because at some
> >>>>>>>> point, the kernel will be loaded at this specific address.
> >>>>>>>
> >>>>>>> The max_index is the maximum safe index for destination of new kernel
> >>>>>>> image. Could you give more explain here?
> >>>>>>>
> >>>>>>
> >>>>>> But max_index is not random at all. I really don't know if that's a
> >>>>>> problem, I just found intriguing the fact the kernel could be loaded at
> >>>>>> some specific location. Would it be more secure, instead of picking
> >>>>>> max_index as fallback when reaching 0, to pick another random number
> >>>>>> between random_index and max_index ?
> >>>>>
> >>>>> ok, I can get your point. The original idea here is that we get a
> >>>>> random index first, then we decrease the index to retry to find a good
> >>>>> place if there are overlapping with other regions. A bit like the ring
> >>>>> buffer, the end of index traversing is not zero, but the random_index
> >>>>> - 1, we might consider it as continuity, so we don't know where is the
> >>>>> end point because the start point is random, whether we stop at zero
> >>>>> or random_index - 1.
> >>>>>
> >>>>> Pick another random number is more secure when occurring overlapping,
> >>>>> but I a little bit worry that it would take very long time to retry
> >>>>> many times in the worst case. for example, there is just only one
> >>>>> index could fit kernel image in (except for original location). In the
> >>>>> meantime, we don't need to wait the index being decreased to zero,
> >>>>> because it seems to me that they are the same to stop at zero or
> >>>>> random_index - 1, so if we decide to re-calculate a new random number,
> >>>>> maybe we could remove the index decreasing here.
> >>>>
> >>>> But you're right that it could take some time before converging to a
> >>>> "good" index. Maybe we could restrict the index range to indexes that we
> >>>> know for sure will be good ?
> >>>>
> >>>
> >>> Yes, it would be good for ensuring that we only need to get the random
> >>> number just once, but there are some points need to be discussed. The
> >>> first one is that we couldn't dynamically allocate a memory space at
> >>> that moment, because the memblock is not ready, so we might need to
> >>> declare a enough big array at static time to collect all good indexes.
> >>> Maybe CONFIG_MAXPHYSMEM_2GB and CONFIG_MAXPHYSMEM_128GB could be used
> >>> to decide the number of elements of this array. The second one is that
> >>> we always need to take the time to traverse the whole memory and check
> >>> the overlapping for all indexes no matter what the cases are. I'm not
> >>> sure whether it is good because this way increases the time and space
> >>> cost, but it would be more secure. Do you have any idea?
> >>>
> >>
> >> What about simply finding the biggest range of contiguous non-reserved
> >> memory and getting an index from there ?
> >
> > This needs something like mentioned above, we need a big enough array
> > to collect these index of the biggest range, and check all indexes
> > whether they are safe, and it would limit and reduce the random range
> > of we could use.
>
> You just have to get the min and max indexes of the biggest range, no
> need to store all indexes. And the vast majority of the usable memory

Oh, yes, all indexes in this biggest region are good for the kernel image.

> will be in this biggest range, so it won't reduce the random range.

Actually, what I meant by reducing the random range is this: assume there
are three reserved memory regions; then we could get at most four candidate
regions for the destination of the new kernel image. If all four of these
regions could fit the kernel image, but we only consider putting the kernel
image into the biggest one of them, then we would waste the other three
regions for randomization. Maybe the original implementation, which
considers the whole memory, would be more secure and cover a wider range?
It seems to me that the original worry was that the kernel would be loaded
at the specific address given by max_index, but actually, max_index is just
one candidate among all available indexes during the count-down; the loop
would continue to count down because the condition "idx != stop_idx" won't
be true, so we can start to find a good place from a random address on
every boot.

>
> > On original way, the value of max_index won't be the
> > end of traversing index, it would continue to decrease the index to
> > find a good place until the index becoming random_offset again, so
> > kernel doesn't be loaded to the specific location which max_index
> > specify to, it seems to me that there isn't the worry of you
> > mentioned.
> >
> >>
> >>>
> >>>> Alex
> >>>>
> >>>>>
> >>>>>>
> >>>>>> Alex
> >>>>>>
> >>>>>>>>
> >>>>>>>>> +
> >>>>>>>>> + } while (idx != stop_idx);
> >>>>>>>>> +
> >>>>>>>>> + return 0;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
> >>>>>>>>> +{
> >>>>>>>>> + size_t i;
> >>>>>>>>> + uintptr_t *ptr = (uintptr_t *) area;
> >>>>>>>>> +
> >>>>>>>>> + for (i = 0; i < size / sizeof(hash); i++) {
> >>>>>>>>> + /* Rotate by odd number of bits and XOR. */
> >>>>>>>>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
> >>>>>>>>> + hash ^= ptr[i];
> >>>>>>>>> + }
> >>>>>>>>> +
> >>>>>>>>> + return hash;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
> >>>>>>>>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
> >>>>>>>>> +{
> >>>>>>>>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
> >>>>>>>>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
> >>>>>>>>> + int index;
> >>>>>>>>> + u64 random = 0;
> >>>>>>>>> + cycles_t time_base;
> >>>>>>>>> +
> >>>>>>>>> + /* Attempt to create a simple but unpredictable starting entropy */
> >>>>>>>>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
> >>>>>>>>> +
> >>>>>>>>> + /*
> >>>>>>>>> + * If there is no HW random number generator, use timer to get a random
> >>>>>>>>> + * number. This is better than nothing but not enough secure.
> >>>>>>>>> + */
> >>>>>>>>> + time_base = get_cycles() << 32;
> >>>>>>>>> + time_base ^= get_cycles();
> >>>>>>>>> + random = rotate_xor(random, &time_base, sizeof(time_base));
> >>>>>>>>> +
> >>>>>>>>> + if (seed)
> >>>>>>>>> + random = rotate_xor(random, &seed, sizeof(seed));
> >>>>>>>>> +
> >>>>>>>>> + kaslr_get_mem_info(&mem_start, &mem_size);
> >>>>>>>>> + if (!mem_size)
> >>>>>>>>> + return 0;
> >>>>>>>>> +
> >>>>>>>>> + if (mem_start < MEM_RESERVE_START) {
> >>>>>>>>> + mem_size -= MEM_RESERVE_START - mem_start;
> >>>>>>>>> + mem_start = MEM_RESERVE_START;
> >>>>>>>>> + }
> >>>>>>>>> +
> >>>>>>>>> + /*
> >>>>>>>>> + * Limit randomization range within 1G, so we can exploit
> >>>>>>>>> + * early_pmd/early_pte during early page table phase.
> >>>>>>>>> + */
> >>>>>>>>> + random_size = min_t(u64,
> >>>>>>>>> + mem_size - (kernel_size_align * 2),
> >>>>>>>>> + SZ_1G - (kernel_size_align * 2));
> >>>>>>>>
> >>>>>>>> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
> >>>>>>>> PGDIR_SIZE macro here.
> >>>>>>>
> >>>>>>> OK, change it in the next version. Thanks.
> >>>>>>>
> >>>>>>>>
> >>>>>>>>> +
> >>>>>>>>> + /* The index of 2M block in whole avaliable region */
> >>>>>>>>> + index = random % (random_size / SZ_2M);
> >>>>>>>>> +
> >>>>>>>>> + return get_legal_offset(index, random_size / SZ_2M,
> >>>>>>>>> + mem_start, kernel_size_align);
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> uintptr_t __init kaslr_early_init(void)
> >>>>>>>>> {
> >>>>>>>>> + u64 seed;
> >>>>>>>>> uintptr_t dest_start, dest_end;
> >>>>>>>>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
> >>>>>>>>>
> >>>>>>>>> /* Get zero value at second time to avoid doing randomization again. */
> >>>>>>>>> - if (kaslr_offset)
> >>>>>>>>> + seed = kaslr_get_seed();
> >>>>>>>>> + if (!seed)
> >>>>>>>>> return 0;
> >>>>>>>>>
> >>>>>>>>> /* Get the random number for kaslr offset. */
> >>>>>>>>> - kaslr_offset = 0x10000000;
> >>>>>>>>> + kaslr_offset = get_random_offset(seed, kernel_size);
> >>>>>>>>>
> >>>>>>>>> /* Update kernel_virt_addr for get_kaslr_offset. */
> >>>>>>>>> kernel_virt_addr += kaslr_offset;
> >>>>>>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> >>>>>>>>> index 2f5b25f02b6c..34c6ecf2c599 100644
> >>>>>>>>> --- a/arch/riscv/mm/init.c
> >>>>>>>>> +++ b/arch/riscv/mm/init.c
> >>>>>>>>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
> >>>>>>>>> }
> >>>>>>>>> #endif /* CONFIG_BLK_DEV_INITRD */
> >>>>>>>>>
> >>>>>>>>> -static phys_addr_t dtb_early_pa __initdata;
> >>>>>>>>> +phys_addr_t dtb_early_pa __initdata;
> >>>>>>>>>
> >>>>>>>>> void __init setup_bootmem(void)
> >>>>>>>>> {
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> Alex

2020-04-15 08:36:59

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH RFC 4/8] riscv/kaslr: randomize the kernel image offset



On 4/13/20 10:46 PM, Zong Li wrote:
> On Sun, Apr 12, 2020 at 2:53 PM Alex Ghiti <[email protected]> wrote:
>>
>>
>>
>> On 4/11/20 4:20 AM, Zong Li wrote:
>>> On Fri, Apr 10, 2020 at 11:58 PM Alex Ghiti <[email protected]> wrote:
>>>>
>>>> Hi Zong,
>>>>
>>>> On 4/9/20 6:31 AM, Zong Li wrote:
>>>>> On Thu, Apr 9, 2020 at 1:51 PM Alex Ghiti <[email protected]> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 4/7/20 6:53 AM, Zong Li wrote:
>>>>>>> On Tue, Apr 7, 2020 at 1:11 PM Alex Ghiti <[email protected]> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 3/24/20 3:30 AM, Zong Li wrote:
>>>>>>>>> Entropy is derived from the banner and timer, it is better than nothing
>>>>>>>>> but not enough secure, so previous stage may pass entropy via the device
>>>>>>>>> tree /chosen/kaslr-seed node.
>>>>>>>>>
>>>>>>>>> We limit randomization range within 1GB, so we can exploit early page
>>>>>>>>> table to map new destination of kernel image. Additionally, the kernel
>>>>>>>>> offset need 2M alignment to ensure it's good in PMD page table.
>>>>>>>>>
>>>>>>>>> We also checks the kernel offset whether it's safe by avoiding to
>>>>>>>>> overlaps with dtb, initrd and reserved memory regions.
>>>>>>>>>
>>>>>>>>
>>>>>>>> That maybe changes the way my sv48 patchset will be implemented: I can't
>>>>>>>> get user preference (3-level or 4-level) by any means, device-tree or
>>>>>>>> kernel parameter.
>>>>>>>>
>>>>>>>> But I don't see how you could get a random offset without info from the
>>>>>>>> device tree anyway (reserved memory regions especially), so maybe I
>>>>>>>> could parse dtb for allowing the user to choose. I'll move this
>>>>>>>> discussion to the sv48 introduction.
>>>>>>>
>>>>>>> Maybe I'm a little bit misunderstanding here, but I think I got the
>>>>>>> random offset through some information by parsing dtb.
>>>>>>>
>>>>>>
>>>>>> I was just saying that I may use the dtb too in sv48 patchset to make it
>>>>>> possible for users to choose sv39 even if sv48 is supported by hardware
>>>>>> (which is not the case in my current patchset).
>>>>>>
>>>>>>>>
>>>>>>>>> Signed-off-by: Zong Li <[email protected]>
>>>>>>>>> ---
>>>>>>>>> arch/riscv/kernel/kaslr.c | 274 +++++++++++++++++++++++++++++++++++++-
>>>>>>>>> arch/riscv/mm/init.c | 2 +-
>>>>>>>>> 2 files changed, 273 insertions(+), 3 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/arch/riscv/kernel/kaslr.c b/arch/riscv/kernel/kaslr.c
>>>>>>>>> index 281b5fcca5c8..9ec2b608eb7f 100644
>>>>>>>>> --- a/arch/riscv/kernel/kaslr.c
>>>>>>>>> +++ b/arch/riscv/kernel/kaslr.c
>>>>>>>>> @@ -11,23 +11,293 @@
>>>>>>>>> #include <asm/cacheflush.h>
>>>>>>>>>
>>>>>>>>> extern char _start[], _end[];
>>>>>>>>> +extern void *dtb_early_va;
>>>>>>>>> +extern phys_addr_t dtb_early_pa;
>>>>>>>>> extern void secondary_random_target(void);
>>>>>>>>> extern void kaslr_create_page_table(uintptr_t start, uintptr_t end);
>>>>>>>>>
>>>>>>>>> uintptr_t secondary_next_target __initdata;
>>>>>>>>> static uintptr_t kaslr_offset __initdata;
>>>>>>>>>
>>>>>>>>> +static const __init u32 *get_reg_address(int root_cells,
>>>>>>>>> + const u32 *value, u64 *result)
>>>>>>>>> +{
>>>>>>>>> + int cell;
>>>>>>>>> + *result = 0;
>>>>>>>>> +
>>>>>>>>> + for (cell = root_cells; cell > 0; --cell)
>>>>>>>>> + *result = (*result << 32) + fdt32_to_cpu(*value++);
>>>>>>>>> +
>>>>>>>>> + return value;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static __init int get_node_addr_size_cells(const char *path, int *addr_cell,
>>>>>>>>> + int *size_cell)
>>>>>>>>> +{
>>>>>>>>> + int node = fdt_path_offset(dtb_early_va, path);
>>>>>>>>> + fdt64_t *prop;
>>>>>>>>> +
>>>>>>>>> + if (node < 0)
>>>>>>>>> + return -EINVAL;
>>>>>>>>> +
>>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#address-cells", NULL);
>>>>>>>>> + if (!prop)
>>>>>>>>> + return -EINVAL;
>>>>>>>>> + *addr_cell = fdt32_to_cpu(*prop);
>>>>>>>>> +
>>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "#size-cells", NULL);
>>>>>>>>> + if (!prop)
>>>>>>>>> + return -EINVAL;
>>>>>>>>> + *size_cell = fdt32_to_cpu(*prop);
>>>>>>>>> +
>>>>>>>>> + return node;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static __init void kaslr_get_mem_info(uintptr_t *mem_start,
>>>>>>>>> + uintptr_t *mem_size)
>>>>>>>>> +{
>>>>>>>>> + int node, root, addr_cells, size_cells;
>>>>>>>>> + u64 base, size;
>>>>>>>>> +
>>>>>>>>> + /* Get root node's address cells and size cells. */
>>>>>>>>> + root = get_node_addr_size_cells("/", &addr_cells, &size_cells);
>>>>>>>>> + if (root < 0)
>>>>>>>>> + return;
>>>>>>>>> +
>>>>>>>>> + /* Get memory base address and size. */
>>>>>>>>> + fdt_for_each_subnode(node, dtb_early_va, root) {
>>>>>>>>> + const char *dev_type;
>>>>>>>>> + const u32 *reg;
>>>>>>>>> +
>>>>>>>>> + dev_type = fdt_getprop(dtb_early_va, node, "device_type", NULL);
>>>>>>>>> + if (!dev_type)
>>>>>>>>> + continue;
>>>>>>>>> +
>>>>>>>>> + if (!strcmp(dev_type, "memory")) {
>>>>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>>>>>>>> + if (!reg)
>>>>>>>>> + return;
>>>>>>>>> +
>>>>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
>>>>>>>>> + reg = get_reg_address(size_cells, reg, &size);
>>>>>>>>> +
>>>>>>>>> + *mem_start = base;
>>>>>>>>> + *mem_size = size;
>>>>>>>>> +
>>>>>>>>> + break;
>>>>>>>>> + }
>>>>>>>>> + }
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +/* Return a default seed if there is no HW generator. */
>>>>>>>>> +static u64 kaslr_default_seed = ULL(-1);
>>>>>>>>> +static __init u64 kaslr_get_seed(void)
>>>>>>>>> +{
>>>>>>>>> + int node, len;
>>>>>>>>> + fdt64_t *prop;
>>>>>>>>> + u64 ret;
>>>>>>>>> +
>>>>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>>>>>>>> + if (node < 0)
>>>>>>>>> + return kaslr_default_seed++;
>>>>>>>>> +
>>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "kaslr-seed", &len);
>>>>>>>>> + if (!prop || len != sizeof(u64))
>>>>>>>>> + return kaslr_default_seed++;
>>>>>>>>> +
>>>>>>>>> + ret = fdt64_to_cpu(*prop);
>>>>>>>>> +
>>>>>>>>> + /* Re-write to zero for checking whether get seed at second time */
>>>>>>>>> + *prop = 0;
>>>>>>>>> +
>>>>>>>>> + return ret;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static __init bool is_overlap(uintptr_t s1, uintptr_t e1, uintptr_t s2,
>>>>>>>>> + uintptr_t e2)
>>>>>>>>> +{
>>>>>>>>> + return e1 >= s2 && e2 >= s1;
>>>>>>>>> +}
>>>>>>>>
>>>>>>>> Inline this function or use a macro maybe.
>>>>>>>
>>>>>>> Yes, sure. Thanks.
>>>>>>>
>>>>>>>>
>>>>>>>>> +
>>>>>>>>> +static __init bool is_overlap_reserved_mem(uintptr_t start_addr,
>>>>>>>>> + uintptr_t end_addr)
>>>>>>>>> +{
>>>>>>>>> + int node, rsv_mem, addr_cells, size_cells;
>>>>>>>>> +
>>>>>>>>> + /* Get the reserved-memory node. */
>>>>>>>>> + rsv_mem = get_node_addr_size_cells("/reserved-memory",
>>>>>>>>> + &addr_cells,
>>>>>>>>> + &size_cells);
>>>>>>>>> + if (rsv_mem < 0)
>>>>>>>>> + return false;
>>>>>>>>> +
>>>>>>>>> + /* Get memory base address and size. */
>>>>>>>>> + fdt_for_each_subnode(node, dtb_early_va, rsv_mem) {
>>>>>>>>> + uint64_t base, size;
>>>>>>>>> + const uint32_t *reg;
>>>>>>>>> +
>>>>>>>>> + reg = fdt_getprop(dtb_early_va, node, "reg", NULL);
>>>>>>>>> + if (!reg)
>>>>>>>>> + return 0;
>>>>>>>>> +
>>>>>>>>> + reg = get_reg_address(addr_cells, reg, &base);
>>>>>>>>> + reg = get_reg_address(size_cells, reg, &size);
>>>>>>>>> +
>>>>>>>>> + if (is_overlap(start_addr, end_addr, base, base + size))
>>>>>>>>> + return true;
>>>>>>>>> + }
>>>>>>>>> +
>>>>>>>>> + return false;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static __init bool is_overlap_initrd(uintptr_t start_addr, uintptr_t end_addr)
>>>>>>>>> +{
>>>>>>>>> + int node;
>>>>>>>>> + uintptr_t initrd_start, initrd_end;
>>>>>>>>> + fdt64_t *prop;
>>>>>>>>> +
>>>>>>>>> + node = fdt_path_offset(dtb_early_va, "/chosen");
>>>>>>>>> + if (node < 0)
>>>>>>>>> + return false;
>>>>>>>>> +
>>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-start", NULL);
>>>>>>>>> + if (!prop)
>>>>>>>>> + return false;
>>>>>>>>> +
>>>>>>>>> + initrd_start = fdt64_to_cpu(*prop);
>>>>>>>>> +
>>>>>>>>> + prop = fdt_getprop_w(dtb_early_va, node, "linux,initrd-end", NULL);
>>>>>>>>> + if (!prop)
>>>>>>>>> + return false;
>>>>>>>>> +
>>>>>>>>> + initrd_end = fdt64_to_cpu(*prop);
>>>>>>>>> +
>>>>>>>>> + return is_overlap(start_addr, end_addr, initrd_start, initrd_end);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static __init bool is_overlap_dtb(uintptr_t start_addr, uintptr_t end_addr)
>>>>>>>>> +{
>>>>>>>>> + uintptr_t dtb_start = dtb_early_pa;
>>>>>>>>> + uintptr_t dtb_end = dtb_start + fdt_totalsize(dtb_early_va);
>>>>>>>>> +
>>>>>>>>> + return is_overlap(start_addr, end_addr, dtb_start, dtb_end);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static __init bool has_regions_overlapping(uintptr_t start_addr,
>>>>>>>>> + uintptr_t end_addr)
>>>>>>>>> +{
>>>>>>>>> + if (is_overlap_dtb(start_addr, end_addr))
>>>>>>>>> + return true;
>>>>>>>>> +
>>>>>>>>> + if (is_overlap_initrd(start_addr, end_addr))
>>>>>>>>> + return true;
>>>>>>>>> +
>>>>>>>>> + if (is_overlap_reserved_mem(start_addr, end_addr))
>>>>>>>>> + return true;
>>>>>>>>> +
>>>>>>>>> + return false;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline __init unsigned long get_legal_offset(int random_index,
>>>>>>>>> + int max_index,
>>>>>>>>> + uintptr_t mem_start,
>>>>>>>>> + uintptr_t kernel_size)
>>>>>>>>> +{
>>>>>>>>> + uintptr_t start_addr, end_addr;
>>>>>>>>> + int idx, stop_idx;
>>>>>>>>> +
>>>>>>>>> + idx = stop_idx = random_index;
>>>>>>>>> +
>>>>>>>>> + do {
>>>>>>>>> + start_addr = mem_start + idx * SZ_2M + kernel_size;
>>>>>>>>> + end_addr = start_addr + kernel_size;
>>>>>>>>> +
>>>>>>>>> + /* Check overlap to other regions. */
>>>>>>>>> + if (!has_regions_overlapping(start_addr, end_addr))
>>>>>>>>> + return idx * SZ_2M + kernel_size;
>>>>>>>>> +
>>>>>>>>> + if (idx-- < 0)
>>>>>>>>> + idx = max_index;
>>>>>>>>
>>>>>>>> Isn't the fallback to max_index a security breach ? Because at some
>>>>>>>> point, the kernel will be loaded at this specific address.
>>>>>>>
>>>>>>> The max_index is the maximum safe index for destination of new kernel
>>>>>>> image. Could you give more explain here?
>>>>>>>
>>>>>>
>>>>>> But max_index is not random at all. I really don't know if that's a
>>>>>> problem, I just found intriguing the fact the kernel could be loaded at
>>>>>> some specific location. Would it be more secure, instead of picking
>>>>>> max_index as fallback when reaching 0, to pick another random number
>>>>>> between random_index and max_index ?
>>>>>
>>>>> ok, I can get your point. The original idea here is that we get a
>>>>> random index first, then we decrease the index to retry to find a good
>>>>> place if there are overlapping with other regions. A bit like the ring
>>>>> buffer, the end of index traversing is not zero, but the random_index
>>>>> - 1, we might consider it as continuity, so we don't know where is the
>>>>> end point because the start point is random, whether we stop at zero
>>>>> or random_index - 1.
>>>>>
>>>>> Pick another random number is more secure when occurring overlapping,
>>>>> but I a little bit worry that it would take very long time to retry
>>>>> many times in the worst case. for example, there is just only one
>>>>> index could fit kernel image in (except for original location). In the
>>>>> meantime, we don't need to wait the index being decreased to zero,
>>>>> because it seems to me that they are the same to stop at zero or
>>>>> random_index - 1, so if we decide to re-calculate a new random number,
>>>>> maybe we could remove the index decreasing here.
>>>>
>>>> But you're right that it could take some time before converging to a
>>>> "good" index. Maybe we could restrict the index range to indexes that we
>>>> know for sure will be good ?
>>>>
>>>
>>> Yes, it would be good for ensuring that we only need to get the random
>>> number just once, but there are some points need to be discussed. The
>>> first one is that we couldn't dynamically allocate a memory space at
>>> that moment, because the memblock is not ready, so we might need to
>>> declare a enough big array at static time to collect all good indexes.
>>> Maybe CONFIG_MAXPHYSMEM_2GB and CONFIG_MAXPHYSMEM_128GB could be used
>>> to decide the number of elements of this array. The second one is that
>>> we always need to take the time to traverse the whole memory and check
>>> the overlapping for all indexes no matter what the cases are. I'm not
>>> sure whether it is good because this way increases the time and space
>>> cost, but it would be more secure. Do you have any idea?
>>>
>>
>> What about simply finding the biggest range of contiguous non-reserved
>> memory and getting an index from there ?
>
> This needs something like mentioned above, we need a big enough array
> to collect these index of the biggest range, and check all indexes
> whether they are safe, and it would limit and reduce the random range
> of we could use.

You just have to get the min and max indexes of the biggest range, no
need to store all indexes. And the vast majority of the usable memory
will be in this biggest range, so it won't reduce the random range.

> On original way, the value of max_index won't be the
> end of traversing index, it would continue to decrease the index to
> find a good place until the index becoming random_offset again, so
> kernel doesn't be loaded to the specific location which max_index
> specify to, it seems to me that there isn't the worry of you
> mentioned.
>
>>
>>>
>>>> Alex
>>>>
>>>>>
>>>>>>
>>>>>> Alex
>>>>>>
>>>>>>>>
>>>>>>>>> +
>>>>>>>>> + } while (idx != stop_idx);
>>>>>>>>> +
>>>>>>>>> + return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline __init u64 rotate_xor(u64 hash, const void *area, size_t size)
>>>>>>>>> +{
>>>>>>>>> + size_t i;
>>>>>>>>> + uintptr_t *ptr = (uintptr_t *) area;
>>>>>>>>> +
>>>>>>>>> + for (i = 0; i < size / sizeof(hash); i++) {
>>>>>>>>> + /* Rotate by odd number of bits and XOR. */
>>>>>>>>> + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
>>>>>>>>> + hash ^= ptr[i];
>>>>>>>>> + }
>>>>>>>>> +
>>>>>>>>> + return hash;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +#define MEM_RESERVE_START __pa(PAGE_OFFSET)
>>>>>>>>> +static __init uintptr_t get_random_offset(u64 seed, uintptr_t kernel_size)
>>>>>>>>> +{
>>>>>>>>> + uintptr_t mem_start = 0, mem_size= 0, random_size;
>>>>>>>>> + uintptr_t kernel_size_align = round_up(kernel_size, SZ_2M);
>>>>>>>>> + int index;
>>>>>>>>> + u64 random = 0;
>>>>>>>>> + cycles_t time_base;
>>>>>>>>> +
>>>>>>>>> + /* Attempt to create a simple but unpredictable starting entropy */
>>>>>>>>> + random = rotate_xor(random, linux_banner, strlen(linux_banner));
>>>>>>>>> +
>>>>>>>>> + /*
>>>>>>>>> + * If there is no HW random number generator, use timer to get a random
>>>>>>>>> + * number. This is better than nothing but not enough secure.
>>>>>>>>> + */
>>>>>>>>> + time_base = get_cycles() << 32;
>>>>>>>>> + time_base ^= get_cycles();
>>>>>>>>> + random = rotate_xor(random, &time_base, sizeof(time_base));
>>>>>>>>> +
>>>>>>>>> + if (seed)
>>>>>>>>> + random = rotate_xor(random, &seed, sizeof(seed));
>>>>>>>>> +
>>>>>>>>> + kaslr_get_mem_info(&mem_start, &mem_size);
>>>>>>>>> + if (!mem_size)
>>>>>>>>> + return 0;
>>>>>>>>> +
>>>>>>>>> + if (mem_start < MEM_RESERVE_START) {
>>>>>>>>> + mem_size -= MEM_RESERVE_START - mem_start;
>>>>>>>>> + mem_start = MEM_RESERVE_START;
>>>>>>>>> + }
>>>>>>>>> +
>>>>>>>>> + /*
>>>>>>>>> + * Limit randomization range within 1G, so we can exploit
>>>>>>>>> + * early_pmd/early_pte during early page table phase.
>>>>>>>>> + */
>>>>>>>>> + random_size = min_t(u64,
>>>>>>>>> + mem_size - (kernel_size_align * 2),
>>>>>>>>> + SZ_1G - (kernel_size_align * 2));
>>>>>>>>
>>>>>>>> pgdir size is 30 bits in sv39, but it's 39 bits in sv48, you should use
>>>>>>>> PGDIR_SIZE macro here.
>>>>>>>
>>>>>>> OK, change it in the next version. Thanks.
>>>>>>>
>>>>>>>>
>>>>>>>>> +
>>>>>>>>> + /* The index of 2M block in whole avaliable region */
>>>>>>>>> + index = random % (random_size / SZ_2M);
>>>>>>>>> +
>>>>>>>>> + return get_legal_offset(index, random_size / SZ_2M,
>>>>>>>>> + mem_start, kernel_size_align);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> uintptr_t __init kaslr_early_init(void)
>>>>>>>>> {
>>>>>>>>> + u64 seed;
>>>>>>>>> uintptr_t dest_start, dest_end;
>>>>>>>>> uintptr_t kernel_size = (uintptr_t) _end - (uintptr_t) _start;
>>>>>>>>>
>>>>>>>>> /* Get zero value at second time to avoid doing randomization again. */
>>>>>>>>> - if (kaslr_offset)
>>>>>>>>> + seed = kaslr_get_seed();
>>>>>>>>> + if (!seed)
>>>>>>>>> return 0;
>>>>>>>>>
>>>>>>>>> /* Get the random number for kaslr offset. */
>>>>>>>>> - kaslr_offset = 0x10000000;
>>>>>>>>> + kaslr_offset = get_random_offset(seed, kernel_size);
>>>>>>>>>
>>>>>>>>> /* Update kernel_virt_addr for get_kaslr_offset. */
>>>>>>>>> kernel_virt_addr += kaslr_offset;
>>>>>>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>>>>>>>> index 2f5b25f02b6c..34c6ecf2c599 100644
>>>>>>>>> --- a/arch/riscv/mm/init.c
>>>>>>>>> +++ b/arch/riscv/mm/init.c
>>>>>>>>> @@ -125,7 +125,7 @@ static void __init setup_initrd(void)
>>>>>>>>> }
>>>>>>>>> #endif /* CONFIG_BLK_DEV_INITRD */
>>>>>>>>>
>>>>>>>>> -static phys_addr_t dtb_early_pa __initdata;
>>>>>>>>> +phys_addr_t dtb_early_pa __initdata;
>>>>>>>>>
>>>>>>>>> void __init setup_bootmem(void)
>>>>>>>>> {
>>>>>>>>>
>>>>>>>>
>>>>>>>> Alex