The main purpose of this patch series is changing the kernel mapping
permissions, making sure that code is not writable, data is not executable,
and read-only data is neither writable nor executable.
This patch series also supports the relevant implementations such as
ARCH_HAS_SET_MEMORY, ARCH_HAS_SET_DIRECT_MAP,
ARCH_SUPPORTS_DEBUG_PAGEALLOC and DEBUG_WX.
Changes in v3:
- Fix build error on nommu configuration. We already support nommu on
RISC-V, so we should consider nommu case and test not only rv32/64,
but also nommu.
Changes in v2:
- Use _data to specify the start of data section with write permission.
- Change ftrace patch text implementation.
- Separate DEBUG_WX patch to another patchset.
Zong Li (9):
riscv: add ARCH_HAS_SET_MEMORY support
riscv: add ARCH_HAS_SET_DIRECT_MAP support
riscv: add ARCH_SUPPORTS_DEBUG_PAGEALLOC support
riscv: move exception table immediately after RO_DATA
riscv: add alignment for text, rodata and data sections
riscv: add STRICT_KERNEL_RWX support
riscv: add macro to get instruction length
riscv: introduce interfaces to patch kernel code
riscv: patch code by fixmap mapping
arch/riscv/Kconfig | 6 +
arch/riscv/include/asm/bug.h | 8 ++
arch/riscv/include/asm/fixmap.h | 2 +
arch/riscv/include/asm/patch.h | 12 ++
arch/riscv/include/asm/set_memory.h | 48 +++++++
arch/riscv/kernel/Makefile | 4 +-
arch/riscv/kernel/ftrace.c | 13 +-
arch/riscv/kernel/patch.c | 120 ++++++++++++++++++
arch/riscv/kernel/traps.c | 3 +-
arch/riscv/kernel/vmlinux.lds.S | 11 +-
arch/riscv/mm/Makefile | 2 +-
arch/riscv/mm/init.c | 44 +++++++
arch/riscv/mm/pageattr.c | 187 ++++++++++++++++++++++++++++
13 files changed, 445 insertions(+), 15 deletions(-)
create mode 100644 arch/riscv/include/asm/patch.h
create mode 100644 arch/riscv/include/asm/set_memory.h
create mode 100644 arch/riscv/kernel/patch.c
create mode 100644 arch/riscv/mm/pageattr.c
--
2.25.1
ARCH_SUPPORTS_DEBUG_PAGEALLOC provides a hook to map and unmap
pages for debugging purposes. Implement the __kernel_map_pages
functions to fill the poison pattern.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/Kconfig | 3 +++
arch/riscv/mm/pageattr.c | 13 +++++++++++++
2 files changed, 16 insertions(+)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 71fabb0ffdbe..54437d7662a5 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -133,6 +133,9 @@ config ARCH_SELECT_MEMORY_MODEL
config ARCH_WANT_GENERAL_HUGETLB
def_bool y
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ def_bool y
+
config SYS_SUPPORTS_HUGETLBFS
def_bool y
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 7be6cd67e2ef..728759eb530a 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -172,3 +172,16 @@ int set_direct_map_default_noflush(struct page *page)
return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (!debug_pagealloc_enabled())
+ return;
+
+ if (enable)
+ __set_memory((unsigned long)page_address(page), numpages,
+ __pgprot(_PAGE_PRESENT), __pgprot(0));
+ else
+ __set_memory((unsigned long)page_address(page), numpages,
+ __pgprot(0), __pgprot(_PAGE_PRESENT));
+}
--
2.25.1
Move EXCEPTION_TABLE immediately after RO_DATA. This makes it easy to set
the attributes of the sections which should be read-only all at once.
Add _data to specify the start of data section with write permission.
This patch is prepared for STRICT_KERNEL_RWX support.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/kernel/vmlinux.lds.S | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 1e0193ded420..02e948b620af 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -58,6 +58,10 @@ SECTIONS
*(.srodata*)
}
+ EXCEPTION_TABLE(0x10)
+
+ _data = .;
+
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
.sdata : {
__global_pointer$ = . + 0x800;
@@ -69,8 +73,6 @@ SECTIONS
BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
- EXCEPTION_TABLE(0x10)
-
.rel.dyn : {
*(.rel.dyn*)
}
--
2.25.1
The kernel mapping code tries to optimize its mapping by using a bigger
mapping size. In rv64, it tries to use PMD_SIZE, and in rv32 it tries to
use PGDIR_SIZE. To ensure that the start addresses of these sections can
fit the mapping entry size, make them align to the biggest alignment.
Define a macro SECTION_ALIGN because the HPAGE_SIZE or PMD_SIZE, etc.,
are invisible in linker script.
This patch is prepared for STRICT_KERNEL_RWX support.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/include/asm/set_memory.h | 13 +++++++++++++
arch/riscv/kernel/vmlinux.lds.S | 5 ++++-
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index 620d81c372d9..4c5bae7ca01c 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -6,6 +6,7 @@
#ifndef _ASM_RISCV_SET_MEMORY_H
#define _ASM_RISCV_SET_MEMORY_H
+#ifndef __ASSEMBLY__
/*
* Functions to change memory attributes.
*/
@@ -24,4 +25,16 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_ARCH_HAS_STRICT_KERNEL_RWX
+#ifdef CONFIG_64BIT
+#define SECTION_ALIGN (1 << 21)
+#else
+#define SECTION_ALIGN (1 << 22)
+#endif
+#else /* !CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */
+#define SECTION_ALIGN L1_CACHE_BYTES
+#endif /* CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */
+
#endif /* _ASM_RISCV_SET_MEMORY_H */
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 02e948b620af..ef87deea0350 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -9,6 +9,7 @@
#include <asm/page.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
+#include <asm/set_memory.h>
OUTPUT_ARCH(riscv)
ENTRY(_start)
@@ -36,6 +37,7 @@ SECTIONS
PERCPU_SECTION(L1_CACHE_BYTES)
__init_end = .;
+ . = ALIGN(SECTION_ALIGN);
.text : {
_text = .;
_stext = .;
@@ -53,13 +55,14 @@ SECTIONS
/* Start of data section */
_sdata = .;
- RO_DATA(L1_CACHE_BYTES)
+ RO_DATA(SECTION_ALIGN)
.srodata : {
*(.srodata*)
}
EXCEPTION_TABLE(0x10)
+ . = ALIGN(SECTION_ALIGN);
_data = .;
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
--
2.25.1
Extract the calculation of instruction length for common use.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/include/asm/bug.h | 8 ++++++++
arch/riscv/kernel/traps.c | 3 ++-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h
index 75604fec1b1b..d6f1ec08d97b 100644
--- a/arch/riscv/include/asm/bug.h
+++ b/arch/riscv/include/asm/bug.h
@@ -19,6 +19,14 @@
#define __BUG_INSN_32 _UL(0x00100073) /* ebreak */
#define __BUG_INSN_16 _UL(0x9002) /* c.ebreak */
+#define GET_INSN_LENGTH(insn) \
+({ \
+ unsigned long __len; \
+ __len = ((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? \
+ 4UL : 2UL; \
+ __len; \
+})
+
typedef u32 bug_insn_t;
#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index ffb3d94bf0cc..a4d136355f78 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -118,7 +118,8 @@ static inline unsigned long get_break_insn_length(unsigned long pc)
if (probe_kernel_address((bug_insn_t *)pc, insn))
return 0;
- return (((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? 4UL : 2UL);
+
+ return GET_INSN_LENGTH(insn);
}
asmlinkage __visible void do_trap_break(struct pt_regs *regs)
--
2.25.1
This commit makes the text section non-writable, the rodata section
read-only, and the data section non-executable.
The init section is also changed to be non-executable.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/set_memory.h | 8 ++++++
arch/riscv/mm/init.c | 44 +++++++++++++++++++++++++++++
3 files changed, 53 insertions(+)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 54437d7662a5..5fbae6380b32 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -62,6 +62,7 @@ config RISCV
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
+ select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select SPARSEMEM_STATIC if 32BIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index 4c5bae7ca01c..c38df4771c09 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -22,6 +22,14 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void set_kernel_text_ro(void);
+void set_kernel_text_rw(void);
+#else
+static inline void set_kernel_text_ro(void) { }
+static inline void set_kernel_text_rw(void) { }
+#endif
+
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index fab855963c73..b55be44ff9bd 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -12,6 +12,7 @@
#include <linux/sizes.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
+#include <linux/set_memory.h>
#include <asm/fixmap.h>
#include <asm/tlbflush.h>
@@ -477,6 +478,17 @@ static void __init setup_vm_final(void)
csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
local_flush_tlb_all();
}
+
+void free_initmem(void)
+{
+ unsigned long init_begin = (unsigned long)__init_begin;
+ unsigned long init_end = (unsigned long)__init_end;
+
+ /* Make the region as non-execuatble. */
+ set_memory_nx(init_begin, (init_end - init_begin) >> PAGE_SHIFT);
+ free_initmem_default(POISON_FREE_INITMEM);
+}
+
#else
asmlinkage void __init setup_vm(uintptr_t dtb_pa)
{
@@ -488,6 +500,38 @@ static inline void setup_vm_final(void)
}
#endif /* CONFIG_MMU */
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void set_kernel_text_rw(void)
+{
+ unsigned long text_start = (unsigned long)_text;
+ unsigned long text_end = (unsigned long)_etext;
+
+ set_memory_rw(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long text_start = (unsigned long)_text;
+ unsigned long text_end = (unsigned long)_etext;
+
+ set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ unsigned long text_start = (unsigned long)_text;
+ unsigned long text_end = (unsigned long)_etext;
+ unsigned long rodata_start = (unsigned long)__start_rodata;
+ unsigned long data_start = (unsigned long)_data;
+ unsigned long max_low = (unsigned long)(__va(PFN_PHYS(max_low_pfn)));
+
+ set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT);
+ set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(data_start, (max_low - data_start) >> PAGE_SHIFT);
+}
+#endif
+
void __init paging_init(void)
{
setup_vm_final();
--
2.25.1
On strict kernel memory permission, ftrace has to change the permission
of the text section for dynamically patching the instructions. Use
riscv_patch_text_nosync() to patch code instead of probe_kernel_write.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/kernel/ftrace.c | 13 ++++---------
1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index c40fdcdeb950..ce69b34ff55d 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -8,6 +8,7 @@
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
+#include <asm/patch.h>
#ifdef CONFIG_DYNAMIC_FTRACE
static int ftrace_check_current_call(unsigned long hook_pos,
@@ -46,20 +47,14 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
{
unsigned int call[2];
unsigned int nops[2] = {NOP4, NOP4};
- int ret = 0;
make_call(hook_pos, target, call);
- /* replace the auipc-jalr pair at once */
- ret = probe_kernel_write((void *)hook_pos, enable ? call : nops,
- MCOUNT_INSN_SIZE);
- /* return must be -EPERM on write error */
- if (ret)
+ /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */
+ if (riscv_patch_text_nosync
+ ((void *)hook_pos, enable ? call : nops, MCOUNT_INSN_SIZE))
return -EPERM;
- smp_mb();
- flush_icache_range((void *)hook_pos, (void *)hook_pos + MCOUNT_INSN_SIZE);
-
return 0;
}
--
2.25.1
On strict kernel memory permission, we couldn't patch code without
writable permission. Preserve two holes in fixmap area, so we can map
the kernel code temporarily to fixmap area, then patch the instructions.
We need two pages here because we support compressed instructions, so an
instruction might be aligned to only 2 bytes. When patching a 32-bit
length instruction that has 2-byte alignment, it may span two pages.
Introduce two interfaces to patch kernel code:
riscv_patch_text_nosync:
- patch code without synchronization, it's caller's responsibility to
synchronize all CPUs if needed.
riscv_patch_text:
- patch code and always synchronize with stop_machine()
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/include/asm/fixmap.h | 2 +
arch/riscv/include/asm/patch.h | 12 ++++
arch/riscv/kernel/Makefile | 4 +-
arch/riscv/kernel/patch.c | 120 ++++++++++++++++++++++++++++++++
4 files changed, 137 insertions(+), 1 deletion(-)
create mode 100644 arch/riscv/include/asm/patch.h
create mode 100644 arch/riscv/kernel/patch.c
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index 42d2c42f3cc9..2368d49eb4ef 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -27,6 +27,8 @@ enum fixed_addresses {
FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
FIX_PTE,
FIX_PMD,
+ FIX_TEXT_POKE1,
+ FIX_TEXT_POKE0,
FIX_EARLYCON_MEM_BASE,
__end_of_fixed_addresses
};
diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h
new file mode 100644
index 000000000000..b5918a6e0615
--- /dev/null
+++ b/arch/riscv/include/asm/patch.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#ifndef _ASM_RISCV_PATCH_H
+#define _ASM_RISCV_PATCH_H
+
+int riscv_patch_text_nosync(void *addr, const void *insns, size_t len);
+int riscv_patch_text(void *addr, u32 insn);
+
+#endif /* _ASM_RISCV_PATCH_H */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index f40205cb9a22..d189bd3d8501 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -4,7 +4,8 @@
#
ifdef CONFIG_FTRACE
-CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_patch.o = -pg
endif
extra-y += head.o
@@ -26,6 +27,7 @@ obj-y += traps.o
obj-y += riscv_ksyms.o
obj-y += stacktrace.o
obj-y += cacheinfo.o
+obj-y += patch.o
obj-$(CONFIG_MMU) += vdso.o vdso/
obj-$(CONFIG_RISCV_M_MODE) += clint.o
diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c
new file mode 100644
index 000000000000..8a4fc65ee022
--- /dev/null
+++ b/arch/riscv/kernel/patch.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/stop_machine.h>
+#include <asm/kprobes.h>
+#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
+
+struct riscv_insn_patch {
+ void *addr;
+ u32 insn;
+ atomic_t cpu_count;
+};
+
+#ifdef CONFIG_MMU
+static DEFINE_RAW_SPINLOCK(patch_lock);
+
+static void __kprobes *patch_map(void *addr, int fixmap)
+{
+ uintptr_t uintaddr = (uintptr_t) addr;
+ struct page *page;
+
+ if (core_kernel_text(uintaddr))
+ page = phys_to_page(__pa_symbol(addr));
+ else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ page = vmalloc_to_page(addr);
+ else
+ return addr;
+
+ BUG_ON(!page);
+
+ return (void *)set_fixmap_offset(fixmap, page_to_phys(page) +
+ (uintaddr & ~PAGE_MASK));
+}
+
+static void __kprobes patch_unmap(int fixmap)
+{
+ clear_fixmap(fixmap);
+}
+
+static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
+{
+ void *waddr = addr;
+ bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
+ unsigned long flags = 0;
+ int ret;
+
+ raw_spin_lock_irqsave(&patch_lock, flags);
+
+ if (across_pages)
+ patch_map(addr + len, FIX_TEXT_POKE1);
+
+ waddr = patch_map(addr, FIX_TEXT_POKE0);
+
+ ret = probe_kernel_write(waddr, insn, len);
+
+ patch_unmap(FIX_TEXT_POKE0);
+
+ if (across_pages)
+ patch_unmap(FIX_TEXT_POKE1);
+
+ raw_spin_unlock_irqrestore(&patch_lock, flags);
+
+ return ret;
+}
+#else
+static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
+{
+ return probe_kernel_write(addr, insn, len);
+}
+#endif /* CONFIG_MMU */
+
+int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len)
+{
+ u32 *tp = addr;
+ int ret;
+
+ ret = riscv_insn_write(tp, insns, len);
+
+ if (!ret)
+ flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len);
+
+ return ret;
+}
+
+static int __kprobes riscv_patch_text_cb(void *data)
+{
+ struct riscv_insn_patch *patch = data;
+ int ret = 0;
+
+ if (atomic_inc_return(&patch->cpu_count) == 1) {
+ ret =
+ riscv_patch_text_nosync(patch->addr, &patch->insn,
+ GET_INSN_LENGTH(patch->insn));
+ atomic_inc(&patch->cpu_count);
+ } else {
+ while (atomic_read(&patch->cpu_count) <= num_online_cpus())
+ cpu_relax();
+ smp_mb();
+ }
+
+ return ret;
+}
+
+int __kprobes riscv_patch_text(void *addr, u32 insn)
+{
+ struct riscv_insn_patch patch = {
+ .addr = addr,
+ .insn = insn,
+ .cpu_count = ATOMIC_INIT(0),
+ };
+
+ return stop_machine_cpuslocked(riscv_patch_text_cb,
+ &patch, cpu_online_mask);
+}
--
2.25.1
Add set_memory_ro/rw/x/nx architecture hooks to change the page
attributes.
Use own set_memory.h rather than generic set_memory.h
(i.e. include/asm-generic/set_memory.h), because we want to add other
function prototypes here.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/set_memory.h | 24 +++++
arch/riscv/mm/Makefile | 2 +-
arch/riscv/mm/pageattr.c | 150 ++++++++++++++++++++++++++++
4 files changed, 176 insertions(+), 1 deletion(-)
create mode 100644 arch/riscv/include/asm/set_memory.h
create mode 100644 arch/riscv/mm/pageattr.c
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index af9b03609c5a..9ab592aa1b5f 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -60,6 +60,7 @@ config RISCV
select HAVE_EBPF_JIT if 64BIT
select EDAC_SUPPORT
select ARCH_HAS_GIGANTIC_PAGE
+ select ARCH_HAS_SET_MEMORY
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select SPARSEMEM_STATIC if 32BIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
new file mode 100644
index 000000000000..79a810f0f38b
--- /dev/null
+++ b/arch/riscv/include/asm/set_memory.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#ifndef _ASM_RISCV_SET_MEMORY_H
+#define _ASM_RISCV_SET_MEMORY_H
+
+/*
+ * Functions to change memory attributes.
+ */
+#ifdef CONFIG_MMU
+int set_memory_ro(unsigned long addr, int numpages);
+int set_memory_rw(unsigned long addr, int numpages);
+int set_memory_x(unsigned long addr, int numpages);
+int set_memory_nx(unsigned long addr, int numpages);
+#else
+static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
+#endif
+
+#endif /* _ASM_RISCV_SET_MEMORY_H */
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index 814e16a8d68a..363ef01c30b1 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -7,7 +7,7 @@ endif
obj-y += init.o
obj-y += extable.o
-obj-$(CONFIG_MMU) += fault.o
+obj-$(CONFIG_MMU) += fault.o pageattr.o
obj-y += cacheflush.o
obj-y += context.o
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
new file mode 100644
index 000000000000..fcd59ef2835b
--- /dev/null
+++ b/arch/riscv/mm/pageattr.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#include <linux/pagewalk.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/bitops.h>
+
+struct pageattr_masks {
+ pgprot_t set_mask;
+ pgprot_t clear_mask;
+};
+
+static unsigned long set_pageattr_masks(unsigned long val, struct mm_walk *walk)
+{
+ struct pageattr_masks *masks = walk->private;
+ unsigned long new_val = val;
+
+ new_val &= ~(pgprot_val(masks->clear_mask));
+ new_val |= (pgprot_val(masks->set_mask));
+
+ return new_val;
+}
+
+static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pgd_t val = READ_ONCE(*pgd);
+
+ if (pgd_leaf(val)) {
+ val = __pgd(set_pageattr_masks(pgd_val(val), walk));
+ set_pgd(pgd, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ p4d_t val = READ_ONCE(*p4d);
+
+ if (p4d_leaf(val)) {
+ val = __p4d(set_pageattr_masks(p4d_val(val), walk));
+ set_p4d(p4d, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t val = READ_ONCE(*pud);
+
+ if (pud_leaf(val)) {
+ val = __pud(set_pageattr_masks(pud_val(val), walk));
+ set_pud(pud, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t val = READ_ONCE(*pmd);
+
+ if (pmd_leaf(val)) {
+ val = __pmd(set_pageattr_masks(pmd_val(val), walk));
+ set_pmd(pmd, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t val = READ_ONCE(*pte);
+
+ val = __pte(set_pageattr_masks(pte_val(val), walk));
+ set_pte(pte, val);
+
+ return 0;
+}
+
+static int pageattr_pte_hole(unsigned long addr, unsigned long next,
+ int depth, struct mm_walk *walk)
+{
+ /* Nothing to do here */
+ return 0;
+}
+
+const static struct mm_walk_ops pageattr_ops = {
+ .pgd_entry = pageattr_pgd_entry,
+ .p4d_entry = pageattr_p4d_entry,
+ .pud_entry = pageattr_pud_entry,
+ .pmd_entry = pageattr_pmd_entry,
+ .pte_entry = pageattr_pte_entry,
+ .pte_hole = pageattr_pte_hole,
+};
+
+static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
+ pgprot_t clear_mask)
+{
+ int ret;
+ unsigned long start = addr;
+ unsigned long end = start + PAGE_SIZE * numpages;
+ struct pageattr_masks masks = {
+ .set_mask = set_mask,
+ .clear_mask = clear_mask
+ };
+
+ if (!numpages)
+ return 0;
+
+ down_read(&init_mm.mmap_sem);
+ ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+ &masks);
+ up_read(&init_mm.mmap_sem);
+
+ flush_tlb_kernel_range(start, end);
+
+ return ret;
+}
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(_PAGE_READ),
+ __pgprot(_PAGE_WRITE));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(_PAGE_READ | _PAGE_WRITE),
+ __pgprot(0));
+}
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(_PAGE_EXEC), __pgprot(0));
+}
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC));
+}
--
2.25.1
Add set_direct_map_*() functions for setting the direct map alias for
the page to its default permissions and to an invalid state that cannot
be cached in a TLB. (See d253ca0c ("x86/mm/cpa: Add set_direct_map_*()
functions")) Add a similar implementation for RISC-V.
Signed-off-by: Zong Li <[email protected]>
---
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/set_memory.h | 3 +++
arch/riscv/mm/pageattr.c | 24 ++++++++++++++++++++++++
3 files changed, 28 insertions(+)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 9ab592aa1b5f..71fabb0ffdbe 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -60,6 +60,7 @@ config RISCV
select HAVE_EBPF_JIT if 64BIT
select EDAC_SUPPORT
select ARCH_HAS_GIGANTIC_PAGE
+ select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select SPARSEMEM_STATIC if 32BIT
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index 79a810f0f38b..620d81c372d9 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -21,4 +21,7 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif
+int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_default_noflush(struct page *page);
+
#endif /* _ASM_RISCV_SET_MEMORY_H */
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index fcd59ef2835b..7be6cd67e2ef 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -148,3 +148,27 @@ int set_memory_nx(unsigned long addr, int numpages)
{
return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC));
}
+
+int set_direct_map_invalid_noflush(struct page *page)
+{
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long end = start + PAGE_SIZE;
+ struct pageattr_masks masks = {
+ .set_mask = __pgprot(0),
+ .clear_mask = __pgprot(_PAGE_PRESENT)
+ };
+
+ return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
+}
+
+int set_direct_map_default_noflush(struct page *page)
+{
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long end = start + PAGE_SIZE;
+ struct pageattr_masks masks = {
+ .set_mask = PAGE_KERNEL,
+ .clear_mask = __pgprot(0)
+ };
+
+ return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
+}
--
2.25.1
Hi Zong,
On Tue, 10 Mar 2020 00:55:35 +0800
Zong Li <[email protected]> wrote:
> The main purpose of this patch series is changing the kernel mapping permission
> , make sure that code is not writeable, data is not executable, and read-only
> data is neither writable nor executable.
>
> This patch series also supports the relevant implementations such as
> ARCH_HAS_SET_MEMORY, ARCH_HAS_SET_DIRECT_MAP,
> ARCH_SUPPORTS_DEBUG_PAGEALLOC and DEBUG_WX.
The order of the patches seems a bit strange. Since the first 7 patches
makes kernel read-only, at that point ftrace is broken and it is fixed
by the last 2 patches. That is not bisect-friendly. Can you move the
last 2 patches to the top?
Thank you,
>
> Changes in v3:
> - Fix build error on nommu configuration. We already support nommu on
> RISC-V, so we should consider nommu case and test not only rv32/64,
> but also nommu.
>
> Changes in v2:
> - Use _data to specify the start of data section with write permission.
> - Change ftrace patch text implementaion.
> - Separate DEBUG_WX patch to another patchset.
>
> Zong Li (9):
> riscv: add ARCH_HAS_SET_MEMORY support
> riscv: add ARCH_HAS_SET_DIRECT_MAP support
> riscv: add ARCH_SUPPORTS_DEBUG_PAGEALLOC support
> riscv: move exception table immediately after RO_DATA
> riscv: add alignment for text, rodata and data sections
> riscv: add STRICT_KERNEL_RWX support
> riscv: add macro to get instruction length
> riscv: introduce interfaces to patch kernel code
> riscv: patch code by fixmap mapping
>
> arch/riscv/Kconfig | 6 +
> arch/riscv/include/asm/bug.h | 8 ++
> arch/riscv/include/asm/fixmap.h | 2 +
> arch/riscv/include/asm/patch.h | 12 ++
> arch/riscv/include/asm/set_memory.h | 48 +++++++
> arch/riscv/kernel/Makefile | 4 +-
> arch/riscv/kernel/ftrace.c | 13 +-
> arch/riscv/kernel/patch.c | 120 ++++++++++++++++++
> arch/riscv/kernel/traps.c | 3 +-
> arch/riscv/kernel/vmlinux.lds.S | 11 +-
> arch/riscv/mm/Makefile | 2 +-
> arch/riscv/mm/init.c | 44 +++++++
> arch/riscv/mm/pageattr.c | 187 ++++++++++++++++++++++++++++
> 13 files changed, 445 insertions(+), 15 deletions(-)
> create mode 100644 arch/riscv/include/asm/patch.h
> create mode 100644 arch/riscv/include/asm/set_memory.h
> create mode 100644 arch/riscv/kernel/patch.c
> create mode 100644 arch/riscv/mm/pageattr.c
>
> --
> 2.25.1
>
--
Masami Hiramatsu <[email protected]>
Hi,
On Tue, 10 Mar 2020 00:55:43 +0800
Zong Li <[email protected]> wrote:
> On strict kernel memory permission, we couldn't patch code without
> writable permission. Preserve two holes in fixmap area, so we can map
> the kernel code temporarily to fixmap area, then patch the instructions.
>
> We need two pages here because we support the compressed instruction, so
> the instruction might be align to 2 bytes. When patching the 32-bit
> length instruction which is 2 bytes alignment, it will across two pages.
>
> Introduce two interfaces to patch kernel code:
> riscv_patch_text_nosync:
> - patch code without synchronization, it's caller's responsibility to
> synchronize all CPUs if needed.
> riscv_patch_text:
> - patch code and always synchronize with stop_machine()
>
> Signed-off-by: Zong Li <[email protected]>
> ---
> arch/riscv/include/asm/fixmap.h | 2 +
> arch/riscv/include/asm/patch.h | 12 ++++
> arch/riscv/kernel/Makefile | 4 +-
> arch/riscv/kernel/patch.c | 120 ++++++++++++++++++++++++++++++++
> 4 files changed, 137 insertions(+), 1 deletion(-)
> create mode 100644 arch/riscv/include/asm/patch.h
> create mode 100644 arch/riscv/kernel/patch.c
>
> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> index 42d2c42f3cc9..2368d49eb4ef 100644
> --- a/arch/riscv/include/asm/fixmap.h
> +++ b/arch/riscv/include/asm/fixmap.h
> @@ -27,6 +27,8 @@ enum fixed_addresses {
> FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
> FIX_PTE,
> FIX_PMD,
> + FIX_TEXT_POKE1,
> + FIX_TEXT_POKE0,
> FIX_EARLYCON_MEM_BASE,
> __end_of_fixed_addresses
> };
> diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h
> new file mode 100644
> index 000000000000..b5918a6e0615
> --- /dev/null
> +++ b/arch/riscv/include/asm/patch.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2020 SiFive
> + */
> +
> +#ifndef _ASM_RISCV_PATCH_H
> +#define _ASM_RISCV_PATCH_H
> +
> +int riscv_patch_text_nosync(void *addr, const void *insns, size_t len);
> +int riscv_patch_text(void *addr, u32 insn);
> +
> +#endif /* _ASM_RISCV_PATCH_H */
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index f40205cb9a22..d189bd3d8501 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -4,7 +4,8 @@
> #
>
> ifdef CONFIG_FTRACE
> -CFLAGS_REMOVE_ftrace.o = -pg
> +CFLAGS_REMOVE_ftrace.o = -pg
> +CFLAGS_REMOVE_patch.o = -pg
> endif
>
> extra-y += head.o
> @@ -26,6 +27,7 @@ obj-y += traps.o
> obj-y += riscv_ksyms.o
> obj-y += stacktrace.o
> obj-y += cacheinfo.o
> +obj-y += patch.o
> obj-$(CONFIG_MMU) += vdso.o vdso/
>
> obj-$(CONFIG_RISCV_M_MODE) += clint.o
> diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c
> new file mode 100644
> index 000000000000..8a4fc65ee022
> --- /dev/null
> +++ b/arch/riscv/kernel/patch.c
> @@ -0,0 +1,120 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2020 SiFive
> + */
> +
> +#include <linux/spinlock.h>
> +#include <linux/mm.h>
> +#include <linux/uaccess.h>
> +#include <linux/stop_machine.h>
> +#include <asm/kprobes.h>
> +#include <asm/cacheflush.h>
> +#include <asm/fixmap.h>
> +
> +struct riscv_insn_patch {
> + void *addr;
> + u32 insn;
> + atomic_t cpu_count;
> +};
> +
> +#ifdef CONFIG_MMU
> +static DEFINE_RAW_SPINLOCK(patch_lock);
> +
> +static void __kprobes *patch_map(void *addr, int fixmap)
Please use NOKPROBE_SYMBOL() instead of __kprobes. __kprobes is old style.
> +{
> + uintptr_t uintaddr = (uintptr_t) addr;
> + struct page *page;
> +
> + if (core_kernel_text(uintaddr))
> + page = phys_to_page(__pa_symbol(addr));
> + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
> + page = vmalloc_to_page(addr);
> + else
> + return addr;
> +
> + BUG_ON(!page);
> +
> + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) +
> + (uintaddr & ~PAGE_MASK));
> +}
> +
> +static void __kprobes patch_unmap(int fixmap)
> +{
> + clear_fixmap(fixmap);
> +}
> +
> +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
Why would you add "riscv_" prefix for those functions? It seems a bit odd.
> +{
> + void *waddr = addr;
> + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> + unsigned long flags = 0;
> + int ret;
> +
> + raw_spin_lock_irqsave(&patch_lock, flags);
This looks a bit odd since stop_machine() is protected by its own mutex,
and also the irq is already disabled here.
Thank you,
> +
> + if (across_pages)
> + patch_map(addr + len, FIX_TEXT_POKE1);
> +
> + waddr = patch_map(addr, FIX_TEXT_POKE0);
> +
> + ret = probe_kernel_write(waddr, insn, len);
> +
> + patch_unmap(FIX_TEXT_POKE0);
> +
> + if (across_pages)
> + patch_unmap(FIX_TEXT_POKE1);
> +
> + raw_spin_unlock_irqrestore(&patch_lock, flags);
> +
> + return ret;
> +}
> +#else
> +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
> +{
> + return probe_kernel_write(addr, insn, len);
> +}
> +#endif /* CONFIG_MMU */
> +
> +int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len)
> +{
> + u32 *tp = addr;
> + int ret;
> +
> + ret = riscv_insn_write(tp, insns, len);
> +
> + if (!ret)
> + flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len);
> +
> + return ret;
> +}
> +
> +static int __kprobes riscv_patch_text_cb(void *data)
> +{
> + struct riscv_insn_patch *patch = data;
> + int ret = 0;
> +
> + if (atomic_inc_return(&patch->cpu_count) == 1) {
> + ret =
> + riscv_patch_text_nosync(patch->addr, &patch->insn,
> + GET_INSN_LENGTH(patch->insn));
> + atomic_inc(&patch->cpu_count);
> + } else {
> + while (atomic_read(&patch->cpu_count) <= num_online_cpus())
> + cpu_relax();
> + smp_mb();
> + }
> +
> + return ret;
> +}
> +
> +int __kprobes riscv_patch_text(void *addr, u32 insn)
> +{
> + struct riscv_insn_patch patch = {
> + .addr = addr,
> + .insn = insn,
> + .cpu_count = ATOMIC_INIT(0),
> + };
> +
> + return stop_machine_cpuslocked(riscv_patch_text_cb,
> + &patch, cpu_online_mask);
> +}
> --
> 2.25.1
>
--
Masami Hiramatsu <[email protected]>
On Tue, Mar 31, 2020 at 9:32 PM Masami Hiramatsu <[email protected]> wrote:
>
> Hi Zong,
>
> On Tue, 10 Mar 2020 00:55:35 +0800
> Zong Li <[email protected]> wrote:
>
> > The main purpose of this patch series is changing the kernel mapping permission
> > , make sure that code is not writeable, data is not executable, and read-only
> > data is neither writable nor executable.
> >
> > This patch series also supports the relevant implementations such as
> > ARCH_HAS_SET_MEMORY, ARCH_HAS_SET_DIRECT_MAP,
> > ARCH_SUPPORTS_DEBUG_PAGEALLOC and DEBUG_WX.
>
> The order of the patches seems a bit strange. Since the first 7 patches
> makes kernel read-only, at that point ftrace is broken and it is fixed
> by the last 2 patches. That is not bisect-friendly. Can you move the
> last 2 patches to the top?
>
OK. Let me rearrange the order. Thanks.
> Thank you,
>
> >
> > Changes in v3:
> > - Fix build error on nommu configuration. We already support nommu on
> > RISC-V, so we should consider nommu case and test not only rv32/64,
> > but also nommu.
> >
> > Changes in v2:
> > - Use _data to specify the start of data section with write permission.
> > - Change ftrace patch text implementation.
> > - Separate DEBUG_WX patch to another patchset.
> >
> > Zong Li (9):
> > riscv: add ARCH_HAS_SET_MEMORY support
> > riscv: add ARCH_HAS_SET_DIRECT_MAP support
> > riscv: add ARCH_SUPPORTS_DEBUG_PAGEALLOC support
> > riscv: move exception table immediately after RO_DATA
> > riscv: add alignment for text, rodata and data sections
> > riscv: add STRICT_KERNEL_RWX support
> > riscv: add macro to get instruction length
> > riscv: introduce interfaces to patch kernel code
> > riscv: patch code by fixmap mapping
> >
> > arch/riscv/Kconfig | 6 +
> > arch/riscv/include/asm/bug.h | 8 ++
> > arch/riscv/include/asm/fixmap.h | 2 +
> > arch/riscv/include/asm/patch.h | 12 ++
> > arch/riscv/include/asm/set_memory.h | 48 +++++++
> > arch/riscv/kernel/Makefile | 4 +-
> > arch/riscv/kernel/ftrace.c | 13 +-
> > arch/riscv/kernel/patch.c | 120 ++++++++++++++++++
> > arch/riscv/kernel/traps.c | 3 +-
> > arch/riscv/kernel/vmlinux.lds.S | 11 +-
> > arch/riscv/mm/Makefile | 2 +-
> > arch/riscv/mm/init.c | 44 +++++++
> > arch/riscv/mm/pageattr.c | 187 ++++++++++++++++++++++++++++
> > 13 files changed, 445 insertions(+), 15 deletions(-)
> > create mode 100644 arch/riscv/include/asm/patch.h
> > create mode 100644 arch/riscv/include/asm/set_memory.h
> > create mode 100644 arch/riscv/kernel/patch.c
> > create mode 100644 arch/riscv/mm/pageattr.c
> >
> > --
> > 2.25.1
> >
>
>
> --
> Masami Hiramatsu <[email protected]>
On Tue, Mar 31, 2020 at 11:32 PM Masami Hiramatsu <[email protected]> wrote:
>
> Hi,
>
> On Tue, 10 Mar 2020 00:55:43 +0800
> Zong Li <[email protected]> wrote:
>
> > On strict kernel memory permission, we couldn't patch code without
> > writable permission. Preserve two holes in fixmap area, so we can map
> > the kernel code temporarily to fixmap area, then patch the instructions.
> >
> > We need two pages here because we support the compressed instruction, so
> > the instruction might be align to 2 bytes. When patching the 32-bit
> > length instruction which is 2 bytes alignment, it will across two pages.
> >
> > Introduce two interfaces to patch kernel code:
> > riscv_patch_text_nosync:
> > - patch code without synchronization, it's caller's responsibility to
> > synchronize all CPUs if needed.
> > riscv_patch_text:
> > - patch code and always synchronize with stop_machine()
> >
> > Signed-off-by: Zong Li <[email protected]>
> > ---
> > arch/riscv/include/asm/fixmap.h | 2 +
> > arch/riscv/include/asm/patch.h | 12 ++++
> > arch/riscv/kernel/Makefile | 4 +-
> > arch/riscv/kernel/patch.c | 120 ++++++++++++++++++++++++++++++++
> > 4 files changed, 137 insertions(+), 1 deletion(-)
> > create mode 100644 arch/riscv/include/asm/patch.h
> > create mode 100644 arch/riscv/kernel/patch.c
> >
> > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> > index 42d2c42f3cc9..2368d49eb4ef 100644
> > --- a/arch/riscv/include/asm/fixmap.h
> > +++ b/arch/riscv/include/asm/fixmap.h
> > @@ -27,6 +27,8 @@ enum fixed_addresses {
> > FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
> > FIX_PTE,
> > FIX_PMD,
> > + FIX_TEXT_POKE1,
> > + FIX_TEXT_POKE0,
> > FIX_EARLYCON_MEM_BASE,
> > __end_of_fixed_addresses
> > };
> > diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h
> > new file mode 100644
> > index 000000000000..b5918a6e0615
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/patch.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (C) 2020 SiFive
> > + */
> > +
> > +#ifndef _ASM_RISCV_PATCH_H
> > +#define _ASM_RISCV_PATCH_H
> > +
> > +int riscv_patch_text_nosync(void *addr, const void *insns, size_t len);
> > +int riscv_patch_text(void *addr, u32 insn);
> > +
> > +#endif /* _ASM_RISCV_PATCH_H */
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index f40205cb9a22..d189bd3d8501 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -4,7 +4,8 @@
> > #
> >
> > ifdef CONFIG_FTRACE
> > -CFLAGS_REMOVE_ftrace.o = -pg
> > +CFLAGS_REMOVE_ftrace.o = -pg
> > +CFLAGS_REMOVE_patch.o = -pg
> > endif
> >
> > extra-y += head.o
> > @@ -26,6 +27,7 @@ obj-y += traps.o
> > obj-y += riscv_ksyms.o
> > obj-y += stacktrace.o
> > obj-y += cacheinfo.o
> > +obj-y += patch.o
> > obj-$(CONFIG_MMU) += vdso.o vdso/
> >
> > obj-$(CONFIG_RISCV_M_MODE) += clint.o
> > diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c
> > new file mode 100644
> > index 000000000000..8a4fc65ee022
> > --- /dev/null
> > +++ b/arch/riscv/kernel/patch.c
> > @@ -0,0 +1,120 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (C) 2020 SiFive
> > + */
> > +
> > +#include <linux/spinlock.h>
> > +#include <linux/mm.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/stop_machine.h>
> > +#include <asm/kprobes.h>
> > +#include <asm/cacheflush.h>
> > +#include <asm/fixmap.h>
> > +
> > +struct riscv_insn_patch {
> > + void *addr;
> > + u32 insn;
> > + atomic_t cpu_count;
> > +};
> > +
> > +#ifdef CONFIG_MMU
> > +static DEFINE_RAW_SPINLOCK(patch_lock);
> > +
> > +static void __kprobes *patch_map(void *addr, int fixmap)
>
> Please use NOKPROBE_SYMBOL() instead of __kprobes. __kprobes is old style.
>
> > +{
> > + uintptr_t uintaddr = (uintptr_t) addr;
> > + struct page *page;
> > +
> > + if (core_kernel_text(uintaddr))
> > + page = phys_to_page(__pa_symbol(addr));
> > + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
> > + page = vmalloc_to_page(addr);
> > + else
> > + return addr;
> > +
> > + BUG_ON(!page);
> > +
> > + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) +
> > + (uintaddr & ~PAGE_MASK));
> > +}
> > +
> > +static void __kprobes patch_unmap(int fixmap)
> > +{
> > + clear_fixmap(fixmap);
> > +}
> > +
> > +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
>
> Why would you add "riscv_" prefix for those functions? It seems a bit odd.
There is no particular reason; I was just used to adding a prefix for
arch-related stuff. I have no preference here — it's fine with me to remove
the prefix from these functions. Do you think we need to remove them?
>
> > +{
> > + void *waddr = addr;
> > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > + unsigned long flags = 0;
> > + int ret;
> > +
> > + raw_spin_lock_irqsave(&patch_lock, flags);
>
> This looks a bit odd since stop_machine() is protected by its own mutex,
> and also the irq is already disabled here.
We need it because we don't always enter the riscv_patch_text_nosync()
through stop_machine mechanism. If we call the
riscv_patch_text_nosync() directly, we need a lock to protect the
page.
>
> Thank you,
>
> > +
> > + if (across_pages)
> > + patch_map(addr + len, FIX_TEXT_POKE1);
> > +
> > + waddr = patch_map(addr, FIX_TEXT_POKE0);
> > +
> > + ret = probe_kernel_write(waddr, insn, len);
> > +
> > + patch_unmap(FIX_TEXT_POKE0);
> > +
> > + if (across_pages)
> > + patch_unmap(FIX_TEXT_POKE1);
> > +
> > + raw_spin_unlock_irqrestore(&patch_lock, flags);
> > +
> > + return ret;
> > +}
> > +#else
> > +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
> > +{
> > + return probe_kernel_write(addr, insn, len);
> > +}
> > +#endif /* CONFIG_MMU */
> > +
> > +int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len)
> > +{
> > + u32 *tp = addr;
> > + int ret;
> > +
> > + ret = riscv_insn_write(tp, insns, len);
> > +
> > + if (!ret)
> > + flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len);
> > +
> > + return ret;
> > +}
> > +
> > +static int __kprobes riscv_patch_text_cb(void *data)
> > +{
> > + struct riscv_insn_patch *patch = data;
> > + int ret = 0;
> > +
> > + if (atomic_inc_return(&patch->cpu_count) == 1) {
> > + ret =
> > + riscv_patch_text_nosync(patch->addr, &patch->insn,
> > + GET_INSN_LENGTH(patch->insn));
> > + atomic_inc(&patch->cpu_count);
> > + } else {
> > + while (atomic_read(&patch->cpu_count) <= num_online_cpus())
> > + cpu_relax();
> > + smp_mb();
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +int __kprobes riscv_patch_text(void *addr, u32 insn)
> > +{
> > + struct riscv_insn_patch patch = {
> > + .addr = addr,
> > + .insn = insn,
> > + .cpu_count = ATOMIC_INIT(0),
> > + };
> > +
> > + return stop_machine_cpuslocked(riscv_patch_text_cb,
> > + &patch, cpu_online_mask);
> > +}
> > --
> > 2.25.1
> >
>
>
> --
> Masami Hiramatsu <[email protected]>
Hi,
On Wed, 1 Apr 2020 15:42:30 +0800
Zong Li <[email protected]> wrote:
> > > +
> > > +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
> >
> > Why would you add "riscv_" prefix for those functions? It seems a bit odd.
>
> There is no particular reason, I just was used to adding a prefix for
> arch-related stuff. I have no preference here, it's OK to me to remove
> the prefix of these functions, do you think we need to remove them?
Yeah, it would be better, unless it could be mixed up with arch-independent
functions.
> > > +{
> > > + void *waddr = addr;
> > > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > > + unsigned long flags = 0;
> > > + int ret;
> > > +
> > > + raw_spin_lock_irqsave(&patch_lock, flags);
> >
> > This looks a bit odd since stop_machine() is protected by its own mutex,
> > and also the irq is already disabled here.
>
> We need it because we don't always enter the riscv_patch_text_nosync()
> through stop_machine mechanism. If we call the
> riscv_patch_text_nosync() directly, we need a lock to protect the
> page.
Oh, OK, but it leads another question. Is that safe to patch the
text without sync? Would you use it for UP system?
I think it is better to clarify "in what case user can call _nosync()"
and add a comment on it.
Thank you,
>
> >
> > Thank you,
> >
> > > +
> > > + if (across_pages)
> > > + patch_map(addr + len, FIX_TEXT_POKE1);
> > > +
> > > + waddr = patch_map(addr, FIX_TEXT_POKE0);
> > > +
> > > + ret = probe_kernel_write(waddr, insn, len);
> > > +
> > > + patch_unmap(FIX_TEXT_POKE0);
> > > +
> > > + if (across_pages)
> > > + patch_unmap(FIX_TEXT_POKE1);
> > > +
> > > + raw_spin_unlock_irqrestore(&patch_lock, flags);
> > > +
> > > + return ret;
> > > +}
> > > +#else
> > > +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
> > > +{
> > > + return probe_kernel_write(addr, insn, len);
> > > +}
> > > +#endif /* CONFIG_MMU */
> > > +
> > > +int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len)
> > > +{
> > > + u32 *tp = addr;
> > > + int ret;
> > > +
> > > + ret = riscv_insn_write(tp, insns, len);
> > > +
> > > + if (!ret)
> > > + flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len);
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static int __kprobes riscv_patch_text_cb(void *data)
> > > +{
> > > + struct riscv_insn_patch *patch = data;
> > > + int ret = 0;
> > > +
> > > + if (atomic_inc_return(&patch->cpu_count) == 1) {
> > > + ret =
> > > + riscv_patch_text_nosync(patch->addr, &patch->insn,
> > > + GET_INSN_LENGTH(patch->insn));
> > > + atomic_inc(&patch->cpu_count);
> > > + } else {
> > > + while (atomic_read(&patch->cpu_count) <= num_online_cpus())
> > > + cpu_relax();
> > > + smp_mb();
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +int __kprobes riscv_patch_text(void *addr, u32 insn)
> > > +{
> > > + struct riscv_insn_patch patch = {
> > > + .addr = addr,
> > > + .insn = insn,
> > > + .cpu_count = ATOMIC_INIT(0),
> > > + };
> > > +
> > > + return stop_machine_cpuslocked(riscv_patch_text_cb,
> > > + &patch, cpu_online_mask);
> > > +}
> > > --
> > > 2.25.1
> > >
> >
> >
> > --
> > Masami Hiramatsu <[email protected]>
--
Masami Hiramatsu <[email protected]>
On Thu, Apr 2, 2020 at 9:17 AM Masami Hiramatsu <[email protected]> wrote:
>
> Hi,
>
> On Wed, 1 Apr 2020 15:42:30 +0800
> Zong Li <[email protected]> wrote:
>
> > > > +
> > > > +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
> > >
> > > Why would you add "riscv_" prefix for those functions? It seems a bit odd.
> >
> > There is no particular reason, I just was used to adding a prefix for
> > arch-related stuff. I have no preference here, it's OK to me to remove
> > the prefix of these functions, do you think we need to remove them?
>
> Yeah, it would be better, unless it could be mixed up with arch-independent
> functions.
OK. I'll remove it and use NOKPROBE_SYMBOL() instead of __kprobes annotation.
>
> > > > +{
> > > > + void *waddr = addr;
> > > > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > > > + unsigned long flags = 0;
> > > > + int ret;
> > > > +
> > > > + raw_spin_lock_irqsave(&patch_lock, flags);
> > >
> > > This looks a bit odd since stop_machine() is protected by its own mutex,
> > > and also the irq is already disabled here.
> >
> > We need it because we don't always enter the riscv_patch_text_nosync()
> > through stop_machine mechanism. If we call the
> > riscv_patch_text_nosync() directly, we need a lock to protect the
> > page.
>
> Oh, OK, but it leads another question. Is that safe to patch the
> text without sync? Would you use it for UP system?
> I think it is better to clarify "in what case user can call _nosync()"
> and add a comment on it.
Ftrace is one such case. As the ftrace documentation says, when
dynamic ftrace is initialized, it calls kstop_machine to make the
machine act like a uniprocessor so that it can freely modify code
without worrying about other processors executing that same code. So
ftrace calls the _nosync interface here directly.
>
> Thank you,
>
> >
> > >
> > > Thank you,
> > >
> > > > +
> > > > + if (across_pages)
> > > > + patch_map(addr + len, FIX_TEXT_POKE1);
> > > > +
> > > > + waddr = patch_map(addr, FIX_TEXT_POKE0);
> > > > +
> > > > + ret = probe_kernel_write(waddr, insn, len);
> > > > +
> > > > + patch_unmap(FIX_TEXT_POKE0);
> > > > +
> > > > + if (across_pages)
> > > > + patch_unmap(FIX_TEXT_POKE1);
> > > > +
> > > > + raw_spin_unlock_irqrestore(&patch_lock, flags);
> > > > +
> > > > + return ret;
> > > > +}
> > > > +#else
> > > > +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
> > > > +{
> > > > + return probe_kernel_write(addr, insn, len);
> > > > +}
> > > > +#endif /* CONFIG_MMU */
> > > > +
> > > > +int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len)
> > > > +{
> > > > + u32 *tp = addr;
> > > > + int ret;
> > > > +
> > > > + ret = riscv_insn_write(tp, insns, len);
> > > > +
> > > > + if (!ret)
> > > > + flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len);
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +static int __kprobes riscv_patch_text_cb(void *data)
> > > > +{
> > > > + struct riscv_insn_patch *patch = data;
> > > > + int ret = 0;
> > > > +
> > > > + if (atomic_inc_return(&patch->cpu_count) == 1) {
> > > > + ret =
> > > > + riscv_patch_text_nosync(patch->addr, &patch->insn,
> > > > + GET_INSN_LENGTH(patch->insn));
> > > > + atomic_inc(&patch->cpu_count);
> > > > + } else {
> > > > + while (atomic_read(&patch->cpu_count) <= num_online_cpus())
> > > > + cpu_relax();
> > > > + smp_mb();
> > > > + }
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +int __kprobes riscv_patch_text(void *addr, u32 insn)
> > > > +{
> > > > + struct riscv_insn_patch patch = {
> > > > + .addr = addr,
> > > > + .insn = insn,
> > > > + .cpu_count = ATOMIC_INIT(0),
> > > > + };
> > > > +
> > > > + return stop_machine_cpuslocked(riscv_patch_text_cb,
> > > > + &patch, cpu_online_mask);
> > > > +}
> > > > --
> > > > 2.25.1
> > > >
> > >
> > >
> > > --
> > > Masami Hiramatsu <[email protected]>
>
>
> --
> Masami Hiramatsu <[email protected]>
Hi Zong,
On Fri, 3 Apr 2020 17:04:51 +0800
Zong Li <[email protected]> wrote:
> > > > > +{
> > > > > + void *waddr = addr;
> > > > > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > > > > + unsigned long flags = 0;
> > > > > + int ret;
> > > > > +
> > > > > + raw_spin_lock_irqsave(&patch_lock, flags);
> > > >
> > > > This looks a bit odd since stop_machine() is protected by its own mutex,
> > > > and also the irq is already disabled here.
> > >
> > > We need it because we don't always enter the riscv_patch_text_nosync()
> > > through stop_machine mechanism. If we call the
> > > riscv_patch_text_nosync() directly, we need a lock to protect the
> > > page.
> >
> > Oh, OK, but it leads another question. Is that safe to patch the
> > text without sync? Would you use it for UP system?
> > I think it is better to clarify "in what case user can call _nosync()"
> > and add a comment on it.
>
> The ftrace is one of the cases, as documentation of ftrace said, when
> dynamic ftrace is initialized, it calls kstop_machine to make the
> machine act like a uniprocessor so that it can freely modify code
> without worrying about other processors executing that same code. So
> the ftrace called the _nosync interface here directly.
Hmm, even though, since it already running under kstop_machine(), no
other thread will run.
Could you consider to use text_mutex instead of that? The text_mutex
is already widely used in x86 and kernel/kprobes.c etc.
(Hmm, it seems except for x86, alternative code don't care about
racing...)
Thank you,
--
Masami Hiramatsu <[email protected]>
On Sat, Apr 4, 2020 at 11:14 AM Masami Hiramatsu <[email protected]> wrote:
>
> Hi Zong,
>
> On Fri, 3 Apr 2020 17:04:51 +0800
> Zong Li <[email protected]> wrote:
>
> > > > > > +{
> > > > > > + void *waddr = addr;
> > > > > > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > > > > > + unsigned long flags = 0;
> > > > > > + int ret;
> > > > > > +
> > > > > > + raw_spin_lock_irqsave(&patch_lock, flags);
> > > > >
> > > > > This looks a bit odd since stop_machine() is protected by its own mutex,
> > > > > and also the irq is already disabled here.
> > > >
> > > > We need it because we don't always enter the riscv_patch_text_nosync()
> > > > through stop_machine mechanism. If we call the
> > > > riscv_patch_text_nosync() directly, we need a lock to protect the
> > > > page.
> > >
> > > Oh, OK, but it leads another question. Is that safe to patch the
> > > text without sync? Would you use it for UP system?
> > > I think it is better to clarify "in what case user can call _nosync()"
> > > and add a comment on it.
> >
> > The ftrace is one of the cases, as documentation of ftrace said, when
> > dynamic ftrace is initialized, it calls kstop_machine to make the
> > machine act like a uniprocessor so that it can freely modify code
> > without worrying about other processors executing that same code. So
> > the ftrace called the _nosync interface here directly.
>
> Hmm, even though, since it already running under kstop_machine(), no
> other thread will run.
> Could you consider to use text_mutex instead of that? The text_mutex
> is already widely used in x86 and kernel/kprobes.c etc.
>
> (Hmm, it seems except for x86, alternative code don't care about
> racing...)
>
Yes, text_mutex seems to be great. I'll change to use text_mutex in
the next version if it works fine after testing. Thanks.
> Thank you,
> --
> Masami Hiramatsu <[email protected]>
On Sat, Apr 4, 2020 at 8:12 PM Zong Li <[email protected]> wrote:
>
> On Sat, Apr 4, 2020 at 11:14 AM Masami Hiramatsu <[email protected]> wrote:
> >
> > Hi Zong,
> >
> > On Fri, 3 Apr 2020 17:04:51 +0800
> > Zong Li <[email protected]> wrote:
> >
> > > > > > > +{
> > > > > > > + void *waddr = addr;
> > > > > > > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > > > > > > + unsigned long flags = 0;
> > > > > > > + int ret;
> > > > > > > +
> > > > > > > + raw_spin_lock_irqsave(&patch_lock, flags);
> > > > > >
> > > > > > This looks a bit odd since stop_machine() is protected by its own mutex,
> > > > > > and also the irq is already disabled here.
> > > > >
> > > > > We need it because we don't always enter the riscv_patch_text_nosync()
> > > > > through stop_machine mechanism. If we call the
> > > > > riscv_patch_text_nosync() directly, we need a lock to protect the
> > > > > page.
> > > >
> > > > Oh, OK, but it leads another question. Is that safe to patch the
> > > > text without sync? Would you use it for UP system?
> > > > I think it is better to clarify "in what case user can call _nosync()"
> > > > and add a comment on it.
> > >
> > > The ftrace is one of the cases, as documentation of ftrace said, when
> > > dynamic ftrace is initialized, it calls kstop_machine to make the
> > > machine act like a uniprocessor so that it can freely modify code
> > > without worrying about other processors executing that same code. So
> > > the ftrace called the _nosync interface here directly.
> >
> > Hmm, even though, since it already running under kstop_machine(), no
> > other thread will run.
> > Could you consider to use text_mutex instead of that? The text_mutex
> > is already widely used in x86 and kernel/kprobes.c etc.
> >
> > (Hmm, it seems except for x86, alternative code don't care about
> > racing...)
> >
The mutex_lock doesn't seem to work in ftrace context; I think that
might be the reason why other architectures didn't use text_mutex in
some places.
# echo function > current_tracer
[ 28.198070] BUG: sleeping function called from invalid context at
kernel/locking/mutex.c:281
[ 28.198663] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid:
11, name: migration/0
[ 28.199491] CPU: 0 PID: 11 Comm: migration/0 Not tainted
5.6.0-00012-gd6f56a7a4be2-dirty #10
[ 28.200330] Call Trace:
[ 28.200798] [<ffffffe00060319a>] walk_stackframe+0x0/0xcc
[ 28.201395] [<ffffffe000603442>] show_stack+0x3c/0x46
[ 28.200798] [<ffffffe00060319a>] walk_stackframe+0x0/0xcc
[ 28.201395] [<ffffffe000603442>] show_stack+0x3c/0x46
[ 28.201898] [<ffffffe000d498b0>] dump_stack+0x76/0x90
[ 28.202329] [<ffffffe00062c3f0>] ___might_sleep+0x100/0x10e
[ 28.202720] [<ffffffe00062c448>] __might_sleep+0x4a/0x78
[ 28.203033] [<ffffffe000d61622>] mutex_lock+0x2c/0x54
[ 28.203397] [<ffffffe00060393e>] patch_insn_write+0x32/0xd8
[ 28.203780] [<ffffffe000603a94>] patch_text_nosync+0x10/0x32
[ 28.204139] [<ffffffe0006051b0>] __ftrace_modify_call+0x5c/0x6c
[ 28.204497] [<ffffffe0006052c6>] ftrace_update_ftrace_func+0x20/0x4a
[ 28.204919] [<ffffffe000697742>] ftrace_modify_all_code+0xa0/0x148
[ 28.205378] [<ffffffe0006977fc>] __ftrace_modify_code+0x12/0x1c
[ 28.205793] [<ffffffe0006924b6>] multi_cpu_stop+0xa2/0x158
[ 28.206147] [<ffffffe0006921b0>] cpu_stopper_thread+0xa4/0x13a
[ 28.206510] [<ffffffe000629f38>] smpboot_thread_fn+0xf8/0x1da
[ 28.206868] [<ffffffe000625f36>] kthread+0xfa/0x12a
[ 28.207201] [<ffffffe0006017e2>] ret_from_exception+0x0/0xc
>
> Yes, text_mutex seems to be great. I'll change to use text_mutex in
> the next version if it works fine after testing. Thanks.
>
> > Thank you,
> > --
> > Masami Hiramatsu <[email protected]>
On Mon, 6 Apr 2020 18:36:42 +0800
Zong Li <[email protected]> wrote:
> On Sat, Apr 4, 2020 at 8:12 PM Zong Li <[email protected]> wrote:
> >
> > On Sat, Apr 4, 2020 at 11:14 AM Masami Hiramatsu <[email protected]> wrote:
> > >
> > > Hi Zong,
> > >
> > > On Fri, 3 Apr 2020 17:04:51 +0800
> > > Zong Li <[email protected]> wrote:
> > >
> > > > > > > > +{
> > > > > > > > + void *waddr = addr;
> > > > > > > > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > > > > > > > + unsigned long flags = 0;
> > > > > > > > + int ret;
> > > > > > > > +
> > > > > > > > + raw_spin_lock_irqsave(&patch_lock, flags);
> > > > > > >
> > > > > > > This looks a bit odd since stop_machine() is protected by its own mutex,
> > > > > > > and also the irq is already disabled here.
> > > > > >
> > > > > > We need it because we don't always enter the riscv_patch_text_nosync()
> > > > > > through stop_machine mechanism. If we call the
> > > > > > riscv_patch_text_nosync() directly, we need a lock to protect the
> > > > > > page.
> > > > >
> > > > > Oh, OK, but it leads another question. Is that safe to patch the
> > > > > text without sync? Would you use it for UP system?
> > > > > I think it is better to clarify "in what case user can call _nosync()"
> > > > > and add a comment on it.
> > > >
> > > > The ftrace is one of the cases, as documentation of ftrace said, when
> > > > dynamic ftrace is initialized, it calls kstop_machine to make the
> > > > machine act like a uniprocessor so that it can freely modify code
> > > > without worrying about other processors executing that same code. So
> > > > the ftrace called the _nosync interface here directly.
> > >
> > > Hmm, even though, since it already running under kstop_machine(), no
> > > other thread will run.
> > > Could you consider to use text_mutex instead of that? The text_mutex
> > > is already widely used in x86 and kernel/kprobes.c etc.
> > >
> > > (Hmm, it seems except for x86, alternative code don't care about
> > > racing...)
> > >
>
> The mutex_lock doesn't seem to work in ftrace context; I think that
> might be the reason why other architectures didn't use text_mutex in
> some places.
Yes, you need to implement ftrace_arch_code_modify_prepare() and
ftrace_arch_code_modify_post_process() in arch/riscv/kernel/ftrace.c.
Please see arch/x86/kernel/ftrace.c.
Thank you,
>
> # echo function > current_tracer
> [ 28.198070] BUG: sleeping function called from invalid context at
> kernel/locking/mutex.c:281
> [ 28.198663] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid:
> 11, name: migration/0
> [ 28.199491] CPU: 0 PID: 11 Comm: migration/0 Not tainted
> 5.6.0-00012-gd6f56a7a4be2-dirty #10
> [ 28.200330] Call Trace:
> [ 28.200798] [<ffffffe00060319a>] walk_stackframe+0x0/0xcc
> [ 28.201395] [<ffffffe000603442>] show_stack+0x3c/0x46
> [ 28.200798] [<ffffffe00060319a>] walk_stackframe+0x0/0xcc
> [ 28.201395] [<ffffffe000603442>] show_stack+0x3c/0x46
> [ 28.201898] [<ffffffe000d498b0>] dump_stack+0x76/0x90
> [ 28.202329] [<ffffffe00062c3f0>] ___might_sleep+0x100/0x10e
> [ 28.202720] [<ffffffe00062c448>] __might_sleep+0x4a/0x78
> [ 28.203033] [<ffffffe000d61622>] mutex_lock+0x2c/0x54
> [ 28.203397] [<ffffffe00060393e>] patch_insn_write+0x32/0xd8
> [ 28.203780] [<ffffffe000603a94>] patch_text_nosync+0x10/0x32
> [ 28.204139] [<ffffffe0006051b0>] __ftrace_modify_call+0x5c/0x6c
> [ 28.204497] [<ffffffe0006052c6>] ftrace_update_ftrace_func+0x20/0x4a
> [ 28.204919] [<ffffffe000697742>] ftrace_modify_all_code+0xa0/0x148
> [ 28.205378] [<ffffffe0006977fc>] __ftrace_modify_code+0x12/0x1c
> [ 28.205793] [<ffffffe0006924b6>] multi_cpu_stop+0xa2/0x158
> [ 28.206147] [<ffffffe0006921b0>] cpu_stopper_thread+0xa4/0x13a
> [ 28.206510] [<ffffffe000629f38>] smpboot_thread_fn+0xf8/0x1da
> [ 28.206868] [<ffffffe000625f36>] kthread+0xfa/0x12a
> [ 28.207201] [<ffffffe0006017e2>] ret_from_exception+0x0/0xc
>
> >
> > Yes, text_mutex seems to be great. I'll change to use text_mutex in
> > the next version if it works fine after testing. Thanks.
> >
> > > Thank you,
> > > --
> > > Masami Hiramatsu <[email protected]>
--
Masami Hiramatsu <[email protected]>
On Tue, Apr 7, 2020 at 8:29 PM Masami Hiramatsu <[email protected]> wrote:
>
> On Mon, 6 Apr 2020 18:36:42 +0800
> Zong Li <[email protected]> wrote:
>
> > On Sat, Apr 4, 2020 at 8:12 PM Zong Li <[email protected]> wrote:
> > >
> > > On Sat, Apr 4, 2020 at 11:14 AM Masami Hiramatsu <[email protected]> wrote:
> > > >
> > > > Hi Zong,
> > > >
> > > > On Fri, 3 Apr 2020 17:04:51 +0800
> > > > Zong Li <[email protected]> wrote:
> > > >
> > > > > > > > > +{
> > > > > > > > > + void *waddr = addr;
> > > > > > > > > + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
> > > > > > > > > + unsigned long flags = 0;
> > > > > > > > > + int ret;
> > > > > > > > > +
> > > > > > > > > + raw_spin_lock_irqsave(&patch_lock, flags);
> > > > > > > >
> > > > > > > > This looks a bit odd since stop_machine() is protected by its own mutex,
> > > > > > > > and also the irq is already disabled here.
> > > > > > >
> > > > > > > We need it because we don't always enter the riscv_patch_text_nosync()
> > > > > > > through stop_machine mechanism. If we call the
> > > > > > > riscv_patch_text_nosync() directly, we need a lock to protect the
> > > > > > > page.
> > > > > >
> > > > > > Oh, OK, but it leads another question. Is that safe to patch the
> > > > > > text without sync? Would you use it for UP system?
> > > > > > I think it is better to clarify "in what case user can call _nosync()"
> > > > > > and add a comment on it.
> > > > >
> > > > > ftrace is one of the cases. As the ftrace documentation says, when
> > > > > dynamic ftrace is initialized, it calls kstop_machine to make the
> > > > > machine act like a uniprocessor so that it can freely modify code
> > > > > without worrying about other processors executing that same code. So
> > > > > ftrace calls the _nosync interface here directly.
> > > >
> > > > Hmm, even though, since it already running under kstop_machine(), no
> > > > other thread will run.
> > > > Could you consider to use text_mutex instead of that? The text_mutex
> > > > is already widely used in x86 and kernel/kprobes.c etc.
> > > >
> > > > (Hmm, it seems except for x86, alternative code don't care about
> > > > racing...)
> > > >
> >
> > mutex_lock doesn't seem to work in the ftrace context; I think that
> > might be the reason why other architectures didn't use text_mutex in
> > some places.
>
> Yes, you need to implement ftrace_arch_code_modify_prepare() and
> ftrace_arch_code_modify_post_process() in arch/riscv/kernel/ftrace.c.
> Please see arch/x86/kernel/ftrace.c.
>
Oh OK, I misunderstood it before; I'll just use text_mutex instead of
patch_lock in patch.c. Thanks.
> Thank you,
>
> >
> > # echo function > current_tracer
> > [ 28.198070] BUG: sleeping function called from invalid context at
> > kernel/locking/mutex.c:281
> > [ 28.198663] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid:
> > 11, name: migration/0
> > [ 28.199491] CPU: 0 PID: 11 Comm: migration/0 Not tainted
> > 5.6.0-00012-gd6f56a7a4be2-dirty #10
> > [ 28.200330] Call Trace:
> > [ 28.200798] [<ffffffe00060319a>] walk_stackframe+0x0/0xcc
> > [ 28.201395] [<ffffffe000603442>] show_stack+0x3c/0x46
> > [ 28.201898] [<ffffffe000d498b0>] dump_stack+0x76/0x90
> > [ 28.202329] [<ffffffe00062c3f0>] ___might_sleep+0x100/0x10e
> > [ 28.202720] [<ffffffe00062c448>] __might_sleep+0x4a/0x78
> > [ 28.203033] [<ffffffe000d61622>] mutex_lock+0x2c/0x54
> > [ 28.203397] [<ffffffe00060393e>] patch_insn_write+0x32/0xd8
> > [ 28.203780] [<ffffffe000603a94>] patch_text_nosync+0x10/0x32
> > [ 28.204139] [<ffffffe0006051b0>] __ftrace_modify_call+0x5c/0x6c
> > [ 28.204497] [<ffffffe0006052c6>] ftrace_update_ftrace_func+0x20/0x4a
> > [ 28.204919] [<ffffffe000697742>] ftrace_modify_all_code+0xa0/0x148
> > [ 28.205378] [<ffffffe0006977fc>] __ftrace_modify_code+0x12/0x1c
> > [ 28.205793] [<ffffffe0006924b6>] multi_cpu_stop+0xa2/0x158
> > [ 28.206147] [<ffffffe0006921b0>] cpu_stopper_thread+0xa4/0x13a
> > [ 28.206510] [<ffffffe000629f38>] smpboot_thread_fn+0xf8/0x1da
> > [ 28.206868] [<ffffffe000625f36>] kthread+0xfa/0x12a
> > [ 28.207201] [<ffffffe0006017e2>] ret_from_exception+0x0/0xc
> >
> > >
> > > Yes, text_mutex seems to be great. I'll change to use text_mutex in
> > > the next version if it works fine after testing. Thanks.
> > >
> > > > Thank you,
> > > > --
> > > > Masami Hiramatsu <[email protected]>
>
>
> --
> Masami Hiramatsu <[email protected]>