The previous version of CET patches can be found in the following
link:
https://lkml.org/lkml/2018/8/30/608
Summary of changes from v3:
Fixed an issue in page fault handling in do_swap_page().
Fixed issues in ptep_set_wrprotect/pmdp_set_wrprotect.
Changed WRUSS fault handling to asm_goto.
Added shadow stack to memory accounting.
Added a patch to create a guard gap between VMAs.
Added a patch to prevent merging of shadow stack areas.
Other small fixes in ELF header parsing and typos.
Yu-cheng Yu (27):
x86/cpufeatures: Add CPUIDs for Control-flow Enforcement Technology
(CET)
x86/fpu/xstate: Change some names to separate XSAVES system and user
states
x86/fpu/xstate: Enable XSAVES system states
x86/fpu/xstate: Add XSAVES system states for shadow stack
Documentation/x86: Add CET description
x86/cet: Control protection exception handler
x86/cet/shstk: Add Kconfig option for user-mode shadow stack
mm: Introduce VM_SHSTK for shadow stack memory
x86/mm: Change _PAGE_DIRTY to _PAGE_DIRTY_HW
drm/i915/gvt: Update _PAGE_DIRTY to _PAGE_DIRTY_BITS
x86/mm: Introduce _PAGE_DIRTY_SW
x86/mm: Modify ptep_set_wrprotect and pmdp_set_wrprotect for
_PAGE_DIRTY_SW
x86/mm: Shadow stack page fault error checking
mm: Handle shadow stack page fault
mm: Handle THP/HugeTLB shadow stack page fault
mm: Update can_follow_write_pte/pmd for shadow stack
mm: Introduce do_mmap_locked()
x86/cet/shstk: User-mode shadow stack support
x86/cet/shstk: Introduce WRUSS instruction
x86/cet/shstk: Signal handling for shadow stack
x86/cet/shstk: ELF header parsing of Shadow Stack
x86/cet/shstk: Handle thread shadow stack
mm/map: Add Shadow stack pages to memory accounting
mm/mmap: Create a guard area between VMAs
mm/mmap: Prevent Shadow Stack VMA merges
x86/cet/shstk: Add arch_prctl functions for Shadow Stack
x86/cet/shstk: Add Shadow Stack instructions to opcode map
.../admin-guide/kernel-parameters.txt | 6 +
Documentation/index.rst | 1 +
Documentation/x86/index.rst | 11 +
Documentation/x86/intel_cet.rst | 259 +++++++++++++
arch/x86/Kconfig | 28 ++
arch/x86/Makefile | 7 +
arch/x86/entry/entry_64.S | 2 +-
arch/x86/ia32/ia32_signal.c | 13 +
arch/x86/include/asm/cet.h | 42 +++
arch/x86/include/asm/cpufeatures.h | 2 +
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/elf.h | 5 +
arch/x86/include/asm/fpu/internal.h | 6 +-
arch/x86/include/asm/fpu/types.h | 22 ++
arch/x86/include/asm/fpu/xstate.h | 31 +-
arch/x86/include/asm/mmu_context.h | 3 +
arch/x86/include/asm/msr-index.h | 14 +
arch/x86/include/asm/pgtable.h | 191 ++++++++--
arch/x86/include/asm/pgtable_types.h | 31 +-
arch/x86/include/asm/processor.h | 5 +
arch/x86/include/asm/sighandling.h | 5 +
arch/x86/include/asm/special_insns.h | 32 ++
arch/x86/include/asm/traps.h | 5 +
arch/x86/include/uapi/asm/elf_property.h | 15 +
arch/x86/include/uapi/asm/prctl.h | 5 +
arch/x86/include/uapi/asm/processor-flags.h | 2 +
arch/x86/include/uapi/asm/sigcontext.h | 17 +
arch/x86/kernel/Makefile | 4 +
arch/x86/kernel/cet.c | 285 +++++++++++++++
arch/x86/kernel/cet_prctl.c | 79 ++++
arch/x86/kernel/cpu/common.c | 24 ++
arch/x86/kernel/cpu/scattered.c | 1 +
arch/x86/kernel/elf.c | 340 ++++++++++++++++++
arch/x86/kernel/fpu/core.c | 11 +-
arch/x86/kernel/fpu/init.c | 10 -
arch/x86/kernel/fpu/signal.c | 6 +-
arch/x86/kernel/fpu/xstate.c | 152 +++++---
arch/x86/kernel/idt.c | 4 +
arch/x86/kernel/process.c | 8 +
arch/x86/kernel/process_64.c | 7 +
arch/x86/kernel/relocate_kernel_64.S | 2 +-
arch/x86/kernel/signal.c | 96 +++++
arch/x86/kernel/traps.c | 58 +++
arch/x86/kvm/vmx.c | 2 +-
arch/x86/lib/x86-opcode-map.txt | 26 +-
arch/x86/mm/fault.c | 27 ++
arch/x86/mm/pgtable.c | 42 +++
drivers/gpu/drm/i915/gvt/gtt.c | 2 +-
fs/binfmt_elf.c | 15 +
fs/proc/task_mmu.c | 3 +
include/asm-generic/pgtable.h | 14 +
include/linux/mm.h | 56 ++-
include/uapi/linux/elf.h | 1 +
mm/gup.c | 8 +-
mm/huge_memory.c | 12 +-
mm/memory.c | 7 +-
mm/mmap.c | 11 +
tools/objtool/arch/x86/lib/x86-opcode-map.txt | 26 +-
58 files changed, 1946 insertions(+), 161 deletions(-)
create mode 100644 Documentation/x86/index.rst
create mode 100644 Documentation/x86/intel_cet.rst
create mode 100644 arch/x86/include/asm/cet.h
create mode 100644 arch/x86/include/uapi/asm/elf_property.h
create mode 100644 arch/x86/kernel/cet.c
create mode 100644 arch/x86/kernel/cet_prctl.c
create mode 100644 arch/x86/kernel/elf.c
--
2.17.1
Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
to be enabled for the task.
Signed-off-by: H.J. Lu <[email protected]>
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/Kconfig | 4 +
arch/x86/include/asm/elf.h | 5 +
arch/x86/include/uapi/asm/elf_property.h | 15 +
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/elf.c | 340 +++++++++++++++++++++++
fs/binfmt_elf.c | 15 +
include/uapi/linux/elf.h | 1 +
7 files changed, 382 insertions(+)
create mode 100644 arch/x86/include/uapi/asm/elf_property.h
create mode 100644 arch/x86/kernel/elf.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 808aa3aecf3c..6377125543cc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1919,12 +1919,16 @@ config X86_INTEL_CET
config ARCH_HAS_SHSTK
def_bool n
+config ARCH_HAS_PROGRAM_PROPERTIES
+ def_bool n
+
config X86_INTEL_SHADOW_STACK_USER
prompt "Intel Shadow Stack for user-mode"
def_bool n
depends on CPU_SUP_INTEL && X86_64
select X86_INTEL_CET
select ARCH_HAS_SHSTK
+ select ARCH_HAS_PROGRAM_PROPERTIES
---help---
Shadow stack provides hardware protection against program stack
corruption. Only when all the following are true will an application
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 0d157d2a1e2a..5b5f169c5c07 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -382,4 +382,9 @@ struct va_alignment {
extern struct va_alignment va_align;
extern unsigned long align_vdso_addr(unsigned long);
+
+#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
+extern int arch_setup_features(void *ehdr, void *phdr, struct file *file,
+ bool interp);
+#endif
#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/uapi/asm/elf_property.h b/arch/x86/include/uapi/asm/elf_property.h
new file mode 100644
index 000000000000..af361207718c
--- /dev/null
+++ b/arch/x86/include/uapi/asm/elf_property.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_ASM_X86_ELF_PROPERTY_H
+#define _UAPI_ASM_X86_ELF_PROPERTY_H
+
+/*
+ * pr_type
+ */
+#define GNU_PROPERTY_X86_FEATURE_1_AND (0xc0000002)
+
+/*
+ * Bits for GNU_PROPERTY_X86_FEATURE_1_AND
+ */
+#define GNU_PROPERTY_X86_FEATURE_1_SHSTK (0x00000002)
+
+#endif /* _UAPI_ASM_X86_ELF_PROPERTY_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index fbb2d91fb756..36b14ef410c8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -141,6 +141,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
obj-$(CONFIG_X86_INTEL_CET) += cet.o
+obj-$(CONFIG_ARCH_HAS_PROGRAM_PROPERTIES) += elf.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/elf.c b/arch/x86/kernel/elf.c
new file mode 100644
index 000000000000..2fddd0bc545b
--- /dev/null
+++ b/arch/x86/kernel/elf.c
@@ -0,0 +1,340 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Look at an ELF file's .note.gnu.property and determine if the file
+ * supports shadow stack and/or indirect branch tracking.
+ * The path from the ELF header to the note section is the following:
+ * elfhdr->elf_phdr->elf_note->property[].
+ */
+
+#include <asm/cet.h>
+#include <asm/elf_property.h>
+#include <asm/prctl.h>
+#include <asm/processor.h>
+#include <uapi/linux/elf-em.h>
+#include <uapi/linux/prctl.h>
+#include <linux/binfmts.h>
+#include <linux/elf.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/compat.h>
+
+/*
+ * The .note.gnu.property layout:
+ *
+ * struct elf_note {
+ * u32 n_namesz; --> sizeof(n_name[]); always (4)
+ * u32 n_descsz; --> sizeof(property[])
+ * u32 n_type; --> always NT_GNU_PROPERTY_TYPE_0
+ * };
+ * char n_name[4]; --> always 'GNU\0'
+ *
+ * struct {
+ * struct property_x86 {
+ * u32 pr_type;
+ * u32 pr_datasz;
+ * };
+ * u8 pr_data[pr_datasz];
+ * }[];
+ */
+
+#define BUF_SIZE (PAGE_SIZE / 4)
+
+struct property_x86 {
+ u32 pr_type;
+ u32 pr_datasz;
+};
+
+typedef bool (test_fn)(void *buf, u32 *arg);
+typedef void *(next_fn)(void *buf, u32 *arg);
+
+static inline bool test_note_type_0(void *buf, u32 *arg)
+{
+ struct elf_note *n = buf;
+
+ return ((n->n_namesz == 4) && (memcmp(n + 1, "GNU", 4) == 0) &&
+ (n->n_type == NT_GNU_PROPERTY_TYPE_0));
+}
+
+static inline void *next_note(void *buf, u32 *arg)
+{
+ struct elf_note *n = buf;
+ u32 align = *arg;
+ int size;
+
+ size = round_up(sizeof(*n) + n->n_namesz, align);
+ size = round_up(size + n->n_descsz, align);
+
+ if (buf + size < buf)
+ return NULL;
+ else
+ return (buf + size);
+}
+
+static inline bool test_property_x86(void *buf, u32 *arg)
+{
+ struct property_x86 *pr = buf;
+ u32 max_type = *arg;
+
+ if (pr->pr_type > max_type)
+ *arg = pr->pr_type;
+
+ return (pr->pr_type == GNU_PROPERTY_X86_FEATURE_1_AND);
+}
+
+static inline void *next_property(void *buf, u32 *arg)
+{
+ struct property_x86 *pr = buf;
+ u32 max_type = *arg;
+
+ if ((buf + sizeof(*pr) + pr->pr_datasz < buf) ||
+ (pr->pr_type > GNU_PROPERTY_X86_FEATURE_1_AND) ||
+ (pr->pr_type > max_type))
+ return NULL;
+ else
+ return (buf + sizeof(*pr) + pr->pr_datasz);
+}
+
+/*
+ * Scan 'buf' for a pattern; return true if found.
+ * *pos is the distance from the beginning of buf to where
+ * the searched item or the next item is located.
+ */
+static int scan(u8 *buf, u32 buf_size, int item_size,
+ test_fn test, next_fn next, u32 *arg, u32 *pos)
+{
+ int found = 0;
+ u8 *p, *max;
+
+ max = buf + buf_size;
+ if (max < buf)
+ return 0;
+
+ p = buf;
+
+ while ((p + item_size < max) && (p + item_size > buf)) {
+ if (test(p, arg)) {
+ found = 1;
+ break;
+ }
+
+ p = next(p, arg);
+ }
+
+ *pos = (p + item_size <= buf) ? 0 : (u32)(p - buf);
+ return found;
+}
+
+/*
+ * Search the descriptor of a NT_GNU_PROPERTY_TYPE_0 note for
+ * GNU_PROPERTY_X86_FEATURE_1_AND and store its 4-byte value in *feature.
+ * The descriptor is read through 'buf' in chunks of at most BUF_SIZE.
+ * Returns 0 (*feature stays 0 if nothing is found) or a negative errno.
+ */
+static int find_feature_x86(struct file *file, unsigned long desc_size,
+			     loff_t file_offset, u8 *buf, u32 *feature)
+{
+	struct property_x86 *pr;
+	unsigned long read_size = 0;
+	unsigned long done;
+	u32 buf_pos = 0;
+	u32 last_pr = 0;
+	int found = 0;
+	int ret;
+
+	*feature = 0;
+
+	for (done = 0; done < desc_size; done += buf_pos) {
+		read_size = desc_size - done;
+		if (read_size > BUF_SIZE)
+			read_size = BUF_SIZE;
+
+		ret = kernel_read(file, buf, read_size, &file_offset);
+		if (ret != read_size)
+			return (ret < 0) ? ret : -EIO;
+		found = scan(buf, read_size, sizeof(struct property_x86),
+			     test_property_x86, next_property,
+			     &last_pr, &buf_pos);
+		if ((!buf_pos) || found)
+			break;
+
+		/* The next chunk starts where scan() stopped (buf_pos). */
+		file_offset += buf_pos - read_size;
+	}
+
+	if (!found)
+		return 0;
+
+	pr = (struct property_x86 *)(buf + buf_pos);
+	if (pr->pr_datasz != 4)
+		return 0;
+
+	if (buf_pos + sizeof(*pr) + 4 <= read_size) {
+		*feature = *(u32 *)(pr + 1);
+		return 0;
+	}
+
+	/* pr_data lies past the end of the buffer; read it from the file. */
+	file_offset += buf_pos - read_size;
+	file_offset += sizeof(*pr);
+	ret = kernel_read(file, feature, 4, &file_offset);
+	if (ret == 4)
+		return 0;
+	*feature = 0;
+	return (ret < 0) ? ret : -EIO;
+}
+
+/*
+ * Search a PT_NOTE segment for the first NT_GNU_PROPERTY_TYPE_0.
+ */
+static int find_note_type_0(struct file *file, unsigned long note_size,
+ loff_t file_offset, u32 align, u32 *feature)
+{
+ u8 *buf;
+ u32 buf_pos;
+ unsigned long read_size;
+ unsigned long done;
+ int found = 0;
+ int ret = 0;
+
+ buf = kmalloc(BUF_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ *feature = 0;
+ buf_pos = 0;
+
+ for (done = 0; done < note_size; done += buf_pos) {
+ read_size = note_size - done;
+ if (read_size > BUF_SIZE)
+ read_size = BUF_SIZE;
+
+ ret = kernel_read(file, buf, read_size, &file_offset);
+
+ if (ret != read_size) {
+ ret = (ret < 0) ? ret : -EIO;
+ kfree(buf);
+ return ret;
+ }
+
+ /*
+ * item_size = sizeof(struct elf_note) + elf_note.n_namesz.
+ * n_namesz is 4 for the note type we look for.
+ */
+ ret = 0;
+ found += scan(buf, read_size, sizeof(struct elf_note) + 4,
+ test_note_type_0, next_note,
+ &align, &buf_pos);
+
+ file_offset += buf_pos - read_size;
+
+ if (found == 1) {
+ struct elf_note *n =
+ (struct elf_note *)(buf + buf_pos);
+ u32 start = round_up(sizeof(*n) + n->n_namesz, align);
+ u32 total = round_up(start + n->n_descsz, align);
+
+ ret = find_feature_x86(file, n->n_descsz,
+ file_offset + start,
+ buf, feature);
+ file_offset += total;
+ buf_pos += total;
+ } else if (!buf_pos) {
+ *feature = 0;
+ break;
+ }
+ }
+
+ kfree(buf);
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int check_notes_32(struct file *file, struct elf32_phdr *phdr,
+ int phnum, u32 *feature)
+{
+ int i;
+ int err = 0;
+
+ for (i = 0; i < phnum; i++, phdr++) {
+ if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 4))
+ continue;
+
+ err = find_note_type_0(file, phdr->p_filesz, phdr->p_offset,
+ phdr->p_align, feature);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_X86_64
+static int check_notes_64(struct file *file, struct elf64_phdr *phdr,
+ int phnum, u32 *feature)
+{
+ int i;
+ int err = 0;
+
+ for (i = 0; i < phnum; i++, phdr++) {
+ if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 8))
+ continue;
+
+ err = find_note_type_0(file, phdr->p_filesz, phdr->p_offset,
+ phdr->p_align, feature);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+#endif
+
+/*
+ * Look for GNU_PROPERTY_X86_FEATURE_1_AND in the PT_NOTE segments of an
+ * ELF file and set up shadow stack for the current task if requested.
+ * 'interp' is currently unused. Returns 0 or a negative error code.
+ */
+int arch_setup_features(void *ehdr_p, void *phdr_p,
+			struct file *file, bool interp)
+{
+	struct elf64_hdr *ehdr64 = ehdr_p;
+	u32 feature = 0;
+	int err = 0;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+		return 0;
+
+	if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
+		struct elf64_phdr *phdr64 = phdr_p;
+
+		err = check_notes_64(file, phdr64, ehdr64->e_phnum,
+				     &feature);
+	} else {
+#ifdef CONFIG_COMPAT
+		struct elf32_hdr *ehdr32 = ehdr_p;
+
+		if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
+			struct elf32_phdr *phdr32 = phdr_p;
+
+			err = check_notes_32(file, phdr32, ehdr32->e_phnum,
+					     &feature);
+		}
+#endif
+	}
+	if (err < 0)
+		return err;
+
+	memset(&current->thread.cet, 0, sizeof(struct cet_status));
+
+	/* SHSTK support was already verified at the top of the function. */
+	if (feature & GNU_PROPERTY_X86_FEATURE_1_SHSTK) {
+		err = cet_setup_shstk();
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index efae2fb0930a..b891aa292b46 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1081,6 +1081,21 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out_free_dentry;
}
+#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
+ if (interpreter) {
+ retval = arch_setup_features(&loc->interp_elf_ex,
+ interp_elf_phdata,
+ interpreter, true);
+ } else {
+ retval = arch_setup_features(&loc->elf_ex,
+ elf_phdata,
+ bprm->file, false);
+ }
+
+ if (retval < 0)
+ goto out_free_dentry;
+#endif
+
if (elf_interpreter) {
unsigned long interp_map_addr = 0;
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index c5358e0ae7c5..5ef25a565e88 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -372,6 +372,7 @@ typedef struct elf64_shdr {
#define NT_PRFPREG 2
#define NT_PRPSINFO 3
#define NT_TASKSTRUCT 4
+#define NT_GNU_PROPERTY_TYPE_0 5
#define NT_AUXV 6
/*
* Note to userspace developers: size of NT_SIGINFO note may increase
--
2.17.1
This patch adds basic shadow stack enabling/disabling routines.
A task's shadow stack is allocated from memory with VM_SHSTK
flag set and read-only protection. The shadow stack is
allocated to a fixed size of RLIMIT_STACK.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/cet.h | 30 +++++++
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/msr-index.h | 14 +++
arch/x86/include/asm/processor.h | 5 ++
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/cet.c | 109 +++++++++++++++++++++++
arch/x86/kernel/cpu/common.c | 24 +++++
arch/x86/kernel/process.c | 2 +
fs/proc/task_mmu.c | 3 +
9 files changed, 196 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/include/asm/cet.h
create mode 100644 arch/x86/kernel/cet.c
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
new file mode 100644
index 000000000000..ad278c520414
--- /dev/null
+++ b/arch/x86/include/asm/cet.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CET_H
+#define _ASM_X86_CET_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+/*
+ * Per-thread CET status
+ */
+struct cet_status {
+ unsigned long shstk_base;
+ unsigned long shstk_size;
+ unsigned int shstk_enabled:1;
+};
+
+#ifdef CONFIG_X86_INTEL_CET
+int cet_setup_shstk(void);
+void cet_disable_shstk(void);
+void cet_disable_free_shstk(struct task_struct *p);
+#else
+static inline int cet_setup_shstk(void) { return 0; }
+static inline void cet_disable_shstk(void) {}
+static inline void cet_disable_free_shstk(struct task_struct *p) {}
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33833d1909af..3624a11e5ba6 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -56,6 +56,12 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+#define DISABLE_SHSTK 0
+#else
+#define DISABLE_SHSTK (1<<(X86_FEATURE_SHSTK & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -75,7 +81,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP|DISABLE_SHSTK)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4731f0cf97c5..e073801a44e0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -777,4 +777,18 @@
#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
+/* Control-flow Enforcement Technology MSRs */
+#define MSR_IA32_U_CET 0x6a0 /* user mode cet setting */
+#define MSR_IA32_S_CET 0x6a2 /* kernel mode cet setting */
+#define MSR_IA32_PL0_SSP 0x6a4 /* kernel shstk pointer */
+#define MSR_IA32_PL3_SSP 0x6a7 /* user shstk pointer */
+#define MSR_IA32_INT_SSP_TAB 0x6a8 /* exception shstk table */
+
+/* MSR_IA32_U_CET and MSR_IA32_S_CET bits */
+#define MSR_IA32_CET_SHSTK_EN 0x0000000000000001ULL
+#define MSR_IA32_CET_WRSS_EN 0x0000000000000002ULL
+#define MSR_IA32_CET_ENDBR_EN 0x0000000000000004ULL
+#define MSR_IA32_CET_LEG_IW_EN 0x0000000000000008ULL
+#define MSR_IA32_CET_NO_TRACK_EN 0x0000000000000010ULL
+
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d53c54b842da..63918cecf367 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -24,6 +24,7 @@ struct vm86;
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
+#include <asm/cet.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -505,6 +506,10 @@ struct thread_struct {
unsigned int sig_on_uaccess_err:1;
unsigned int uaccess_err:1; /* uaccess failed */
+#ifdef CONFIG_X86_INTEL_CET
+ struct cet_status cet;
+#endif
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8824d01c0c35..fbb2d91fb756 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -139,6 +139,8 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
+obj-$(CONFIG_X86_INTEL_CET) += cet.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index 000000000000..ec256ae27a31
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * cet.c - Control Flow Enforcement (CET)
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Yu-cheng Yu <[email protected]>
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <asm/msr.h>
+#include <asm/user.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/compat.h>
+#include <asm/cet.h>
+
+static int set_shstk_ptr(unsigned long addr)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -1;
+
+ if ((addr >= TASK_SIZE_MAX) || (!IS_ALIGNED(addr, 4)))
+ return -1;
+
+ rdmsrl(MSR_IA32_U_CET, r);
+ wrmsrl(MSR_IA32_PL3_SSP, addr);
+ wrmsrl(MSR_IA32_U_CET, r | MSR_IA32_CET_SHSTK_EN);
+ return 0;
+}
+
+static unsigned long get_shstk_addr(void)
+{
+ unsigned long ptr;
+
+ if (!current->thread.cet.shstk_enabled)
+ return 0;
+
+ rdmsrl(MSR_IA32_PL3_SSP, ptr);
+ return ptr;
+}
+
+int cet_setup_shstk(void)
+{
+ unsigned long addr, size;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -EOPNOTSUPP;
+
+ size = rlimit(RLIMIT_STACK);
+ addr = do_mmap_locked(0, size, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK);
+
+ /*
+ * Return actual error from do_mmap().
+ */
+ if (addr >= TASK_SIZE_MAX)
+ return addr;
+
+ set_shstk_ptr(addr + size - sizeof(u64));
+ current->thread.cet.shstk_base = addr;
+ current->thread.cet.shstk_size = size;
+ current->thread.cet.shstk_enabled = 1;
+ return 0;
+}
+
+void cet_disable_shstk(void)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return;
+
+ rdmsrl(MSR_IA32_U_CET, r);
+ r &= ~(MSR_IA32_CET_SHSTK_EN);
+ wrmsrl(MSR_IA32_U_CET, r);
+ wrmsrl(MSR_IA32_PL3_SSP, 0);
+ current->thread.cet.shstk_enabled = 0;
+}
+
+void cet_disable_free_shstk(struct task_struct *tsk)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+ !tsk->thread.cet.shstk_enabled)
+ return;
+
+ if (tsk == current)
+ cet_disable_shstk();
+
+ /*
+ * Free only when tsk is current or shares mm
+ * with current but has its own shstk.
+ */
+ if (tsk->mm && (tsk->mm == current->mm) &&
+ (tsk->thread.cet.shstk_base)) {
+ vm_munmap(tsk->thread.cet.shstk_base,
+ tsk->thread.cet.shstk_size);
+ tsk->thread.cet.shstk_base = 0;
+ tsk->thread.cet.shstk_size = 0;
+ }
+
+ tsk->thread.cet.shstk_enabled = 0;
+}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 44c4ef3d989b..bffa9ef47832 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -411,6 +411,29 @@ static __init int setup_disable_pku(char *arg)
__setup("nopku", setup_disable_pku);
#endif /* CONFIG_X86_64 */
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK))
+ cr4_set_bits(X86_CR4_CET);
+}
+
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+static __init int setup_disable_shstk(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ if (!boot_cpu_has(X86_FEATURE_SHSTK))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_SHSTK);
+ pr_info("x86: 'no_cet_shstk' specified, disabling Shadow Stack\n");
+ return 1;
+}
+__setup("no_cet_shstk", setup_disable_shstk);
+#endif
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -1376,6 +1399,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
x86_init_rdrand(c);
x86_init_cache_qos(c);
setup_pku(c);
+ setup_cet(c);
/*
* Clear/Set all flags overridden by options, need do it
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c93fcfdf1673..4a776da4c28c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,6 +39,7 @@
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
+#include <asm/cet.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -134,6 +135,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ cet_disable_shstk();
fpu__clear(&tsk->thread.fpu);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64cb0b4..b20450dde5b7 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -652,6 +652,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_PKEY_BIT4)] = "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+ [ilog2(VM_SHSTK)] = "ss"
+#endif
};
size_t i;
--
2.17.1
WRUSS is a new kernel-mode instruction that writes directly
to user shadow stack memory. It is used to construct
a return address on the shadow stack for the signal
handler.
This instruction can fault if the target address is not
valid shadow stack memory. In that case, the kernel does
a fixup.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/special_insns.h | 32 ++++++++++++++++++++++++++++
arch/x86/mm/fault.c | 9 ++++++++
2 files changed, 41 insertions(+)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 317fc59b512c..c04e68ef47da 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -237,6 +237,38 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
+#ifdef CONFIG_X86_INTEL_CET
+#if defined(CONFIG_IA32_EMULATION) || defined(CONFIG_X86_X32)
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+ asm_volatile_goto("1: wrussd %1, (%0)\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: "r" (addr), "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -1;
+}
+#else
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+ WARN_ONCE(1, "write_user_shstk_32 used but not supported.\n");
+ return -EFAULT;
+}
+#endif
+
+static inline int write_user_shstk_64(unsigned long addr, unsigned long val)
+{
+ asm_volatile_goto("1: wrussq %1, (%0)\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: "r" (addr), "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -1;
+}
+#endif /* CONFIG_X86_INTEL_CET */
+
#define nop() asm volatile ("nop")
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7c3877a982f4..4d4ac57a4ba2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1305,6 +1305,15 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
error_code |= X86_PF_USER;
flags |= FAULT_FLAG_USER;
} else {
+ /*
+ * WRUSS is a kernel instruction but writes
+ * to user shadow stack. When a fault occurs,
+ * both X86_PF_USER and X86_PF_SHSTK are set.
+ * Clear X86_PF_USER here.
+ */
+ if ((error_code & (X86_PF_USER | X86_PF_SHSTK)) ==
+ (X86_PF_USER | X86_PF_SHSTK))
+ error_code &= ~X86_PF_USER;
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
--
2.17.1
When a task does fork(), its shadow stack must be duplicated for
the child. However, the child may not actually use all pages
of the copied shadow stack. This patch implements a flow that
is similar to copy-on-write of an anonymous page, but for shadow
stack memory. A shadow stack PTE needs to be RO and dirty. We
use this dirty bit requirement to effect the copying of shadow
stack pages.
In copy_one_pte(), we clear the dirty bit from the shadow stack
PTE. On the next shadow stack access to the PTE, a page fault
occurs. At that time, we then copy/re-use the page and fix the
PTE.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/mm/pgtable.c | 15 +++++++++++++++
include/asm-generic/pgtable.h | 8 ++++++++
mm/memory.c | 7 ++++++-
3 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ae394552fb94..57eeb2230340 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -872,3 +872,18 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHSTK)
+ return pte_mkdirty_shstk(pte);
+ else
+ return pte;
+}
+
+inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
+{
+ return (vm_flags & VM_SHSTK);
+}
+#endif /* CONFIG_X86_INTEL_SHADOW_STACK_USER */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 88ebc6102c7c..b99aa3677350 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1127,4 +1127,12 @@ static inline bool arch_has_pfn_modify_check(void)
#endif
#endif
+#ifndef CONFIG_ARCH_HAS_SHSTK
+#define pte_set_vma_features(pte, vma) pte
+#define arch_copy_pte_mapping(vma_flags) false
+#else
+inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma);
+bool arch_copy_pte_mapping(vm_flags_t vm_flags);
+#endif
+
#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/mm/memory.c b/mm/memory.c
index c467102a5cbc..1fb676ec7da2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1022,7 +1022,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
- if (is_cow_mapping(vm_flags) && pte_write(pte)) {
+ if ((is_cow_mapping(vm_flags) && pte_write(pte)) ||
+ arch_copy_pte_mapping(vm_flags)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -2462,6 +2463,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = pte_set_vma_features(entry, vma);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2535,6 +2537,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = pte_set_vma_features(entry, vma);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
@@ -3045,6 +3048,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte = mk_pte(page, vma->vm_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ pte = pte_set_vma_features(pte, vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
@@ -3187,6 +3191,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_set_vma_features(entry, vma);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
--
2.17.1
If a page fault is triggered by a shadow stack access (e.g.
call/ret) or shadow stack management instructions (e.g.
wrussq), then bit[6] of the page fault error code is set.
In access_error(), we check if a shadow stack page fault
is within a shadow stack memory area.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/traps.h | 2 ++
arch/x86/mm/fault.c | 18 ++++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 5196050ff3d5..58ea2f5722e9 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -157,6 +157,7 @@ enum {
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
+ * bit 6 == 1: shadow stack access fault
*/
enum x86_pf_error_code {
X86_PF_PROT = 1 << 0,
@@ -165,5 +166,6 @@ enum x86_pf_error_code {
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
+ X86_PF_SHSTK = 1 << 6,
};
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 47bebfe6efa7..7c3877a982f4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1162,6 +1162,17 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
(error_code & X86_PF_INSTR), foreign))
return 1;
+ /*
+ * Verify X86_PF_SHSTK is within a shadow stack VMA.
+ * It is always an error if there is a shadow stack
+ * fault outside a shadow stack VMA.
+ */
+ if (error_code & X86_PF_SHSTK) {
+ if (!(vma->vm_flags & VM_SHSTK))
+ return 1;
+ return 0;
+ }
+
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1300,6 +1311,13 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ /*
+ * If the fault is caused by a shadow stack access,
+ * i.e. CALL/RET/SAVEPREVSSP/RSTORSSP, then set
+ * FAULT_FLAG_WRITE to effect copy-on-write.
+ */
+ if (error_code & X86_PF_SHSTK)
+ flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
--
2.17.1
When Shadow Stack is enabled, the [R/O + PAGE_DIRTY_HW] setting is
reserved only for the Shadow Stack. For non-Shadow Stack R/O PTEs,
we use [R/O + PAGE_DIRTY_SW].
When a PTE goes from [R/W + PAGE_DIRTY_HW] to [R/O + PAGE_DIRTY_SW],
it could become a transient Shadow Stack PTE in two cases.
The first case is that some processors can start a write but end up
seeing a read-only PTE by the time they get to the Dirty bit,
creating a transient Shadow Stack PTE. However, this will not occur
on processors supporting Shadow Stack; therefore, we don't need a TLB
flush here.
The second case is that when software non-atomically tests and
replaces PAGE_DIRTY_HW with PAGE_DIRTY_SW, a transient Shadow Stack
PTE can exist. This is prevented by doing the update with cmpxchg.
Dave Hansen, Jann Horn, Andy Lutomirski, and Peter Zijlstra provided
many insights into the issue. Jann Horn provided the cmpxchg solution.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/pgtable.h | 58 ++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3ee554d81480..b6e0ee5c5503 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1203,7 +1203,36 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+ pte_t new_pte, pte = READ_ONCE(*ptep);
+
+ /*
+ * Some processors can start a write, but end up
+ * seeing a read-only PTE by the time they get
+ * to the Dirty bit. In this case, they will
+ * set the Dirty bit, leaving a read-only, Dirty
+ * PTE which looks like a Shadow Stack PTE.
+ *
+ * However, this behavior has been improved and
+ * will not occur on processors supporting
+ * Shadow Stacks. Without this guarantee, a
+ * transition to a non-present PTE followed by a
+ * TLB flush would be needed.
+ *
+ * When changing a writable PTE to read-only, if it has
+ * _PAGE_DIRTY_HW set, we move that bit to _PAGE_DIRTY_SW
+ * so that the PTE is not a valid Shadow Stack PTE. The
+ * cmpxchg loop makes this test-and-replace atomic.
+ */
+ do {
+ new_pte = pte_wrprotect(pte);
+ new_pte.pte |= (new_pte.pte & _PAGE_DIRTY_HW) >>
+ _PAGE_BIT_DIRTY_HW << _PAGE_BIT_DIRTY_SW;
+ new_pte.pte &= ~_PAGE_DIRTY_HW;
+ } while (!try_cmpxchg(ptep, &pte, new_pte));
+#else
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+#endif
}
#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
@@ -1266,7 +1295,36 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+ pmd_t new_pmd, pmd = READ_ONCE(*pmdp);
+
+ /*
+ * Some processors can start a write, but end up
+ * seeing a read-only PMD by the time they get
+ * to the Dirty bit. In this case, they will
+ * set the Dirty bit, leaving a read-only, Dirty
+ * PMD which looks like a Shadow Stack PMD.
+ *
+ * However, this behavior has been improved and
+ * will not occur on processors supporting
+ * Shadow Stacks. Without this guarantee, a
+ * transition to a non-present PMD followed by a
+ * TLB flush would be needed.
+ *
+ * When changing a writable PMD to read-only, if it has
+ * _PAGE_DIRTY_HW set, we move that bit to _PAGE_DIRTY_SW
+ * so that the PMD is not a valid Shadow Stack PMD. The
+ * cmpxchg loop makes this test-and-replace atomic.
+ */
+ do {
+ new_pmd = pmd_wrprotect(pmd);
+ new_pmd.pmd |= (new_pmd.pmd & _PAGE_DIRTY_HW) >>
+ _PAGE_BIT_DIRTY_HW << _PAGE_BIT_DIRTY_SW;
+ new_pmd.pmd &= ~_PAGE_DIRTY_HW;
+ } while (!try_cmpxchg(pmdp, &pmd, new_pmd));
+#else
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+#endif
}
#define pud_write pud_write
--
2.17.1
There are a few places that need do_mmap() with mm->mmap_sem held.
Create an inline function for that.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
include/linux/mm.h | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f40387ecd920..c4cc07baccda 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2318,6 +2318,24 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif
+/* Call do_mmap() with current->mm->mmap_sem held; populate after unlock. */
+static inline unsigned long do_mmap_locked(unsigned long addr,
+	unsigned long len, unsigned long prot, unsigned long flags,
+	vm_flags_t vm_flags)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long mapped, populate_len;
+
+	down_write(&mm->mmap_sem);
+	mapped = do_mmap(NULL, addr, len, prot, flags, vm_flags, 0,
+			 &populate_len, NULL);
+	up_write(&mm->mmap_sem);
+
+	if (populate_len)
+		mm_populate(mapped, populate_len);
+	return mapped;
+}
+
/* These take the mm semaphore themselves */
extern int __must_check vm_brk(unsigned long, unsigned long);
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
--
2.17.1
The shadow stack for clone/fork is handled as follows:
(1) If ((clone_flags & (CLONE_VFORK | CLONE_VM)) == CLONE_VM),
the kernel allocates (and frees on thread exit) a new SHSTK
for the child.
It is possible for the kernel to complete the clone syscall
and set the child's SHSTK pointer to NULL and let the child
thread allocate a SHSTK for itself. There are two issues
with this approach: It is not compatible with existing code
that makes inline syscalls, and it cannot handle signals before
the child can successfully allocate a SHSTK.
(2) For (clone_flags & CLONE_VFORK), the child uses the existing
SHSTK.
(3) For all other cases, the SHSTK is copied/reused whenever the
parent or the child does a call/ret.
This patch handles cases (1) & (2). Case (3) is handled in
the SHSTK page fault patches.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/cet.h | 2 ++
arch/x86/include/asm/mmu_context.h | 3 +++
arch/x86/kernel/cet.c | 34 ++++++++++++++++++++++++++++++
arch/x86/kernel/process.c | 1 +
arch/x86/kernel/process_64.c | 7 ++++++
5 files changed, 47 insertions(+)
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index d9ae3d86cdd7..b7b33e1026bb 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -17,12 +17,14 @@ struct cet_status {
#ifdef CONFIG_X86_INTEL_CET
int cet_setup_shstk(void);
+int cet_setup_thread_shstk(struct task_struct *p);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
int cet_restore_signal(unsigned long ssp);
int cet_setup_signal(bool ia32, unsigned long rstor, unsigned long *new_ssp);
#else
static inline int cet_setup_shstk(void) { return 0; }
+static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline int cet_restore_signal(unsigned long ssp) { return 0; }
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index eeeb9289c764..8da7c999b7ee 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
+#include <asm/cet.h>
extern atomic64_t last_mm_ctx_id;
@@ -223,6 +224,8 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
+ if (!tsk->vfork_done) \
+ cet_disable_free_shstk(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 5cc4be6e0982..ce0b3b7b1160 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -134,6 +134,40 @@ int cet_setup_shstk(void)
return 0;
}
+/*
+ * Map a shadow stack for the new thread @tsk and point its saved user
+ * SSP at the last 8-byte slot of that area.  Returns 0 (also when shadow
+ * stack is not enabled for current), -EINVAL or -ENOMEM.
+ */
+int cet_setup_thread_shstk(struct task_struct *tsk)
+{
+	struct cet_user_state *cet_state;
+	unsigned long base, len;
+
+	if (!current->thread.cet.shstk_enabled)
+		return 0;
+
+	cet_state = get_xsave_addr(&tsk->thread.fpu.state.xsave,
+				   XFEATURE_MASK_SHSTK_USER);
+	if (!cet_state)
+		return -EINVAL;
+
+	len = tsk->thread.cet.shstk_size ?: rlimit(RLIMIT_STACK);
+	base = do_mmap_locked(0, len, PROT_READ,
+			      MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK);
+	if (base < TASK_SIZE_MAX) {
+		cet_state->user_ssp = (u64)(base + len - sizeof(u64));
+		tsk->thread.cet.shstk_base = base;
+		tsk->thread.cet.shstk_size = len;
+		return 0;
+	}
+	/* Mapping failed: clear the thread's shadow stack bookkeeping. */
+	tsk->thread.cet.shstk_base = 0;
+	tsk->thread.cet.shstk_size = 0;
+	tsk->thread.cet.shstk_enabled = 0;
+	return -ENOMEM;
+}
+
void cet_disable_shstk(void)
{
u64 r;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4a776da4c28c..440f012ef925 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -125,6 +125,7 @@ void exit_thread(struct task_struct *tsk)
free_vm86(t);
+ cet_disable_free_shstk(tsk);
fpu__drop(fpu);
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ea5ea850348d..9cdbd87bb908 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -325,6 +325,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
if (sp)
childregs->sp = sp;
+ /* Allocate a new shadow stack for pthread */
+ if ((clone_flags & (CLONE_VFORK | CLONE_VM)) == CLONE_VM) {
+ err = cet_setup_thread_shstk(p);
+ if (err)
+ goto out;
+ }
+
err = -ENOMEM;
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
--
2.17.1
can_follow_write_pte/pmd look for the (RO & DIRTY) PTE/PMD to
verify an exclusive RO page still exists after a broken COW.
A shadow stack PTE is RO & PAGE_DIRTY_SW when it is shared,
otherwise RO & PAGE_DIRTY_HW.
Introduce pte_exclusive() and pmd_exclusive() to also verify a
shadow stack PTE is exclusive.
Also rename can_follow_write_pte/pmd() to can_follow_write() to
make their meaning clear; i.e. "Can we write to the page?", not
"Is the PTE writable?"
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/mm/pgtable.c | 19 +++++++++++++++++++
include/asm-generic/pgtable.h | 4 ++++
mm/gup.c | 8 +++++---
mm/huge_memory.c | 8 +++++---
4 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ccdfd3dd7163..e13a020e37db 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -894,4 +894,23 @@ inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
{
return (vm_flags & VM_SHSTK);
}
+
+inline bool pte_exclusive(pte_t pte, struct vm_area_struct *vma)
+{
+	/* A shadow stack PTE is exclusive only when it is hardware-dirty. */
+	bool shstk = vma->vm_flags & VM_SHSTK;
+
+	return shstk ? pte_dirty_hw(pte) : pte_dirty(pte);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+inline bool pmd_exclusive(pmd_t pmd, struct vm_area_struct *vma)
+{
+	/* A shadow stack PMD is exclusive only when it is hardware-dirty. */
+	bool shstk = vma->vm_flags & VM_SHSTK;
+
+	return shstk ? pmd_dirty_hw(pmd) : pmd_dirty(pmd);
+}
+#endif
+
#endif /* CONFIG_X86_INTEL_SHADOW_STACK_USER */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a91f07454ced..6223017929be 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1131,10 +1131,14 @@ static inline bool arch_has_pfn_modify_check(void)
#define pte_set_vma_features(pte, vma) pte
#define pmd_set_vma_features(pmd, vma) pmd
#define arch_copy_pte_mapping(vma_flags) false
+#define pte_exclusive(pte, vma) pte_dirty(pte)
+#define pmd_exclusive(pmd, vma) pmd_dirty(pmd)
#else
inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma);
inline pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma);
bool arch_copy_pte_mapping(vm_flags_t vm_flags);
+bool pte_exclusive(pte_t pte, struct vm_area_struct *vma);
+bool pmd_exclusive(pmd_t pmd, struct vm_area_struct *vma);
#endif
#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/mm/gup.c b/mm/gup.c
index 1abc8b4afff6..03cb2e331f80 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -64,10 +64,12 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
* FOLL_FORCE can write to even unwritable pte's, but only
* after we've gone through a COW cycle and they are dirty.
*/
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+static inline bool can_follow_write(pte_t pte, unsigned int flags,
+ struct vm_area_struct *vma)
{
return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+ pte_exclusive(pte, vma));
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -105,7 +107,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
- if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
+ if ((flags & FOLL_WRITE) && !can_follow_write(pte, flags, vma)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index df39ae20fe40..c70aa8fa4cb2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1387,10 +1387,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
* FOLL_FORCE can write to even unwritable pmd's, but only
* after we've gone through a COW cycle and they are dirty.
*/
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+static inline bool can_follow_write(pmd_t pmd, unsigned int flags,
+ struct vm_area_struct *vma)
{
return pmd_write(pmd) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+ pmd_exclusive(pmd, vma));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1403,7 +1405,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+ if (flags & FOLL_WRITE && !can_follow_write(*pmd, flags, vma))
goto out;
/* Avoid dumping huge zero page */
--
2.17.1
When setting up a signal, the kernel creates a shadow stack
restore token at the current SHSTK address and then stores the
token's address in the signal frame, right after the FPU state.
Before restoring a signal, the kernel verifies and then uses the
restore token to set the SHSTK pointer.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/ia32/ia32_signal.c | 13 +++
arch/x86/include/asm/cet.h | 5 ++
arch/x86/include/asm/sighandling.h | 5 ++
arch/x86/include/uapi/asm/sigcontext.h | 17 ++++
arch/x86/kernel/cet.c | 115 +++++++++++++++++++++++++
arch/x86/kernel/signal.c | 96 +++++++++++++++++++++
6 files changed, 251 insertions(+)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 86b1341cba9a..cea28d2a946e 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,6 +34,7 @@
#include <asm/sigframe.h>
#include <asm/sighandling.h>
#include <asm/smap.h>
+#include <asm/cet.h>
/*
* Do a signal return; undo the signal stack.
@@ -108,6 +109,9 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
err |= fpu__restore_sig(buf, 1);
+ if (!err)
+ err = restore_sigcontext_ext(buf);
+
force_iret();
return err;
@@ -234,6 +238,10 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
if (fpu->initialized) {
unsigned long fx_aligned, math_size;
+ /* sigcontext extension */
+ if (boot_cpu_has(X86_FEATURE_SHSTK))
+ sp -= (sizeof(struct sc_ext) + 8);
+
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
*fpstate = (struct _fpstate_32 __user *) sp;
if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
@@ -277,6 +285,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
return -EFAULT;
+ if (setup_sigcontext_ext(ksig, fpstate))
+ return -EFAULT;
if (_COMPAT_NSIG_WORDS > 1) {
if (__copy_to_user(frame->extramask, &set->sig[1],
@@ -384,6 +394,9 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (!err)
+ err = setup_sigcontext_ext(ksig, fpstate);
+
if (err)
return -EFAULT;
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index ad278c520414..d9ae3d86cdd7 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -19,10 +19,15 @@ struct cet_status {
int cet_setup_shstk(void);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
+int cet_restore_signal(unsigned long ssp);
+int cet_setup_signal(bool ia32, unsigned long rstor, unsigned long *new_ssp);
#else
static inline int cet_setup_shstk(void) { return 0; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
+static inline int cet_restore_signal(unsigned long ssp) { return 0; }
+static inline int cet_setup_signal(bool ia32, unsigned long rstor,
+ unsigned long *new_ssp) { return 0; }
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index bd26834724e5..23014b4082de 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -17,4 +17,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
struct pt_regs *regs, unsigned long mask);
+#ifdef CONFIG_X86_64
+int setup_sigcontext_ext(struct ksignal *ksig, void __user *fpu);
+int restore_sigcontext_ext(void __user *fpu);
+#endif
+
#endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 844d60eb1882..74f5ea5dcd24 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -196,6 +196,23 @@ struct _xstate {
/* New processor state extensions go here: */
};
+#ifdef __x86_64__
+/*
+ * Sigcontext extension (struct sc_ext) is located after
+ * sigcontext->fpstate. Because currently only the shadow
+ * stack pointer is saved there and the shadow stack depends
+ * on XSAVES, we can find sc_ext from sigcontext->fpstate.
+ *
+ * The 64-bit fpstate has a size of fpu_user_xstate_size, plus
+ * FP_XSTATE_MAGIC2_SIZE when XSAVE* is used. The struct sc_ext
+ * is located at the end of sigcontext->fpstate, aligned to 8.
+ */
+struct sc_ext {
+ unsigned long total_size; /* sizeof(struct sc_ext); verified when read back */
+ unsigned long ssp; /* shadow stack address of the restore token */
+};
+#endif
+
/*
* The 32-bit signal frame:
*/
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index ec256ae27a31..5cc4be6e0982 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -18,6 +18,7 @@
#include <asm/fpu/types.h>
#include <asm/compat.h>
#include <asm/cet.h>
+#include <asm/special_insns.h>
static int set_shstk_ptr(unsigned long addr)
{
@@ -46,6 +47,69 @@ static unsigned long get_shstk_addr(void)
return ptr;
}
+/*
+ * Verify the restore token at the address of 'ssp' and return
+ * the shadow stack pointer it records through 'new_ssp' (zero
+ * on failure).  Returns 0, -EINVAL for a misaligned 'ssp' or a
+ * malformed token, or -EFAULT if the token cannot be read.
+ */
+static int verify_rstor_token(bool ia32, unsigned long ssp,
+			      unsigned long *new_ssp)
+{
+	unsigned long token;
+
+	*new_ssp = 0;
+
+	if (!IS_ALIGNED(ssp, 8))
+		return -EINVAL;
+
+	if (get_user(token, (unsigned long __user *)ssp))
+		return -EFAULT;
+
+	/* Low two bits: 0 for an ia32 token, 1 for a 64-bit token. */
+	if ((token & 3) != (ia32 ? 0 : 1))
+		return -EINVAL;
+
+	token &= ~(1UL);
+
+	if ((!ia32 && !IS_ALIGNED(token, 8)) || !IS_ALIGNED(token, 4))
+		return -EINVAL;
+
+	/* The token must sit in the slot just below the SSP it records. */
+	if ((ALIGN_DOWN(token, 8) - 8) != ssp)
+		return -EINVAL;
+
+	*new_ssp = token;
+	return 0;
+}
+
+/*
+ * Write a restore token for 'ssp' into the 8-byte aligned slot below it
+ * and return the slot address through 'new_ssp' (zero on failure).
+ */
+static int create_rstor_token(bool ia32, unsigned long ssp,
+			      unsigned long *new_ssp)
+{
+	unsigned long slot, token;
+
+	*new_ssp = 0;
+
+	if (!IS_ALIGNED(ssp, 4))
+		return -EINVAL;
+	if (!ia32 && !IS_ALIGNED(ssp, 8))
+		return -EINVAL;
+
+	/* Bit 0 of the token marks a 64-bit frame. */
+	token = ia32 ? ssp : (ssp | 1);
+	slot = ALIGN_DOWN(ssp, 8) - 8;
+
+	if (write_user_shstk_64(slot, token))
+		return -EFAULT;
+
+	*new_ssp = slot;
+	return 0;
+}
+
int cet_setup_shstk(void)
{
unsigned long addr, size;
@@ -107,3 +171,54 @@ void cet_disable_free_shstk(struct task_struct *tsk)
tsk->thread.cet.shstk_enabled = 0;
}
+
+/*
+ * Validate the restore token at 'ssp' and set the shadow stack pointer
+ * from it.  A no-op returning 0 when shadow stack is not enabled.
+ */
+int cet_restore_signal(unsigned long ssp)
+{
+	unsigned long restored_ssp;
+	int ret;
+
+	if (!current->thread.cet.shstk_enabled)
+		return 0;
+
+	ret = verify_rstor_token(in_ia32_syscall(), ssp, &restored_ssp);
+	return ret ? ret : set_shstk_ptr(restored_ssp);
+}
+
+/*
+ * Set up the shadow stack for a signal handler: create a restore
+ * token recording the current SSP (its address is returned through
+ * 'new_ssp'), write the handler's return address 'rstor_addr' below
+ * it, and move the SSP there.  Returns 0 on success, else non-zero.
+ */
+int cet_setup_signal(bool ia32, unsigned long rstor_addr,
+		     unsigned long *new_ssp)
+{
+	unsigned long ssp;
+	int err;
+
+	if (!current->thread.cet.shstk_enabled)
+		return 0;
+
+	ssp = get_shstk_addr();
+	err = create_rstor_token(ia32, ssp, new_ssp);
+	if (err)
+		return err;
+
+	if (ia32) {
+		ssp = *new_ssp - sizeof(u32);
+		err = write_user_shstk_32(ssp, (unsigned int)rstor_addr);
+	} else {
+		ssp = *new_ssp - sizeof(u64);
+		err = write_user_shstk_64(ssp, rstor_addr);
+	}
+
+	if (err)
+		return err;
+
+	/* Report a failure to set the new SSP instead of ignoring it. */
+	return set_shstk_ptr(ssp);
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 92a3b312a53c..e9a85689143f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -46,6 +46,7 @@
#include <asm/sigframe.h>
#include <asm/signal.h>
+#include <asm/cet.h>
#define COPY(x) do { \
get_user_ex(regs->x, &sc->x); \
@@ -152,6 +153,10 @@ static int restore_sigcontext(struct pt_regs *regs,
err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
+#ifdef CONFIG_X86_64
+ err |= restore_sigcontext_ext(buf);
+#endif
+
force_iret();
return err;
@@ -266,6 +271,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
}
if (fpu->initialized) {
+#ifdef CONFIG_X86_64
+ /* sigcontext extension */
+ if (boot_cpu_has(X86_FEATURE_SHSTK))
+ sp -= sizeof(struct sc_ext) + 8;
+#endif
sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
&buf_fx, &math_size);
*fpstate = (void __user *)sp;
@@ -493,6 +503,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (!err)
+ err = setup_sigcontext_ext(ksig, fp);
+
if (err)
return -EFAULT;
@@ -576,6 +589,9 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (!err)
+ err = setup_sigcontext_ext(ksig, fpstate);
+
if (err)
return -EFAULT;
@@ -707,6 +723,86 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
}
}
+#ifdef CONFIG_X86_64
+/* Read the sc_ext stored after the user xstate image at 'fpu'. */
+static int copy_ext_from_user(struct sc_ext *ext, void __user *fpu)
+{
+	unsigned long uaddr;
+	void __user *src;
+
+	if (!fpu)
+		return -EINVAL;
+
+	/* 8-byte aligned, past the xstate image and FP_XSTATE_MAGIC2. */
+	uaddr = (unsigned long)fpu + fpu_user_xstate_size;
+	uaddr += FP_XSTATE_MAGIC2_SIZE;
+	src = (void __user *)ALIGN(uaddr, 8);
+
+	if (!access_ok(VERIFY_READ, src, sizeof(*ext)) ||
+	    __copy_from_user(ext, src, sizeof(*ext)))
+		return -EFAULT;
+
+	return ext->total_size == sizeof(*ext) ? 0 : -EINVAL;
+}
+
+/* Store 'ext' after the user xstate image at 'fpu', 8-byte aligned. */
+static int copy_ext_to_user(void __user *fpu, struct sc_ext *ext)
+{
+	unsigned long uaddr;
+	void __user *dst;
+
+	if (!fpu)
+		return -EINVAL;
+	if (ext->total_size != sizeof(*ext))
+		return -EINVAL;
+
+	uaddr = (unsigned long)fpu + fpu_user_xstate_size;
+	uaddr += FP_XSTATE_MAGIC2_SIZE;
+	dst = (void __user *)ALIGN(uaddr, 8);
+
+	if (!access_ok(VERIFY_WRITE, dst, sizeof(*ext)) ||
+	    __copy_to_user(dst, ext, sizeof(*ext)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Restore the shadow stack pointer from the sc_ext that follows 'fp'. */
+int restore_sigcontext_ext(void __user *fp)
+{
+	struct sc_ext ext = {0, 0};
+	int err;
+
+	if (!boot_cpu_has(X86_FEATURE_SHSTK) || !fp)
+		return 0;
+
+	err = copy_ext_from_user(&ext, fp);
+	if (err)
+		return err;
+
+	return cet_restore_signal(ext.ssp);
+}
+
+/* Prepare the shadow stack for the handler and save sc_ext after 'fp'. */
+int setup_sigcontext_ext(struct ksignal *ksig, void __user *fp)
+{
+	struct sc_ext ext = {0, 0};
+	unsigned long restorer;
+	int err;
+
+	if (!boot_cpu_has(X86_FEATURE_SHSTK) || !fp)
+		return 0;
+
+	restorer = (unsigned long)ksig->ka.sa.sa_restorer;
+	err = cet_setup_signal(is_ia32_frame(ksig), restorer, &ext.ssp);
+	if (err)
+		return err;
+
+	ext.total_size = sizeof(ext);
+	return copy_ext_to_user(fp, &ext);
+}
+#endif
+
static void
handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
--
2.17.1
We are going to create _PAGE_DIRTY_SW for non-hardware, memory
management purposes. Rename _PAGE_DIRTY to _PAGE_DIRTY_HW and
_PAGE_BIT_DIRTY to _PAGE_BIT_DIRTY_HW to make the meaning of these
PTE dirty bits clearer. There are no functional changes in this
patch.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/pgtable.h | 6 +++---
arch/x86/include/asm/pgtable_types.h | 17 +++++++++--------
arch/x86/kernel/relocate_kernel_64.S | 2 +-
arch/x86/kvm/vmx.c | 2 +-
4 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 690c0307afed..95c918ad84ed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -316,7 +316,7 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pte_set_flags(pte, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -390,7 +390,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pmd_set_flags(pmd, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -444,7 +444,7 @@ static inline pud_t pud_wrprotect(pud_t pud)
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pud_set_flags(pud, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b64acb08a62b..0657a22d5216 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -15,7 +15,7 @@
#define _PAGE_BIT_PWT 3 /* page write through */
#define _PAGE_BIT_PCD 4 /* page cache disabled */
#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
-#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_DIRTY_HW 6 /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
@@ -45,7 +45,7 @@
#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_DIRTY_HW (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY_HW)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
@@ -73,7 +73,7 @@
_PAGE_PKEY_BIT3)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY_HW | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif
@@ -112,9 +112,9 @@
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
- _PAGE_ACCESSED | _PAGE_DIRTY)
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW)
#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW)
/*
* Set of bits not changed in pte_modify. The pte's
@@ -123,7 +123,7 @@
* pte_modify() does modify it.
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_HW | \
_PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
@@ -168,7 +168,8 @@ enum page_cache_mode {
_PAGE_ACCESSED)
#define __PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY_HW | _PAGE_ACCESSED | \
+ _PAGE_GLOBAL)
#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
@@ -187,7 +188,7 @@ enum page_cache_mode {
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY | _PAGE_ENC)
+ _PAGE_DIRTY_HW | _PAGE_ENC)
#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 11eda21eb697..e7665a4767b3 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -17,7 +17,7 @@
*/
#define PTR(x) (x << 3)
-#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY_HW)
/*
* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 533a327372c8..35f01203a14b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5848,7 +5848,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
/* Set up identity-mapping pagetable for EPT in real mode */
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW | _PAGE_PSE);
r = kvm_write_guest_page(kvm, identity_map_pfn,
&tmp, i * sizeof(tmp), sizeof(tmp));
if (r < 0)
--
2.17.1
XSAVES saves both system and user states. The Linux kernel
currently does not save/restore any system states. This patch
creates the framework for supporting system states.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/fpu/internal.h | 3 +-
arch/x86/include/asm/fpu/xstate.h | 9 ++-
arch/x86/kernel/fpu/core.c | 7 +-
arch/x86/kernel/fpu/init.c | 10 ---
arch/x86/kernel/fpu/xstate.c | 112 +++++++++++++++++-----------
5 files changed, 80 insertions(+), 61 deletions(-)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index f1f9bf91a0ab..1f447865db3a 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -45,7 +45,6 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
-extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:
@@ -94,7 +93,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
* trigger #GP:
*/
xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
- xfeatures_mask_user;
+ xfeatures_mask_all;
}
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 9b382e5157ed..a32dc5f8c963 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -19,10 +19,10 @@
#define XSAVE_YMM_SIZE 256
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
-/* System features */
-#define XFEATURE_MASK_SYSTEM (XFEATURE_MASK_PT)
-
-/* All currently supported features */
+/*
+ * SUPPORTED_XFEATURES_MASK indicates all features
+ * implemented in and supported by the kernel.
+ */
#define SUPPORTED_XFEATURES_MASK (XFEATURE_MASK_FP | \
XFEATURE_MASK_SSE | \
XFEATURE_MASK_YMM | \
@@ -40,6 +40,7 @@
#endif
extern u64 xfeatures_mask_user;
+extern u64 xfeatures_mask_all;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 4bd56079048f..9f51b0e1da25 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -365,8 +365,13 @@ void fpu__drop(struct fpu *fpu)
*/
static inline void copy_init_user_fpstate_to_fpregs(void)
{
+ /*
+ * Only XSAVES user states are copied.
+ * System states are preserved.
+ */
if (use_xsave())
- copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+ copy_kernel_to_xregs(&init_fpstate.xsave,
+ xfeatures_mask_user);
else if (static_cpu_has(X86_FEATURE_FXSR))
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 761c3a5a9e07..eaf9d9d479a5 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -222,16 +222,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
fpu_user_xstate_size = fpu_kernel_xstate_size;
}
-/*
- * Find supported xfeatures based on cpu features and command-line input.
- * This must be called after fpu__init_parse_early_param() is called and
- * xfeatures_mask is enumerated.
- */
-u64 __init fpu__get_supported_xfeatures_mask(void)
-{
- return SUPPORTED_XFEATURES_MASK;
-}
-
/* Legacy code to initialize eager fpu mode. */
static void __init fpu__init_system_ctx_switch(void)
{
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 19f8df54c72a..dd2c561c4544 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -51,13 +51,16 @@ static short xsave_cpuid_features[] __initdata = {
};
/*
- * Mask of xstate features supported by the CPU and the kernel:
+ * Mask of xstate features supported by the CPU and the kernel.
+ * This is the result from CPUID query, SUPPORTED_XFEATURES_MASK,
+ * and boot_cpu_has().
*/
u64 xfeatures_mask_user __read_mostly;
+u64 xfeatures_mask_all __read_mostly;
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_user)*8];
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_all)*8];
/*
* The XSAVE area of kernel can be in standard or compacted format;
@@ -82,7 +85,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
- u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_user;
+ u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -164,7 +167,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* None of the feature bits are in init state. So nothing else
* to do for us, as the memory layout is up to date.
*/
- if ((xfeatures & xfeatures_mask_user) == xfeatures_mask_user)
+ if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all)
return;
/*
@@ -219,30 +222,31 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
*/
void fpu__init_cpu_xstate(void)
{
- if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_user)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
return;
+
+ cr4_set_bits(X86_CR4_OSXSAVE);
+
/*
- * Make it clear that XSAVES system states are not yet
- * implemented should anyone expect it to work by changing
- * bits in XFEATURE_MASK_* macros and XCR0.
+ * XCR_XFEATURE_ENABLED_MASK sets the features that are managed
+ * by XSAVE{C, OPT} and XRSTOR. Only XSAVE user states can be
+ * set here.
*/
- WARN_ONCE((xfeatures_mask_user & XFEATURE_MASK_SYSTEM),
- "x86/fpu: XSAVES system states are not yet implemented.\n");
+ xsetbv(XCR_XFEATURE_ENABLED_MASK,
+ xfeatures_mask_user);
- xfeatures_mask_user &= ~XFEATURE_MASK_SYSTEM;
-
- cr4_set_bits(X86_CR4_OSXSAVE);
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
+ /*
+ * MSR_IA32_XSS selects which system states are managed by
+ * XSAVES. Only XSAVES system states can be set here.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ wrmsrl(MSR_IA32_XSS,
+ xfeatures_mask_all & ~xfeatures_mask_user);
}
-/*
- * Note that in the future we will likely need a pair of
- * functions here: one for user xstates and the other for
- * system xstates. For now, they are the same.
- */
static int xfeature_enabled(enum xfeature xfeature)
{
- return !!(xfeatures_mask_user & BIT_ULL(xfeature));
+ return !!(xfeatures_mask_all & BIT_ULL(xfeature));
}
/*
@@ -348,7 +352,7 @@ static int xfeature_is_aligned(int xfeature_nr)
*/
static void __init setup_xstate_comp(void)
{
- unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_user)*8];
+ unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_all)*8];
int i;
/*
@@ -422,7 +426,7 @@ static void __init setup_init_fpu_buf(void)
if (boot_cpu_has(X86_FEATURE_XSAVES))
init_fpstate.xsave.header.xcomp_bv =
- BIT_ULL(63) | xfeatures_mask_user;
+ BIT_ULL(63) | xfeatures_mask_all;
/*
* Init all the features state with header.xfeatures being 0x0
@@ -441,11 +445,10 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
u32 eax, ebx, ecx, edx;
/*
- * Only XSAVES supports system states and it uses compacted
- * format. Checking a system state's uncompacted offset is
- * an error.
+ * Checking a system or unsupported state's uncompacted offset
+ * is an error.
*/
- if (XFEATURE_MASK_SYSTEM & (1 << xfeature_nr)) {
+ if (~xfeatures_mask_user & BIT_ULL(xfeature_nr)) {
WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
return -1;
}
@@ -482,7 +485,7 @@ int using_compacted_format(void)
int validate_xstate_header(const struct xstate_header *hdr)
{
/* No unknown or system features may be set */
- if (hdr->xfeatures & (~xfeatures_mask_user | XFEATURE_MASK_SYSTEM))
+ if (hdr->xfeatures & ~xfeatures_mask_user)
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -617,15 +620,12 @@ static void do_extra_xstate_size_checks(void)
/*
- * Get total size of enabled xstates in XCR0/xfeatures_mask_user.
+ * Get total size of enabled xstates in XCR0 | IA32_XSS.
*
* Note the SDM's wording here. "sub-function 0" only enumerates
* the size of the *user* states. If we use it to size a buffer
* that we use 'XSAVES' on, we could potentially overflow the
* buffer because 'XSAVES' saves system states too.
- *
- * Note that we do not currently set any bits on IA32_XSS so
- * 'XCR0 | IA32_XSS == XCR0' for now.
*/
static unsigned int __init get_xsaves_size(void)
{
@@ -707,6 +707,7 @@ static int init_xstate_size(void)
*/
static void fpu__init_disable_system_xstate(void)
{
+ xfeatures_mask_all = 0;
xfeatures_mask_user = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
fpu__xstate_clear_all_cpu_caps();
@@ -722,6 +723,8 @@ void __init fpu__init_system_xstate(void)
static int on_boot_cpu __initdata = 1;
int err;
int i;
+ u64 cpu_user_xfeatures_mask;
+ u64 cpu_system_xfeatures_mask;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -742,10 +745,24 @@ void __init fpu__init_system_xstate(void)
return;
}
+ /*
+ * Find user states supported by the processor.
+ * Only these bits can be set in XCR0.
+ */
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xfeatures_mask_user = eax + ((u64)edx << 32);
+ cpu_user_xfeatures_mask = eax + ((u64)edx << 32);
+
+ /*
+ * Find system states supported by the processor.
+ * Only these bits can be set in IA32_XSS MSR.
+ */
+ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ cpu_system_xfeatures_mask = ecx + ((u64)edx << 32);
- if ((xfeatures_mask_user & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ xfeatures_mask_all = cpu_user_xfeatures_mask |
+ cpu_system_xfeatures_mask;
+
+ if ((xfeatures_mask_all & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
@@ -760,10 +777,11 @@ void __init fpu__init_system_xstate(void)
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
if (!boot_cpu_has(xsave_cpuid_features[i]))
- xfeatures_mask_user &= ~BIT_ULL(i);
+ xfeatures_mask_all &= ~BIT_ULL(i);
}
- xfeatures_mask_user &= fpu__get_supported_xfeatures_mask();
+ xfeatures_mask_all &= SUPPORTED_XFEATURES_MASK;
+ xfeatures_mask_user = xfeatures_mask_all & cpu_user_xfeatures_mask;
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -775,8 +793,7 @@ void __init fpu__init_system_xstate(void)
* Update info used for ptrace frames; use standard-format size and no
* system xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size,
- xfeatures_mask_user & ~XFEATURE_MASK_SYSTEM);
+ update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user);
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
@@ -784,7 +801,7 @@ void __init fpu__init_system_xstate(void)
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
- xfeatures_mask_user,
+ xfeatures_mask_all,
fpu_kernel_xstate_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
return;
@@ -804,6 +821,13 @@ void fpu__resume_cpu(void)
*/
if (boot_cpu_has(X86_FEATURE_XSAVE))
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
+
+ /*
+ * Restore IA32_XSS
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ wrmsrl(MSR_IA32_XSS,
+ xfeatures_mask_all & ~xfeatures_mask_user);
}
/*
@@ -853,9 +877,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
/*
* We should not ever be requesting features that we
* have not enabled. Remember that pcntxt_mask is
- * what we write to the XCR0 register.
+ * what we write to the XCR0 | IA32_XSS registers.
*/
- WARN_ONCE(!(xfeatures_mask_user & xstate_feature),
+ WARN_ONCE(!(xfeatures_mask_all & xstate_feature),
"get of unsupported state");
/*
* This assumes the last 'xsave*' instruction to
@@ -1005,7 +1029,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SYSTEM;
+ header.xfeatures &= xfeatures_mask_user;
/*
* Copy xregs_state->header:
@@ -1089,7 +1113,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SYSTEM;
+ header.xfeatures &= xfeatures_mask_user;
/*
* Copy xregs_state->header:
@@ -1182,7 +1206,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SYSTEM;
+ xsave->header.xfeatures &= (xfeatures_mask_all & ~xfeatures_mask_user);
/*
* Add back in the features that came in from userspace:
@@ -1238,7 +1262,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SYSTEM;
+ xsave->header.xfeatures &= (xfeatures_mask_all & ~xfeatures_mask_user);
/*
* Add back in the features that came in from userspace:
--
2.17.1
arch_prctl(ARCH_CET_STATUS, unsigned long *addr)
Return CET feature status.
The parameter 'addr' is a pointer to a user buffer.
On return, the kernel fills the buffer with the following
information:
*addr = SHSTK/IBT status
*(addr + 1) = SHSTK base address
*(addr + 2) = SHSTK size
arch_prctl(ARCH_CET_DISABLE, unsigned long features)
Disable CET features specified in 'features'. Return
-EPERM if CET is locked.
arch_prctl(ARCH_CET_LOCK)
Lock the current CET feature settings. Once locked,
ARCH_CET_DISABLE returns -EPERM.
arch_prctl(ARCH_CET_ALLOC_SHSTK, unsigned long *addr)
Allocate a new SHSTK.
The parameter 'addr' is a pointer to a user buffer. On entry, the
buffer holds the desired SHSTK size to allocate. On return, the
buffer holds the address of the new SHSTK.
Signed-off-by: H.J. Lu <[email protected]>
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/cet.h | 5 ++
arch/x86/include/uapi/asm/prctl.h | 5 ++
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/cet.c | 27 +++++++++++
arch/x86/kernel/cet_prctl.c | 79 +++++++++++++++++++++++++++++++
arch/x86/kernel/process.c | 5 ++
6 files changed, 122 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/kernel/cet_prctl.c
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index b7b33e1026bb..212bd68e31d3 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -12,19 +12,24 @@ struct task_struct;
struct cet_status {
unsigned long shstk_base;
unsigned long shstk_size;
+ unsigned int locked:1;
unsigned int shstk_enabled:1;
};
#ifdef CONFIG_X86_INTEL_CET
+int prctl_cet(int option, unsigned long arg2);
int cet_setup_shstk(void);
int cet_setup_thread_shstk(struct task_struct *p);
+int cet_alloc_shstk(unsigned long *arg);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
int cet_restore_signal(unsigned long ssp);
int cet_setup_signal(bool ia32, unsigned long rstor, unsigned long *new_ssp);
#else
+static inline int prctl_cet(int option, unsigned long arg2) { return 0; }
static inline int cet_setup_shstk(void) { return 0; }
static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
+static inline int cet_alloc_shstk(unsigned long *arg) { return -EINVAL; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline int cet_restore_signal(unsigned long ssp) { return 0; }
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..3aec1088e01d 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,9 @@
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_CET_STATUS 0x3001
+#define ARCH_CET_DISABLE 0x3002
+#define ARCH_CET_LOCK 0x3003
+#define ARCH_CET_ALLOC_SHSTK 0x3004
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 36b14ef410c8..b9e6cdc6b4f7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -139,7 +139,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_X86_INTEL_CET) += cet.o
+obj-$(CONFIG_X86_INTEL_CET) += cet.o cet_prctl.o
obj-$(CONFIG_ARCH_HAS_PROGRAM_PROPERTIES) += elf.o
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index ce0b3b7b1160..1c2689738604 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -110,6 +110,33 @@ static int create_rstor_token(bool ia32, unsigned long ssp,
return 0;
}
+int cet_alloc_shstk(unsigned long *arg)
+{
+ unsigned long len = *arg;
+ unsigned long addr;
+ unsigned long token;
+ unsigned long ssp;
+
+ addr = do_mmap_locked(0, len, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK);
+ if (addr >= TASK_SIZE_MAX)
+ return -ENOMEM;
+
+ /* Restore token is 8 bytes and aligned to 8 bytes */
+ ssp = addr + len;
+ token = ssp;
+
+ if (!in_ia32_syscall())
+ token |= 1;
+ ssp -= 8;
+
+ if (write_user_shstk_64(ssp, token))
+ return -EINVAL;
+
+ *arg = addr;
+ return 0;
+}
+
int cet_setup_shstk(void)
{
unsigned long addr, size;
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
new file mode 100644
index 000000000000..c4b7c19f5040
--- /dev/null
+++ b/arch/x86/kernel/cet_prctl.c
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+#include <linux/prctl.h>
+#include <linux/compat.h>
+#include <asm/processor.h>
+#include <asm/prctl.h>
+#include <asm/elf.h>
+#include <asm/elf_property.h>
+#include <asm/cet.h>
+
+/* See Documentation/x86/intel_cet.rst. */
+
+static int handle_get_status(unsigned long arg2)
+{
+ unsigned int features = 0;
+ unsigned long shstk_base, shstk_size;
+ unsigned long buf[3];
+
+ if (current->thread.cet.shstk_enabled)
+ features |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+
+ shstk_base = current->thread.cet.shstk_base;
+ shstk_size = current->thread.cet.shstk_size;
+
+ buf[0] = (unsigned long)features;
+ buf[1] = shstk_base;
+ buf[2] = shstk_size;
+ return copy_to_user((unsigned long __user *)arg2, buf,
+ sizeof(buf));
+}
+
+static int handle_alloc_shstk(unsigned long arg2)
+{
+ int err = 0;
+ unsigned long shstk_size = 0;
+
+ if (get_user(shstk_size, (unsigned long __user *)arg2))
+ return -EFAULT;
+
+ err = cet_alloc_shstk(&shstk_size);
+ if (err)
+ return err;
+
+ if (put_user(shstk_size, (unsigned long __user *)arg2))
+ return -EFAULT;
+
+ return 0;
+}
+
+int prctl_cet(int option, unsigned long arg2)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -EINVAL;
+
+ switch (option) {
+ case ARCH_CET_STATUS:
+ return handle_get_status(arg2);
+
+ case ARCH_CET_DISABLE:
+ if (current->thread.cet.locked)
+ return -EPERM;
+ if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+ cet_disable_free_shstk(current);
+
+ return 0;
+
+ case ARCH_CET_LOCK:
+ current->thread.cet.locked = 1;
+ return 0;
+
+ case ARCH_CET_ALLOC_SHSTK:
+ return handle_alloc_shstk(arg2);
+
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 440f012ef925..251b8714f9a3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -792,6 +792,11 @@ long do_arch_prctl_common(struct task_struct *task, int option,
return get_cpuid_mode();
case ARCH_SET_CPUID:
return set_cpuid_mode(task, cpuid_enabled);
+ case ARCH_CET_STATUS:
+ case ARCH_CET_DISABLE:
+ case ARCH_CET_LOCK:
+ case ARCH_CET_ALLOC_SHSTK:
+ return prctl_cet(option, cpuid_enabled);
}
return -EINVAL;
--
2.17.1
Add shadow stack pages to memory accounting.
Also check if the system has enough memory before enabling CET.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
mm/mmap.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index 5f2b2b184c60..de2d0faa1c61 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1671,6 +1671,9 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
if (file && is_file_hugepages(file))
return 0;
+ if (arch_copy_pte_mapping(vm_flags))
+ return 1;
+
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
@@ -3261,6 +3264,8 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
mm->stack_vm += npages;
else if (is_data_mapping(flags))
mm->data_vm += npages;
+ else if (arch_copy_pte_mapping(flags))
+ mm->data_vm += npages;
}
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
--
2.17.1
Update _PAGE_DIRTY to _PAGE_DIRTY_BITS in split_2MB_gtt_entry().
In order to support Control-flow Enforcement Technology (CET), the
dirty bit previously named _PAGE_DIRTY is now either _PAGE_DIRTY_HW
or _PAGE_DIRTY_SW.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
drivers/gpu/drm/i915/gvt/gtt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 00aad8164dec..2d6ba1462dd8 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -1170,7 +1170,7 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
}
/* Clear dirty field. */
- se->val64 &= ~_PAGE_DIRTY;
+ se->val64 &= ~_PAGE_DIRTY_BITS;
ops->clear_pse(se);
ops->clear_ips(se);
--
2.17.1
Introduce Kconfig option X86_INTEL_SHADOW_STACK_USER.
An application has shadow stack protection when all the following are
true:
(1) The kernel has X86_INTEL_SHADOW_STACK_USER enabled,
(2) The running processor supports the shadow stack,
(3) The application is built with shadow stack enabled tools & libs,
and at runtime, all dependent shared libs can support shadow
stack.
If this kernel config option is enabled, but (2) or (3) above is not
true, the application runs without the shadow stack protection.
Existing legacy applications will continue to work without the shadow
stack protection.
The user-mode shadow stack protection is only implemented for the
64-bit kernel. Thirty-two bit applications are supported under the
compatibility mode.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/Kconfig | 24 ++++++++++++++++++++++++
arch/x86/Makefile | 7 +++++++
2 files changed, 31 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1a0be022f91d..808aa3aecf3c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1913,6 +1913,30 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
If unsure, say y.
+config X86_INTEL_CET
+ def_bool n
+
+config ARCH_HAS_SHSTK
+ def_bool n
+
+config X86_INTEL_SHADOW_STACK_USER
+ prompt "Intel Shadow Stack for user-mode"
+ def_bool n
+ depends on CPU_SUP_INTEL && X86_64
+ select X86_INTEL_CET
+ select ARCH_HAS_SHSTK
+ ---help---
+ Shadow stack provides hardware protection against program stack
+ corruption. Only when all the following are true will an application
+ have the shadow stack protection: the kernel supports it (i.e. this
+ feature is enabled), the application is compiled and linked with
+ shadow stack enabled, and the processor supports this feature.
+ When the kernel has this configuration enabled, existing non shadow
+ stack applications will continue to work, but without shadow stack
+ protection.
+
+ If unsure, say y.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8f6e7eb8ae9f..b28842b80295 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -152,6 +152,13 @@ ifdef CONFIG_X86_X32
endif
export CONFIG_X86_X32_ABI
+# Check assembler shadow stack support
+ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+ ifeq ($(call as-instr, saveprevssp, y),)
+ $(error CONFIG_X86_INTEL_SHADOW_STACK_USER not supported by the assembler)
+ endif
+endif
+
#
# If the function graph tracer is used with mcount instead of fentry,
# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
--
2.17.1
This patch implements THP shadow stack memory copying in the same
way as the previous patch for regular PTE.
In copy_huge_pmd(), we clear the dirty bit from the PMD. On the
next shadow stack access to the PMD, a page fault occurs. At
that time, the page is copied/re-used and the PMD is fixed.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/mm/pgtable.c | 8 ++++++++
include/asm-generic/pgtable.h | 2 ++
mm/huge_memory.c | 4 ++++
3 files changed, 14 insertions(+)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 57eeb2230340..ccdfd3dd7163 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -882,6 +882,14 @@ inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma)
return pte;
}
+inline pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHSTK)
+ return pmd_mkdirty_shstk(pmd);
+ else
+ return pmd;
+}
+
inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
{
return (vm_flags & VM_SHSTK);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b99aa3677350..a91f07454ced 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1129,9 +1129,11 @@ static inline bool arch_has_pfn_modify_check(void)
#ifndef CONFIG_ARCH_HAS_SHSTK
#define pte_set_vma_features(pte, vma) pte
+#define pmd_set_vma_features(pmd, vma) pmd
#define arch_copy_pte_mapping(vma_flags) false
#else
inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma);
+inline pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma);
bool arch_copy_pte_mapping(vm_flags_t vm_flags);
#endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 533f9b00147d..df39ae20fe40 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -597,6 +597,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_set_vma_features(entry, vma);
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
@@ -1194,6 +1195,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = pte_set_vma_features(entry, vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
@@ -1278,6 +1280,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_set_vma_features(entry, vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
@@ -1349,6 +1352,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_set_vma_features(entry, vma);
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
--
2.17.1
VM_SHSTK indicates a shadow stack memory area.
The shadow stack is implemented only for the 64-bit kernel.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
include/linux/mm.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a61ebe8ad4ca..f40387ecd920 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -224,11 +224,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -266,6 +268,12 @@ extern unsigned int kobjsize(const void *objp);
# define VM_MPX VM_NONE
#endif
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+# define VM_SHSTK VM_HIGH_ARCH_5
+#else
+# define VM_SHSTK VM_NONE
+#endif
+
#ifndef VM_GROWSUP
# define VM_GROWSUP VM_NONE
#endif
--
2.17.1
Function returns could unwind a shadow stack beyond its allocated area.
We therefore do not merge shadow stack areas.
This and VMA guards prevent shadow stack underflow.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
mm/mmap.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index de2d0faa1c61..fa581ced3f56 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1123,6 +1123,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
+ /*
+ * Do not merge shadow stack areas.
+ */
+ if (vm_flags & VM_SHSTK)
+ return NULL;
+
if (prev)
next = prev->vm_next;
else
--
2.17.1
Add CPUIDs for Control-flow Enforcement Technology (CET).
CPUID.(EAX=7,ECX=0):ECX[bit 7] Shadow stack
CPUID.(EAX=7,ECX=0):EDX[bit 20] Indirect branch tracking
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/cpufeatures.h | 2 ++
arch/x86/kernel/cpu/scattered.c | 1 +
2 files changed, 3 insertions(+)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 89a048c2faec..fa69651a017e 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -221,6 +221,7 @@
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
+#define X86_FEATURE_IBT ( 7*32+31) /* Indirect Branch Tracking */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -321,6 +322,7 @@
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow Stack */
#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 772c219b6889..63cbb4d9938e 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -21,6 +21,7 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_IBT, CPUID_EDX, 20, 0x00000007, 0},
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
--
2.17.1
Create a guard area between VMAs, to detect memory corruption.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
include/linux/mm.h | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4cc07baccda..3a823bdae09d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2443,24 +2443,34 @@ static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * m
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
unsigned long vm_start = vma->vm_start;
+ unsigned long gap;
+
+ if (vma->vm_flags & VM_GROWSDOWN)
+ gap = stack_guard_gap;
+ else
+ gap = PAGE_SIZE;
+
+ vm_start -= gap;
+ if (vm_start > vma->vm_start)
+ vm_start = 0;
- if (vma->vm_flags & VM_GROWSDOWN) {
- vm_start -= stack_guard_gap;
- if (vm_start > vma->vm_start)
- vm_start = 0;
- }
return vm_start;
}
static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
unsigned long vm_end = vma->vm_end;
+ unsigned long gap;
+
+ if (vma->vm_flags & VM_GROWSUP)
+ gap = stack_guard_gap;
+ else
+ gap = PAGE_SIZE;
+
+ vm_end += gap;
+ if (vm_end < vma->vm_end)
+ vm_end = -PAGE_SIZE;
- if (vma->vm_flags & VM_GROWSUP) {
- vm_end += stack_guard_gap;
- if (vm_end < vma->vm_end)
- vm_end = -PAGE_SIZE;
- }
return vm_end;
}
--
2.17.1
Intel Control-flow Enforcement Technology (CET) introduces the
following MSRs into the XSAVES system states.
IA32_U_CET (user-mode CET settings),
IA32_PL3_SSP (user-mode shadow stack),
IA32_PL0_SSP (kernel-mode shadow stack),
IA32_PL1_SSP (ring-1 shadow stack),
IA32_PL2_SSP (ring-2 shadow stack).
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/fpu/types.h | 22 +++++++++++++++++++++
arch/x86/include/asm/fpu/xstate.h | 4 +++-
arch/x86/include/uapi/asm/processor-flags.h | 2 ++
arch/x86/kernel/fpu/xstate.c | 10 ++++++++++
4 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 202c53918ecf..e55d51d172f1 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -114,6 +114,9 @@ enum xfeature {
XFEATURE_Hi16_ZMM,
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
+ XFEATURE_RESERVED,
+ XFEATURE_SHSTK_USER,
+ XFEATURE_SHSTK_KERNEL,
XFEATURE_MAX,
};
@@ -128,6 +131,8 @@ enum xfeature {
#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
+#define XFEATURE_MASK_SHSTK_USER (1 << XFEATURE_SHSTK_USER)
+#define XFEATURE_MASK_SHSTK_KERNEL (1 << XFEATURE_SHSTK_KERNEL)
#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
@@ -229,6 +234,23 @@ struct pkru_state {
u32 pad;
} __packed;
+/*
+ * State component 11 is Control flow Enforcement user states
+ */
+struct cet_user_state {
+ u64 u_cet; /* user control flow settings */
+ u64 user_ssp; /* user shadow stack pointer */
+} __packed;
+
+/*
+ * State component 12 is Control flow Enforcement kernel states
+ */
+struct cet_kernel_state {
+ u64 kernel_ssp; /* kernel shadow stack */
+ u64 pl1_ssp; /* ring-1 shadow stack */
+ u64 pl2_ssp; /* ring-2 shadow stack */
+} __packed;
+
struct xstate_header {
u64 xfeatures;
u64 xcomp_bv;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index a32dc5f8c963..662562cbafe9 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -31,7 +31,9 @@
XFEATURE_MASK_Hi16_ZMM | \
XFEATURE_MASK_PKRU | \
XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_SHSTK_USER | \
+ XFEATURE_MASK_SHSTK_KERNEL)
#ifdef CONFIG_X86_64
#define REX_PREFIX "0x48, "
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..25311ec4b731 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_CET_BIT 23 /* enable Control flow Enforcement */
+#define X86_CR4_CET _BITUL(X86_CR4_CET_BIT)
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index dd2c561c4544..91c0f665567b 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -35,6 +35,9 @@ static const char *xfeature_names[] =
"Processor Trace (unused)" ,
"Protection Keys User registers",
"unknown xstate feature" ,
+ "Control flow User registers" ,
+ "Control flow Kernel registers" ,
+ "unknown xstate feature" ,
};
static short xsave_cpuid_features[] __initdata = {
@@ -48,6 +51,9 @@ static short xsave_cpuid_features[] __initdata = {
X86_FEATURE_AVX512F,
X86_FEATURE_INTEL_PT,
X86_FEATURE_PKU,
+ 0, /* Unused */
+ X86_FEATURE_SHSTK, /* XFEATURE_SHSTK_USER */
+ X86_FEATURE_SHSTK, /* XFEATURE_SHSTK_KERNEL */
};
/*
@@ -316,6 +322,8 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
+ print_xstate_feature(XFEATURE_MASK_SHSTK_USER);
+ print_xstate_feature(XFEATURE_MASK_SHSTK_KERNEL);
}
/*
@@ -562,6 +570,8 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
+ XCHECK_SZ(sz, nr, XFEATURE_SHSTK_USER, struct cet_user_state);
+ XCHECK_SZ(sz, nr, XFEATURE_SHSTK_KERNEL, struct cet_kernel_state);
/*
* Make *SURE* to add any feature numbers in below if
--
2.17.1
Explain how CET works and the no_cet_shstk/no_cet_ibt kernel
parameters.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 6 +
Documentation/index.rst | 1 +
Documentation/x86/index.rst | 11 +
Documentation/x86/intel_cet.rst | 259 ++++++++++++++++++
4 files changed, 277 insertions(+)
create mode 100644 Documentation/x86/index.rst
create mode 100644 Documentation/x86/intel_cet.rst
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 92eb1f42240d..3854423f7c86 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2764,6 +2764,12 @@
noexec=on: enable non-executable mappings (default)
noexec=off: disable non-executable mappings
+ no_cet_ibt [X86-64] Disable indirect branch tracking for user-mode
+ applications
+
+ no_cet_shstk [X86-64] Disable shadow stack support for user-mode
+ applications
+
nosmap [X86]
Disable SMAP (Supervisor Mode Access Prevention)
even if it is supported by processor.
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 5db7e87c7cb1..1cdc139adb40 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -104,6 +104,7 @@ implementation.
:maxdepth: 2
sh/index
+ x86/index
Filesystem Documentation
------------------------
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
new file mode 100644
index 000000000000..9c34d8cbc8f0
--- /dev/null
+++ b/Documentation/x86/index.rst
@@ -0,0 +1,11 @@
+=======================
+X86 Documentation
+=======================
+
+Control Flow Enforcement
+========================
+
+.. toctree::
+ :maxdepth: 1
+
+ intel_cet
diff --git a/Documentation/x86/intel_cet.rst b/Documentation/x86/intel_cet.rst
new file mode 100644
index 000000000000..56e724fce920
--- /dev/null
+++ b/Documentation/x86/intel_cet.rst
@@ -0,0 +1,259 @@
+=========================================
+Control Flow Enforcement Technology (CET)
+=========================================
+
+[1] Overview
+============
+
+Control Flow Enforcement Technology (CET) provides protection against
+return/jump-oriented programming (ROP) attacks. It can be implemented
+to protect both the kernel and applications. In the first phase,
+only the user-mode protection is implemented on the 64-bit kernel.
+However, 32-bit applications are supported under the compatibility
+mode.
+
+CET includes shadow stack (SHSTK) and indirect branch tracking (IBT).
+The SHSTK is a secondary stack allocated from memory. The processor
+automatically pushes/pops a secure copy of every return address
+to/from the SHSTK and, by comparing that copy to the program stack
+copy, verifies function returns are as intended. The IBT verifies all
+indirect CALL/JMP targets are intended and marked by the compiler with
+'ENDBR' op codes.
+
+There are two kernel configuration options:
+
+ X86_INTEL_SHADOW_STACK_USER, and
+ X86_INTEL_BRANCH_TRACKING_USER.
+
+To build a CET-enabled kernel, Binutils v2.31 and GCC v8.1 or later
+are required. To build a CET-enabled application, GLIBC v2.28 or
+later is also required.
+
+There are two command-line options for disabling CET features:
+
+ no_cet_shstk - disables SHSTK, and
+ no_cet_ibt - disables IBT.
+
+At run time, /proc/cpuinfo shows the availability of SHSTK and IBT.
+
+[2] CET assembly instructions
+=============================
+
+RDSSP %r
+ Read the SHSTK pointer into %r.
+
+INCSSP %r
+ Unwind (increment) the SHSTK pointer by the 0 ~ 255 steps indicated
+ in the operand register. The GLIBC longjmp uses INCSSP to unwind
+ the SHSTK until that matches the program stack. When it is
+ necessary to unwind beyond 255 steps, longjmp divides and repeats
+ the process.
+
+RSTORSSP (%r)
+ Switch to the SHSTK indicated in the 'restore token' pointed by
+ the operand register and replace the 'restore token' with a new
+ token to be saved (with SAVEPREVSSP) for the outgoing SHSTK.
+
+::
+
+ Before RSTORSSP
+
+ Incoming SHSTK Current/Outgoing SHSTK
+
+ |----------------------| |----------------------|
+ addr=x | | ssp-> | |
+ |----------------------| |----------------------|
+ (%r)-> | rstor_token=(x|Lg) | addr=y-8 | |
+ |----------------------| |----------------------|
+
+ After RSTORSSP
+
+ |----------------------| |----------------------|
+ | | | |
+ |----------------------| |----------------------|
+ ssp-> | rstor_token=(y|Bz|Lg)| addr=y-8 | |
+ |----------------------| |----------------------|
+
+ note:
+ 1. Only valid addresses and restore tokens can be on the
+ user-mode SHSTK.
+ 2. A token is always of type u64 and must align to u64.
+ 3. The incoming SHSTK pointer in a rstor_token must point to
+ immediately above the token.
+ 4. 'Lg' is bit[0] of a rstor_token indicating a 64-bit SHSTK.
+ 5. 'Bz' is bit[1] of a rstor_token indicating the token is to
+ be used only for the next SAVEPREVSSP and invalid for the
+ RSTORSSP.
+
+SAVEPREVSSP
+ Store the SHSTK 'restore token' pointed by
+ (current_SHSTK_pointer + 8).
+
+::
+
+ After SAVEPREVSSP
+
+ |----------------------| |----------------------|
+ ssp-> | | | |
+ |----------------------| |----------------------|
+ | rstor_token=(y|Bz|Lg)| addr=y-8 | rstor_token(y|Lg) |
+ |----------------------| |----------------------|
+
+WRUSS %r0, (%r1)
+ Write the value in %r0 to the SHSTK address pointed by (%r1).
+ This is a kernel-mode only instruction.
+
+ENDBR
+ The compiler inserts an ENDBR at all valid branch targets. Any
+ CALL/JMP to a target without an ENDBR triggers a control
+ protection fault.
+
+[3] Application Enabling
+========================
+
+An application's CET capability is marked in its ELF header and can
+be verified from the following command output, in the
+NT_GNU_PROPERTY_TYPE_0 field:
+
+ readelf -n <application>
+
+If an application supports CET and is statically linked, it will run
+with CET protection. If the application needs any shared libraries,
+the loader checks all dependencies and enables CET only when all
+requirements are met.
+
+[4] Legacy Libraries
+====================
+
+GLIBC provides a few tunables for backward compatibility.
+
+GLIBC_TUNABLES=glibc.tune.hwcaps=-SHSTK,-IBT
+ Turn off SHSTK/IBT for the current shell.
+
+GLIBC_TUNABLES=glibc.tune.x86_shstk=<on, permissive>
+ This controls how dlopen() handles SHSTK legacy libraries:
+ on: continue with SHSTK enabled;
+ permissive: continue with SHSTK off.
+
+[5] CET system calls
+====================
+
+The following arch_prctl() system calls are added for CET:
+
+arch_prctl(ARCH_CET_STATUS, unsigned long *addr)
+ Return CET feature status.
+
+ The parameter 'addr' is a pointer to a user buffer.
+ On returning to the caller, the kernel fills the following
+ information:
+
+ *addr = SHSTK/IBT status
+ *(addr + 1) = SHSTK base address
+ *(addr + 2) = SHSTK size
+
+arch_prctl(ARCH_CET_DISABLE, unsigned long features)
+ Disable SHSTK and/or IBT specified in 'features'. Return -EPERM
+ if CET is locked.
+
+arch_prctl(ARCH_CET_LOCK)
+ Lock in CET feature.
+
+arch_prctl(ARCH_CET_ALLOC_SHSTK, unsigned long *addr)
+ Allocate a new SHSTK and put a restore token at top.
+
+ The parameter 'addr' is a pointer to a user buffer and indicates
+ the desired SHSTK size to allocate. On returning to the caller,
+ the kernel fills *addr with the base address of the new SHSTK.
+
+arch_prctl(ARCH_CET_LEGACY_BITMAP, unsigned long *addr)
+ Allocate an IBT legacy code bitmap if the current task does not
+ have one.
+
+ The parameter 'addr' is a pointer to a user buffer.
+ On returning to the caller, the kernel fills the following
+ information:
+
+ *addr = IBT bitmap base address
+ *(addr + 1) = IBT bitmap size
+
+[6] The implementation of the SHSTK
+===================================
+
+SHSTK size
+----------
+
+A task's SHSTK is allocated from memory to a fixed size of
+RLIMIT_STACK.
+
+Signal
+------
+
+The main program and its signal handlers use the same SHSTK. Because
+the SHSTK stores only return addresses, we can use a large SHSTK to
+cover the condition that both the program stack and the sigaltstack
+run out.
+
+The kernel creates a restore token at the SHSTK restoring address and
+verifies that token when restoring from the signal handler.
+
+Fork
+----
+
+The SHSTK's vma has VM_SHSTK flag set; its PTEs are required to be
+read-only and dirty. When a SHSTK PTE is not all of present, RO, and
+dirty, a SHSTK access triggers a page fault with an additional SHSTK
+bit set in the page fault error code.
+
+When a task forks a child, its SHSTK PTEs are copied and both the
+parent's and the child's SHSTK PTEs are cleared of the dirty bit.
+Upon the next SHSTK access, the resulting SHSTK page fault is handled
+by page copy/re-use.
+
+When a pthread child is created, the kernel allocates a new SHSTK for
+the new thread.
+
+Setjmp/Longjmp
+--------------
+
+Longjmp unwinds SHSTK until it matches the program stack.
+
+Ucontext
+--------
+
+In GLIBC, getcontext/setcontext is implemented in a similar way to
+setjmp/longjmp.
+
+When makecontext creates a new ucontext, a new SHSTK is allocated for
+that context with the ARCH_CET_ALLOC_SHSTK syscall. The kernel
+creates a restore token at the top of the new SHSTK and the user-mode
+code switches to the new SHSTK with the RSTORSSP instruction.
+
+[7] The management of read-only & dirty PTEs for SHSTK
+======================================================
+
+A RO and dirty PTE exists in the following cases:
+
+(a) A page is modified and then shared with a fork()'ed child;
+(b) A R/O page that has been COW'ed;
+(c) A SHSTK page.
+
+The processor only checks the dirty bit for (c). To prevent the use
+of non-SHSTK memory as SHSTK, we use a spare bit of the 64-bit PTE as
+DIRTY_SW for (a) and (b) above. This results in the following PTE
+settings:
+
+Modified PTE: (R/W + DIRTY_HW)
+Modified and shared PTE: (R/O + DIRTY_SW)
+R/O PTE, COW'ed: (R/O + DIRTY_SW)
+SHSTK PTE: (R/O + DIRTY_HW)
+SHSTK PTE, COW'ed: (R/O + DIRTY_HW)
+SHSTK PTE, shared: (R/O + DIRTY_SW)
+
+Note that DIRTY_SW is only used in R/O PTEs but not R/W PTEs.
+
+[8] The implementation of IBT
+=============================
+
+The kernel provides IBT support in mmap() of the legacy code bitmap.
+However, the management of the bitmap is done in GLIBC or the
+application.
--
2.17.1
Add the following shadow stack management instructions.
INCSSP:
Increment shadow stack pointer by the steps specified.
RDSSP:
Read SSP register into a GPR.
SAVEPREVSSP:
Use "prev ssp" token at top of current shadow stack to
create a "restore token" on previous shadow stack.
RSTORSSP:
Restore from a "restore token" pointed by a GPR to SSP.
WRSS:
Write to kernel-mode shadow stack (kernel-mode instruction).
WRUSS:
Write to user-mode shadow stack (kernel-mode instruction).
SETSSBSY:
Verify the "supervisor token" pointed by IA32_PL0_SSP MSR,
if valid, set the token to busy, and set SSP to the value
of IA32_PL0_SSP MSR.
CLRSSBSY:
Verify the "supervisor token" pointed by a GPR, if valid,
clear the busy bit from the token.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/lib/x86-opcode-map.txt | 26 +++++++++++++------
tools/objtool/arch/x86/lib/x86-opcode-map.txt | 26 +++++++++++++------
2 files changed, 36 insertions(+), 16 deletions(-)
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index e0b85930dd77..c5e825d44766 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -366,7 +366,7 @@ AVXcode: 1
1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
1c:
1d:
-1e:
+1e: RDSSP Rd (F3),REX.W
1f: NOP Ev
# 0x0f 0x20-0x2f
20: MOV Rd,Cd
@@ -610,7 +610,17 @@ fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
ff: UD0
EndTable
-Table: 3-byte opcode 1 (0x0f 0x38)
+Table: 3-byte opcode 1 (0x0f 0x01)
+Referrer:
+AVXcode:
+# Skip 0x00-0xe7
+e8: SETSSBSY (f3)
+e9:
+ea: SAVEPREVSSP (f3)
+# Skip 0xeb-0xff
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x38)
Referrer: 3-byte escape 1
AVXcode: 2
# 0x0f 0x38 0x00-0x0f
@@ -789,12 +799,12 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
-f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
-f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSS Pq,Qq (66),REX.W
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSS Pq,Qq (66),REX.W
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
-Table: 3-byte opcode 2 (0x0f 0x3a)
+Table: 3-byte opcode 3 (0x0f 0x3a)
Referrer: 3-byte escape 2
AVXcode: 3
# 0x0f 0x3a 0x00-0xff
@@ -948,7 +958,7 @@ GrpTable: Grp7
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
3: LIDT Ms
4: SMSW Mw/Rv
-5: rdpkru (110),(11B) | wrpkru (111),(11B)
+5: rdpkru (110),(11B) | wrpkru (111),(11B) | RSTORSSP Mq (F3)
6: LMSW Ew
7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
EndTable
@@ -1019,8 +1029,8 @@ GrpTable: Grp15
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE | ptwrite Ey (F3),(11B)
-5: XRSTOR | lfence (11B)
-6: XSAVEOPT | clwb (66) | mfence (11B)
+5: XRSTOR | lfence (11B) | INCSSP Rd (F3),REX.W
+6: XSAVEOPT | clwb (66) | mfence (11B) | CLRSSBSY Mq (F3)
7: clflush | clflushopt (66) | sfence (11B)
EndTable
diff --git a/tools/objtool/arch/x86/lib/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
index e0b85930dd77..c5e825d44766 100644
--- a/tools/objtool/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
@@ -366,7 +366,7 @@ AVXcode: 1
1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
1c:
1d:
-1e:
+1e: RDSSP Rd (F3),REX.W
1f: NOP Ev
# 0x0f 0x20-0x2f
20: MOV Rd,Cd
@@ -610,7 +610,17 @@ fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
ff: UD0
EndTable
-Table: 3-byte opcode 1 (0x0f 0x38)
+Table: 3-byte opcode 1 (0x0f 0x01)
+Referrer:
+AVXcode:
+# Skip 0x00-0xe7
+e8: SETSSBSY (f3)
+e9:
+ea: SAVEPREVSSP (f3)
+# Skip 0xeb-0xff
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x38)
Referrer: 3-byte escape 1
AVXcode: 2
# 0x0f 0x38 0x00-0x0f
@@ -789,12 +799,12 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
-f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
-f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSS Pq,Qq (66),REX.W
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSS Pq,Qq (66),REX.W
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
-Table: 3-byte opcode 2 (0x0f 0x3a)
+Table: 3-byte opcode 3 (0x0f 0x3a)
Referrer: 3-byte escape 2
AVXcode: 3
# 0x0f 0x3a 0x00-0xff
@@ -948,7 +958,7 @@ GrpTable: Grp7
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
3: LIDT Ms
4: SMSW Mw/Rv
-5: rdpkru (110),(11B) | wrpkru (111),(11B)
+5: rdpkru (110),(11B) | wrpkru (111),(11B) | RSTORSSP Mq (F3)
6: LMSW Ew
7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
EndTable
@@ -1019,8 +1029,8 @@ GrpTable: Grp15
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE | ptwrite Ey (F3),(11B)
-5: XRSTOR | lfence (11B)
-6: XSAVEOPT | clwb (66) | mfence (11B)
+5: XRSTOR | lfence (11B) | INCSSP Rd (F3),REX.W
+6: XSAVEOPT | clwb (66) | mfence (11B) | CLRSSBSY Mq (F3)
7: clflush | clflushopt (66) | sfence (11B)
EndTable
--
2.17.1
A RO and dirty PTE exists in the following cases:
(a) A page is modified and then shared with a fork()'ed child;
(b) A R/O page that has been COW'ed;
(c) A SHSTK page.
The processor does not read the dirty bit for (a) and (b), but
checks the dirty bit for (c). To prevent the use of non-SHSTK
memory as SHSTK, we introduce a spare bit of the 64-bit PTE as
_PAGE_BIT_DIRTY_SW and use that for (a) and (b). This results
in the following possible PTE settings:
Modified PTE: (R/W + DIRTY_HW)
Modified and shared PTE: (R/O + DIRTY_SW)
R/O PTE COW'ed: (R/O + DIRTY_SW)
SHSTK PTE: (R/O + DIRTY_HW)
SHSTK PTE COW'ed: (R/O + DIRTY_HW)
SHSTK PTE shared: (R/O + DIRTY_SW)
Note that _PAGE_BIT_DIRTY_SW is only used in R/O PTEs but
not R/W PTEs.
When this patch is applied, there are six free bits left in
the 64-bit PTE. There are no more free bits in the 32-bit
PTE (except for PAE) and shadow stack is not implemented
for the 32-bit kernel.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/pgtable.h | 129 ++++++++++++++++++++++-----
arch/x86/include/asm/pgtable_types.h | 14 ++-
2 files changed, 121 insertions(+), 22 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 95c918ad84ed..3ee554d81480 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -119,9 +119,9 @@ extern pmdval_t early_pmd_flags;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
}
@@ -143,9 +143,9 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_DIRTY;
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
}
static inline int pmd_young(pmd_t pmd)
@@ -153,9 +153,9 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
{
- return pud_flags(pud) & _PAGE_DIRTY;
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
}
static inline int pud_young(pud_t pud)
@@ -294,9 +294,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pte_t pte_move_flags(pte_t pte, pteval_t from, pteval_t to)
+{
+ if (pte_flags(pte) & from)
+ pte = pte_set_flags(pte_clear_flags(pte, from), to);
+ return pte;
+}
+#else
+static inline pte_t pte_move_flags(pte_t pte, pteval_t from, pteval_t to)
+{
+ return pte;
+}
+#endif
+
static inline pte_t pte_mkclean(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
}
static inline pte_t pte_mkold(pte_t pte)
@@ -306,6 +320,7 @@ static inline pte_t pte_mkold(pte_t pte)
static inline pte_t pte_wrprotect(pte_t pte)
{
+ pte = pte_move_flags(pte, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pte_clear_flags(pte, _PAGE_RW);
}
@@ -316,9 +331,24 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
+ pteval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ pte_write(pte)) ? _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+ return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_HAS_SHSTK
+static inline pte_t pte_mkdirty_shstk(pte_t pte)
+{
+ pte = pte_clear_flags(pte, _PAGE_DIRTY_SW);
return pte_set_flags(pte, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
+static inline bool pte_dirty_hw(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_DIRTY_HW;
+}
+#endif
+
static inline pte_t pte_mkyoung(pte_t pte)
{
return pte_set_flags(pte, _PAGE_ACCESSED);
@@ -326,6 +356,7 @@ static inline pte_t pte_mkyoung(pte_t pte)
static inline pte_t pte_mkwrite(pte_t pte)
{
+ pte = pte_move_flags(pte, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pte_set_flags(pte, _PAGE_RW);
}
@@ -373,6 +404,20 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pmd_t pmd_move_flags(pmd_t pmd, pmdval_t from, pmdval_t to)
+{
+ if (pmd_flags(pmd) & from)
+ pmd = pmd_set_flags(pmd_clear_flags(pmd, from), to);
+ return pmd;
+}
+#else
+static inline pmd_t pmd_move_flags(pmd_t pmd, pmdval_t from, pmdval_t to)
+{
+ return pmd;
+}
+#endif
+
static inline pmd_t pmd_mkold(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
@@ -380,19 +425,36 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
+ pmd = pmd_move_flags(pmd, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pmd_clear_flags(pmd, _PAGE_RW);
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
+ pmdval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ (pmd_flags(pmd) & _PAGE_RW)) ?
+ _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+ return pmd_set_flags(pmd, dirty | _PAGE_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_HAS_SHSTK
+static inline pmd_t pmd_mkdirty_shstk(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_DIRTY_SW);
return pmd_set_flags(pmd, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
+static inline bool pmd_dirty_hw(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY_HW;
+}
+#endif
+
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DEVMAP);
@@ -410,6 +472,7 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
+ pmd = pmd_move_flags(pmd, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pmd_set_flags(pmd, _PAGE_RW);
}
@@ -427,6 +490,20 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
return native_make_pud(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pud_t pud_move_flags(pud_t pud, pudval_t from, pudval_t to)
+{
+ if (pud_flags(pud) & from)
+ pud = pud_set_flags(pud_clear_flags(pud, from), to);
+ return pud;
+}
+#else
+static inline pud_t pud_move_flags(pud_t pud, pudval_t from, pudval_t to)
+{
+ return pud;
+}
+#endif
+
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
@@ -434,17 +511,22 @@ static inline pud_t pud_mkold(pud_t pud)
static inline pud_t pud_mkclean(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
+ pud = pud_move_flags(pud, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pud_clear_flags(pud, _PAGE_RW);
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
+ pudval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ (pud_flags(pud) & _PAGE_RW)) ?
+ _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+
+ return pud_set_flags(pud, dirty | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
@@ -464,6 +546,7 @@ static inline pud_t pud_mkyoung(pud_t pud)
static inline pud_t pud_mkwrite(pud_t pud)
{
+ pud = pud_move_flags(pud, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pud_set_flags(pud, _PAGE_RW);
}
@@ -595,19 +678,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
+ if ((pte_write(pte) && !(pgprot_val(newprot) & _PAGE_RW)))
+ return pte_move_flags(__pte(val), _PAGE_DIRTY_HW,
+ _PAGE_DIRTY_SW);
return __pte(val);
}
-static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
- pmdval_t val = pmd_val(pmd), oldval = val;
-
- val &= _HPAGE_CHG_MASK;
- val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
- val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
- return __pmd(val);
-}
-
/* mprotect needs to preserve PAT bits when updating vm_page_prot */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -1159,6 +1235,19 @@ static inline int pmd_write(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_RW;
}
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd), oldval = val;
+
+ val &= _HPAGE_CHG_MASK;
+ val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
+ if ((pmd_write(pmd) && !(pgprot_val(newprot) & _PAGE_RW)))
+ return pmd_move_flags(__pmd(val), _PAGE_DIRTY_HW,
+ _PAGE_DIRTY_SW);
+ return __pmd(val);
+}
+
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0657a22d5216..f47bbc1f9c45 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -23,6 +23,7 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SOFTW5 57 /* available for programmer */
#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
@@ -34,6 +35,7 @@
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+#define _PAGE_BIT_DIRTY_SW _PAGE_BIT_SOFTW5 /* was written to */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -109,6 +111,14 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+#define _PAGE_DIRTY_SW (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY_SW)
+#else
+#define _PAGE_DIRTY_SW (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY_HW | _PAGE_DIRTY_SW)
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
@@ -122,9 +132,9 @@
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_HW | \
- _PAGE_SOFT_DIRTY)
+ _PAGE_DIRTY_SW | _PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
--
2.17.1
A control protection exception is triggered when a control flow transfer
attempt violates shadow stack or indirect branch tracking constraints.
For example, the return address for a RET instruction differs from the
safe copy on the shadow stack; or a JMP instruction arrives at a non-
ENDBR instruction.
The control protection exception handler works in a similar way to the
general protection fault handler.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/entry/entry_64.S | 2 +-
arch/x86/include/asm/traps.h | 3 ++
arch/x86/kernel/idt.c | 4 +++
arch/x86/kernel/traps.c | 58 ++++++++++++++++++++++++++++++++++++
4 files changed, 66 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 957dfb693ecc..5f4914e988df 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1000,7 +1000,7 @@ idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
idtentry coprocessor_error do_coprocessor_error has_error_code=0
idtentry alignment_check do_alignment_check has_error_code=1
idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
+idtentry control_protection do_control_protection has_error_code=1
/*
* Reload gs selector with exception handling
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 3de69330e6c5..5196050ff3d5 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -26,6 +26,7 @@ asmlinkage void invalid_TSS(void);
asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
+asmlinkage void control_protection(void);
asmlinkage void page_fault(void);
asmlinkage void async_page_fault(void);
asmlinkage void spurious_interrupt_bug(void);
@@ -77,6 +78,7 @@ dotraplinkage void do_stack_segment(struct pt_regs *, long);
dotraplinkage void do_double_fault(struct pt_regs *, long);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
+dotraplinkage void do_control_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
@@ -142,6 +144,7 @@ enum {
X86_TRAP_AC, /* 17, Alignment Check */
X86_TRAP_MC, /* 18, Machine Check */
X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */
+ X86_TRAP_CP = 21, /* 21 Control Protection Fault */
X86_TRAP_IRET = 32, /* 32, IRET Exception */
};
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 01adea278a71..66ebc8cb16e2 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -104,6 +104,10 @@ static const __initconst struct idt_data def_idts[] = {
#elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
#endif
+
+#ifdef CONFIG_X86_64
+ INTG(X86_TRAP_CP, control_protection),
+#endif
};
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e6db475164ed..873765adc244 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -578,6 +578,64 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
NOKPROBE_SYMBOL(do_general_protection);
+static const char *control_protection_err[] =
+{
+ "unknown",
+ "near-ret",
+ "far-ret/iret",
+ "endbranch",
+ "rstorssp",
+ "setssbsy",
+};
+
+/*
+ * When a control protection exception occurs, send a signal
+ * to the responsible application. Currently, control
+ * protection is only enabled for the user mode. This
+ * exception should not come from the kernel mode.
+ */
+dotraplinkage void
+do_control_protection(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ if (notify_die(DIE_TRAP, "control protection fault", regs,
+ error_code, X86_TRAP_CP, SIGSEGV) == NOTIFY_STOP)
+ return;
+ cond_local_irq_enable(regs);
+
+ if (!user_mode(regs))
+ die("kernel control protection fault", regs, error_code);
+
+ if (!static_cpu_has(X86_FEATURE_SHSTK) &&
+ !static_cpu_has(X86_FEATURE_IBT))
+ WARN_ONCE(1, "CET is disabled but got control "
+ "protection fault\n");
+
+ tsk = current;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_CP;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ unsigned int max_err;
+
+ max_err = ARRAY_SIZE(control_protection_err) - 1;
+ if ((error_code < 0) || (error_code > max_err))
+ error_code = 0;
+ pr_info("%s[%d] control protection ip:%lx sp:%lx error:%lx(%s)",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, error_code,
+ control_protection_err[error_code]);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
+}
+NOKPROBE_SYMBOL(do_control_protection);
+
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
--
2.17.1
To support XSAVES system states, change some names to distinguish
user and system states.
Change:
supervisor to system
copy_init_fpstate_to_fpregs() to copy_init_user_fpstate_to_fpregs()
xfeatures_mask to xfeatures_mask_user
XCNTXT_MASK to SUPPORTED_XFEATURES_MASK (states supported)
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/fpu/internal.h | 5 +-
arch/x86/include/asm/fpu/xstate.h | 24 ++++----
arch/x86/kernel/fpu/core.c | 4 +-
arch/x86/kernel/fpu/init.c | 2 +-
arch/x86/kernel/fpu/signal.c | 6 +-
arch/x86/kernel/fpu/xstate.c | 88 +++++++++++++++--------------
6 files changed, 66 insertions(+), 63 deletions(-)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a38bf5a1e37a..f1f9bf91a0ab 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -93,7 +93,8 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
* XRSTORS requires these bits set in xcomp_bv, or it will
* trigger #GP:
*/
- xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask;
+ xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
+ xfeatures_mask_user;
}
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
@@ -233,7 +234,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
/*
* If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
- * format and supervisor states in addition to modified optimization in
+ * format and system states in addition to modified optimization in
* XSAVEOPT.
*
* Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 48581988d78c..9b382e5157ed 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -19,19 +19,19 @@
#define XSAVE_YMM_SIZE 256
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
-/* Supervisor features */
-#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT)
+/* System features */
+#define XFEATURE_MASK_SYSTEM (XFEATURE_MASK_PT)
/* All currently supported features */
-#define XCNTXT_MASK (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
- XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
- XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM | \
- XFEATURE_MASK_PKRU | \
- XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+#define SUPPORTED_XFEATURES_MASK (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR)
#ifdef CONFIG_X86_64
#define REX_PREFIX "0x48, "
@@ -39,7 +39,7 @@
#define REX_PREFIX
#endif
-extern u64 xfeatures_mask;
+extern u64 xfeatures_mask_user;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 2ea85b32421a..4bd56079048f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -363,7 +363,7 @@ void fpu__drop(struct fpu *fpu)
* Clear FPU registers by setting them up from
* the init fpstate:
*/
-static inline void copy_init_fpstate_to_fpregs(void)
+static inline void copy_init_user_fpstate_to_fpregs(void)
{
if (use_xsave())
copy_kernel_to_xregs(&init_fpstate.xsave, -1);
@@ -395,7 +395,7 @@ void fpu__clear(struct fpu *fpu)
preempt_disable();
fpu__initialize(fpu);
user_fpu_begin();
- copy_init_fpstate_to_fpregs();
+ copy_init_user_fpstate_to_fpregs();
preempt_enable();
}
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6abd83572b01..761c3a5a9e07 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -229,7 +229,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
- return XCNTXT_MASK;
+ return SUPPORTED_XFEATURES_MASK;
}
/* Legacy code to initialize eager fpu mode. */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 23f1691670b6..f77aa76ba675 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -249,11 +249,11 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_
{
if (use_xsave()) {
if ((unsigned long)buf % 64 || fx_only) {
- u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+ u64 init_bv = xfeatures_mask_user & ~XFEATURE_MASK_FPSSE;
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return copy_user_to_fxregs(buf);
} else {
- u64 init_bv = xfeatures_mask & ~xbv;
+ u64 init_bv = xfeatures_mask_user & ~xbv;
if (unlikely(init_bv))
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return copy_user_to_xregs(buf, xbv);
@@ -417,7 +417,7 @@ void fpu__init_prepare_fx_sw_frame(void)
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
- fx_sw_reserved.xfeatures = xfeatures_mask;
+ fx_sw_reserved.xfeatures = xfeatures_mask_user;
fx_sw_reserved.xstate_size = fpu_user_xstate_size;
if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 87a57b7642d3..19f8df54c72a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -53,11 +53,11 @@ static short xsave_cpuid_features[] __initdata = {
/*
* Mask of xstate features supported by the CPU and the kernel:
*/
-u64 xfeatures_mask __read_mostly;
+u64 xfeatures_mask_user __read_mostly;
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_user)*8];
/*
* The XSAVE area of kernel can be in standard or compacted format;
@@ -82,7 +82,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
- u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+ u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_user;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -113,14 +113,14 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
-static int xfeature_is_supervisor(int xfeature_nr)
+static int xfeature_is_system(int xfeature_nr)
{
/*
- * We currently do not support supervisor states, but if
+ * We currently do not support system states, but if
* we did, we could find out like this.
*
* SDM says: If state component 'i' is a user state component,
- * ECX[0] return 0; if state component i is a supervisor
+ * ECX[0] return 0; if state component i is a system
* state component, ECX[0] returns 1.
*/
u32 eax, ebx, ecx, edx;
@@ -131,7 +131,7 @@ static int xfeature_is_supervisor(int xfeature_nr)
static int xfeature_is_user(int xfeature_nr)
{
- return !xfeature_is_supervisor(xfeature_nr);
+ return !xfeature_is_system(xfeature_nr);
}
/*
@@ -164,7 +164,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* None of the feature bits are in init state. So nothing else
* to do for us, as the memory layout is up to date.
*/
- if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+ if ((xfeatures & xfeatures_mask_user) == xfeatures_mask_user)
return;
/*
@@ -191,7 +191,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* in a special way already:
*/
feature_bit = 0x2;
- xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
+ xfeatures = (xfeatures_mask_user & ~xfeatures) >> 2;
/*
* Update all the remaining memory layouts according to their
@@ -219,20 +219,20 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
*/
void fpu__init_cpu_xstate(void)
{
- if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_user)
return;
/*
- * Make it clear that XSAVES supervisor states are not yet
+ * Make it clear that XSAVES system states are not yet
* implemented should anyone expect it to work by changing
* bits in XFEATURE_MASK_* macros and XCR0.
*/
- WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
- "x86/fpu: XSAVES supervisor states are not yet implemented.\n");
+ WARN_ONCE((xfeatures_mask_user & XFEATURE_MASK_SYSTEM),
+ "x86/fpu: XSAVES system states are not yet implemented.\n");
- xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
+ xfeatures_mask_user &= ~XFEATURE_MASK_SYSTEM;
cr4_set_bits(X86_CR4_OSXSAVE);
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
}
/*
@@ -242,7 +242,7 @@ void fpu__init_cpu_xstate(void)
*/
static int xfeature_enabled(enum xfeature xfeature)
{
- return !!(xfeatures_mask & (1UL << xfeature));
+ return !!(xfeatures_mask_user & BIT_ULL(xfeature));
}
/*
@@ -272,7 +272,7 @@ static void __init setup_xstate_features(void)
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
/*
- * If an xfeature is supervisor state, the offset
+ * If an xfeature is system state, the offset
* in EBX is invalid. We leave it to -1.
*/
if (xfeature_is_user(i))
@@ -348,7 +348,7 @@ static int xfeature_is_aligned(int xfeature_nr)
*/
static void __init setup_xstate_comp(void)
{
- unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+ unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_user)*8];
int i;
/*
@@ -421,7 +421,8 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+ init_fpstate.xsave.header.xcomp_bv =
+ BIT_ULL(63) | xfeatures_mask_user;
/*
* Init all the features state with header.xfeatures being 0x0
@@ -440,11 +441,11 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
u32 eax, ebx, ecx, edx;
/*
- * Only XSAVES supports supervisor states and it uses compacted
- * format. Checking a supervisor state's uncompacted offset is
+ * Only XSAVES supports system states and it uses compacted
+ * format. Checking a system state's uncompacted offset is
* an error.
*/
- if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
+ if (XFEATURE_MASK_SYSTEM & (1 << xfeature_nr)) {
WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
return -1;
}
@@ -465,7 +466,7 @@ static int xfeature_size(int xfeature_nr)
/*
* 'XSAVES' implies two different things:
- * 1. saving of supervisor/system state
+ * 1. saving of system state
* 2. using the compacted format
*
* Use this function when dealing with the compacted format so
@@ -480,8 +481,8 @@ int using_compacted_format(void)
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
int validate_xstate_header(const struct xstate_header *hdr)
{
- /* No unknown or supervisor features may be set */
- if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
+ /* No unknown or system features may be set */
+ if (hdr->xfeatures & (~xfeatures_mask_user | XFEATURE_MASK_SYSTEM))
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -588,11 +589,11 @@ static void do_extra_xstate_size_checks(void)
check_xstate_against_struct(i);
/*
- * Supervisor state components can be managed only by
+ * System state components can be managed only by
* XSAVES, which is compacted-format only.
*/
if (!using_compacted_format())
- XSTATE_WARN_ON(xfeature_is_supervisor(i));
+ XSTATE_WARN_ON(xfeature_is_system(i));
/* Align from the end of the previous feature */
if (xfeature_is_aligned(i))
@@ -616,7 +617,7 @@ static void do_extra_xstate_size_checks(void)
/*
- * Get total size of enabled xstates in XCR0/xfeatures_mask.
+ * Get total size of enabled xstates in XCR0/xfeatures_mask_user.
*
* Note the SDM's wording here. "sub-function 0" only enumerates
* the size of the *user* states. If we use it to size a buffer
@@ -706,7 +707,7 @@ static int init_xstate_size(void)
*/
static void fpu__init_disable_system_xstate(void)
{
- xfeatures_mask = 0;
+ xfeatures_mask_user = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
fpu__xstate_clear_all_cpu_caps();
}
@@ -742,15 +743,15 @@ void __init fpu__init_system_xstate(void)
}
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xfeatures_mask = eax + ((u64)edx << 32);
+ xfeatures_mask_user = eax + ((u64)edx << 32);
- if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ if ((xfeatures_mask_user & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
* booting without it. This is too early to BUG().
*/
- pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask_user);
goto out_disable;
}
@@ -759,10 +760,10 @@ void __init fpu__init_system_xstate(void)
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
if (!boot_cpu_has(xsave_cpuid_features[i]))
- xfeatures_mask &= ~BIT(i);
+ xfeatures_mask_user &= ~BIT_ULL(i);
}
- xfeatures_mask &= fpu__get_supported_xfeatures_mask();
+ xfeatures_mask_user &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -772,9 +773,10 @@ void __init fpu__init_system_xstate(void)
/*
* Update info used for ptrace frames; use standard-format size and no
- * supervisor xstates:
+ * system xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
+ update_regset_xstate_info(fpu_user_xstate_size,
+ xfeatures_mask_user & ~XFEATURE_MASK_SYSTEM);
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
@@ -782,7 +784,7 @@ void __init fpu__init_system_xstate(void)
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
- xfeatures_mask,
+ xfeatures_mask_user,
fpu_kernel_xstate_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
return;
@@ -801,7 +803,7 @@ void fpu__resume_cpu(void)
* Restore XCR0 on xsave capable CPUs:
*/
if (boot_cpu_has(X86_FEATURE_XSAVE))
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
}
/*
@@ -853,7 +855,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
* have not enabled. Remember that pcntxt_mask is
* what we write to the XCR0 register.
*/
- WARN_ONCE(!(xfeatures_mask & xstate_feature),
+ WARN_ONCE(!(xfeatures_mask_user & xstate_feature),
"get of unsupported state");
/*
* This assumes the last 'xsave*' instruction to
@@ -1003,7 +1005,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ header.xfeatures &= ~XFEATURE_MASK_SYSTEM;
/*
* Copy xregs_state->header:
@@ -1087,7 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ header.xfeatures &= ~XFEATURE_MASK_SYSTEM;
/*
* Copy xregs_state->header:
@@ -1180,7 +1182,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+ xsave->header.xfeatures &= XFEATURE_MASK_SYSTEM;
/*
* Add back in the features that came in from userspace:
@@ -1236,7 +1238,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+ xsave->header.xfeatures &= XFEATURE_MASK_SYSTEM;
/*
* Add back in the features that came in from userspace:
--
2.17.1
On 9/21/18 8:03 AM, Yu-cheng Yu wrote:
> Add shadow stack pages to memory accounting.
> Also check if the system has enough memory before enabling CET.
>
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu.intel.com>
oops. typo above.
> ---
> mm/mmap.c | 5 +++++
> 1 file changed, 5 insertions(+)
--
~Randy
On Fri, 2018-09-21 at 09:55 -0700, Randy Dunlap wrote:
> On 9/21/18 8:03 AM, Yu-cheng Yu wrote:
> > Add shadow stack pages to memory accounting.
> > Also check if the system has enough memory before enabling CET.
> >
> > Signed-off-by: Yu-cheng Yu <yu-cheng.yu.intel.com>
>
> oops. typo above.
>
I will fix it. Thanks!
On 09/21/2018 08:03 AM, Yu-cheng Yu wrote:
> The previous version of CET patches can be found in the following
> link:
>
> https://lkml.org/lkml/2018/8/30/608
So, this is an RFC, but there is no mention of what you want comments *on*. :)
What do you want folks to review? What needs to get settled before this
is merged?
On Fri, 2018-09-21 at 15:53 -0700, Dave Hansen wrote:
> On 09/21/2018 08:03 AM, Yu-cheng Yu wrote:
> > The previous version of CET patches can be found in the following
> > link:
> >
> > https://lkml.org/lkml/2018/8/30/608
>
> So, this is an RFC, but there is no mention of what you want comments *on*. :)
>
> What do you want folks to review? What needs to get settled before this
> is merged?
Thanks, Dave!
These patches passed GLIBC built-in tests and more tests HJ and I put together
at https://github.com/hjl-tools/cet-smoke-test.
I made some changes since V3 as outlined in the cover letter.
In particular there are two new patches for the VMA guard and preventing shadow
stack merging. Does anyone have comments on those and the whole Shadow
Stack/IBT series in general?
Thanks,
Yu-cheng
On Fri, Sep 21, 2018 at 08:03:25AM -0700, Yu-cheng Yu wrote:
> diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
> index 772c219b6889..63cbb4d9938e 100644
> --- a/arch/x86/kernel/cpu/scattered.c
> +++ b/arch/x86/kernel/cpu/scattered.c
> @@ -21,6 +21,7 @@ struct cpuid_bit {
> static const struct cpuid_bit cpuid_bits[] = {
> { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
> { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
> + { X86_FEATURE_IBT, CPUID_EDX, 20, 0x00000007, 0},
^^
missing white space at the end there.
> { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
> { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
> { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
> --
> 2.17.1
>
On Tue, 2018-09-25 at 18:27 +0200, Peter Zijlstra wrote:
> On Fri, Sep 21, 2018 at 08:03:25AM -0700, Yu-cheng Yu wrote:
>
> > diff --git a/arch/x86/kernel/cpu/scattered.c
> > b/arch/x86/kernel/cpu/scattered.c
> > index 772c219b6889..63cbb4d9938e 100644
> > --- a/arch/x86/kernel/cpu/scattered.c
> > +++ b/arch/x86/kernel/cpu/scattered.c
> > @@ -21,6 +21,7 @@ struct cpuid_bit {
> > static const struct cpuid_bit cpuid_bits[] = {
> > { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
> > { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
> > + { X86_FEATURE_IBT, CPUID_EDX, 20, 0x00000007, 0},
>
> ^^
> missing white space at the end there.
I will fix it. Thanks!
On Fri, Sep 21, 2018 at 08:03:26AM -0700, Yu-cheng Yu wrote:
> diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
> index a38bf5a1e37a..f1f9bf91a0ab 100644
> --- a/arch/x86/include/asm/fpu/internal.h
> +++ b/arch/x86/include/asm/fpu/internal.h
> @@ -93,7 +93,8 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
> * XRSTORS requires these bits set in xcomp_bv, or it will
> * trigger #GP:
> */
> - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask;
> + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
> + xfeatures_mask_user;
I would be OK with that line extending to 82 characters..
> }
>
> static inline void fpstate_init_fxstate(struct fxregs_state *fx)
> diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> index 87a57b7642d3..19f8df54c72a 100644
> --- a/arch/x86/kernel/fpu/xstate.c
> +++ b/arch/x86/kernel/fpu/xstate.c
> @@ -421,7 +421,8 @@ static void __init setup_init_fpu_buf(void)
> print_xstate_features();
>
> if (boot_cpu_has(X86_FEATURE_XSAVES))
> - init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
> + init_fpstate.xsave.header.xcomp_bv =
> + BIT_ULL(63) | xfeatures_mask_user;
If you do that, the if () needs { } per coding style.
>
> /*
> * Init all the features state with header.xfeatures being 0x0
On Fri, Sep 21, 2018 at 08:03:27AM -0700, Yu-cheng Yu wrote:
> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> index 4bd56079048f..9f51b0e1da25 100644
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -365,8 +365,13 @@ void fpu__drop(struct fpu *fpu)
> */
> static inline void copy_init_user_fpstate_to_fpregs(void)
> {
> + /*
> + * Only XSAVES user states are copied.
> + * System states are preserved.
> + */
> if (use_xsave())
> - copy_kernel_to_xregs(&init_fpstate.xsave, -1);
> + copy_kernel_to_xregs(&init_fpstate.xsave,
> + xfeatures_mask_user);
By my counting, that doesn't qualify for a line-break, it hits 80.
If you were to do this line-break, coding style would have you liberally
sprinkle {} around.
> else if (static_cpu_has(X86_FEATURE_FXSR))
> copy_kernel_to_fxregs(&init_fpstate.fxsave);
> else
On Tue, 2018-09-25 at 19:03 +0200, Peter Zijlstra wrote:
> On Fri, Sep 21, 2018 at 08:03:27AM -0700, Yu-cheng Yu wrote:
> > diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> > index 4bd56079048f..9f51b0e1da25 100644
> > --- a/arch/x86/kernel/fpu/core.c
> > +++ b/arch/x86/kernel/fpu/core.c
> > @@ -365,8 +365,13 @@ void fpu__drop(struct fpu *fpu)
> > */
> > static inline void copy_init_user_fpstate_to_fpregs(void)
> > {
> > + /*
> > + * Only XSAVES user states are copied.
> > + * System states are preserved.
> > + */
> > if (use_xsave())
> > - copy_kernel_to_xregs(&init_fpstate.xsave, -1);
> > + copy_kernel_to_xregs(&init_fpstate.xsave,
> > + xfeatures_mask_user);
>
> By my counting, that doesn't qualify for a line-break, it hits 80.
>
> If you were to do this line-break, coding style would have you liberally
> sprinkle {} around.
Ok, will fix it.
On Fri, Sep 21, 2018 at 08:03:25AM -0700, Yu-cheng Yu wrote:
> Add CPUIDs for Control-flow Enforcement Technology (CET).
>
> CPUID.(EAX=7,ECX=0):ECX[bit 7] Shadow stack
> CPUID.(EAX=7,ECX=0):EDX[bit 20] Indirect branch tracking
>
> Signed-off-by: Yu-cheng Yu <[email protected]>
> ---
> arch/x86/include/asm/cpufeatures.h | 2 ++
> arch/x86/kernel/cpu/scattered.c | 1 +
> 2 files changed, 3 insertions(+)
>
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 89a048c2faec..fa69651a017e 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -221,6 +221,7 @@
> #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
> #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
> #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
> +#define X86_FEATURE_IBT ( 7*32+31) /* Indirect Branch Tracking */
>
> /* Virtualization flags: Linux defined, word 8 */
> #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
> @@ -321,6 +322,7 @@
> #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
> #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
> #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
> +#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow Stack */
> #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
> #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
> #define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
> diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
> index 772c219b6889..63cbb4d9938e 100644
> --- a/arch/x86/kernel/cpu/scattered.c
> +++ b/arch/x86/kernel/cpu/scattered.c
> @@ -21,6 +21,7 @@ struct cpuid_bit {
> static const struct cpuid_bit cpuid_bits[] = {
> { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
> { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
> + { X86_FEATURE_IBT, CPUID_EDX, 20, 0x00000007, 0},
If you haven't noticed, there's already a separate leaf:
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
in arch/x86/include/asm/cpufeatures.h
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Fri, 2018-09-28 at 18:51 +0200, Borislav Petkov wrote:
> On Fri, Sep 21, 2018 at 08:03:25AM -0700, Yu-cheng Yu wrote:
> > Add CPUIDs for Control-flow Enforcement Technology (CET).
> >
> > CPUID.(EAX=7,ECX=0):ECX[bit 7] Shadow stack
> > CPUID.(EAX=7,ECX=0):EDX[bit 20] Indirect branch tracking
> >
> > Signed-off-by: Yu-cheng Yu <[email protected]>
> > ---
> > arch/x86/include/asm/cpufeatures.h | 2 ++
> > arch/x86/kernel/cpu/scattered.c | 1 +
> > 2 files changed, 3 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/cpufeatures.h
> > b/arch/x86/include/asm/cpufeatures.h
> > index 89a048c2faec..fa69651a017e 100644
> > --- a/arch/x86/include/asm/cpufeatures.h
> > +++ b/arch/x86/include/asm/cpufeatures.h
> > @@ -221,6 +221,7 @@
> > #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD
> > family 0x17 (Zen) */
> > #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF
> > workaround PTE inversion */
> > #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
> > +#define X86_FEATURE_IBT ( 7*32+31) /* Indirect
> > Branch Tracking */
> >
> > /* Virtualization flags: Linux defined, word 8 */
> > #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR
> > Shadow */
> > @@ -321,6 +322,7 @@
> > #define X86_FEATURE_PKU (16*32+ 3) /* Protection
> > Keys for Userspace */
> > #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys
> > Enable */
> > #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512
> > Vector Bit Manipulation Instructions */
> > +#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow Stack */
> > #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New
> > Instructions */
> > #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
> > #define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less
> > Multiplication Double Quadword */
> > diff --git a/arch/x86/kernel/cpu/scattered.c
> > b/arch/x86/kernel/cpu/scattered.c
> > index 772c219b6889..63cbb4d9938e 100644
> > --- a/arch/x86/kernel/cpu/scattered.c
> > +++ b/arch/x86/kernel/cpu/scattered.c
> > @@ -21,6 +21,7 @@ struct cpuid_bit {
> > static const struct cpuid_bit cpuid_bits[] = {
> > { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
> > { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
> > + { X86_FEATURE_IBT, CPUID_EDX, 20, 0x00000007, 0},
>
> If you haven't noticed, there's already a separate leaf:
>
> /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
>
> in arch/x86/include/asm/cpufeatures.h
>
I will change to that one. Thanks!
Yu-cheng
On Fri, Sep 21, 2018 at 08:03:26AM -0700, Yu-cheng Yu wrote:
> To support XSAVES system states, change some names to distinguish
> user and system states.
I don't understand what the logic here is. SDM says:
XSAVES—Save Processor Extended States Supervisor
the stress being on "Supervisor" - why does it need to be renamed to
"system" now?
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Tue, 2018-10-02 at 17:29 +0200, Borislav Petkov wrote:
> On Fri, Sep 21, 2018 at 08:03:26AM -0700, Yu-cheng Yu wrote:
> > To support XSAVES system states, change some names to distinguish
> > user and system states.
>
> I don't understand what the logic here is. SDM says:
>
> XSAVES—Save Processor Extended States Supervisor
>
> the stress being on "Supervisor" - why does it need to be renamed to
> "system" now?
>
Good point. However, "system" is more indicative; CET states are per-task and
not "Supervisor". Do we want to go back to "Supervisor" or add comments?
Yu-cheng
On 10/02/2018 09:21 AM, Yu-cheng Yu wrote:
> On Tue, 2018-10-02 at 17:29 +0200, Borislav Petkov wrote:
>> On Fri, Sep 21, 2018 at 08:03:26AM -0700, Yu-cheng Yu wrote:
>>> To support XSAVES system states, change some names to distinguish
>>> user and system states.
>> I don't understand what the logic here is. SDM says:
>>
>> XSAVES—Save Processor Extended States Supervisor
>>
>> the stress being on "Supervisor" - why does it need to be renamed to
>> "system" now?
>>
> Good point. However, "system" is more indicative; CET states are per-task and
> not "Supervisor". Do we want to go back to "Supervisor" or add comments?
This is one of those things where the SDM language does not match what
we use in the kernel. I think it's fine to call them "system" or
"kernel" states to make it consistent with our existing in-kernel
nomenclature.
I say add comments to clarify what the SDM calls it vs. what we do.
On Tue, Oct 02, 2018 at 09:30:52AM -0700, Dave Hansen wrote:
> > Good point. However, "system" is more indicative; CET states are per-task and
> > not "Supervisor". Do we want to go back to "Supervisor" or add comments?
>
> This is one of those things where the SDM language does not match what
> we use in the kernel. I think it's fine to call them "system" or
> "kernel" states to make it consistent with our existing in-kernel
> nomenclature.
>
> I say add comments to clarify what the SDM calls it vs. what we do.
So AFAIU, the difference is that XSAVES is a CPL0 insn. Thus the
supervisor thing, I'd guess.
Now it looks like CET uses XSAVES (from skimming the patchset forward)
but then what our nomenclature is and how it all gets tied together,
needs to be explained somewhere prominent so that we're all on the same
page.
This patch's commit message is not even close. So I'd very much
appreciate a more verbose explanation, even if it repeats itself at
places.
Thx.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On 10/02/2018 09:37 AM, Borislav Petkov wrote:
> This patch's commit message is not even close. So I'd very much
> appreciate a more verbose explanation, even if it repeats itself at
> places.
Yep, totally agree.
On Tue, 2018-10-02 at 09:39 -0700, Dave Hansen wrote:
> On 10/02/2018 09:37 AM, Borislav Petkov wrote:
> > This patch's commit message is not even close. So I'd very much
> > appreciate a more verbose explanation, even if it repeats itself at
> > places.
>
> Yep, totally agree.
Ok, I will work on that.
Yu-cheng
On Fri, Sep 21, 2018 at 08:03:27AM -0700, Yu-cheng Yu wrote:
> XSAVES saves both system and user states. The Linux kernel
> currently does not save/restore any system states. This patch
> creates the framework for supporting system states.
... and needs a lot more text explaining *why* it is doing that.
>
> Signed-off-by: Yu-cheng Yu <[email protected]>
> ---
> arch/x86/include/asm/fpu/internal.h | 3 +-
> arch/x86/include/asm/fpu/xstate.h | 9 ++-
> arch/x86/kernel/fpu/core.c | 7 +-
> arch/x86/kernel/fpu/init.c | 10 ---
> arch/x86/kernel/fpu/xstate.c | 112 +++++++++++++++++-----------
> 5 files changed, 80 insertions(+), 61 deletions(-)
>
> diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
> index f1f9bf91a0ab..1f447865db3a 100644
> --- a/arch/x86/include/asm/fpu/internal.h
> +++ b/arch/x86/include/asm/fpu/internal.h
> @@ -45,7 +45,6 @@ extern void fpu__init_cpu_xstate(void);
> extern void fpu__init_system(struct cpuinfo_x86 *c);
> extern void fpu__init_check_bugs(void);
> extern void fpu__resume_cpu(void);
> -extern u64 fpu__get_supported_xfeatures_mask(void);
>
> /*
> * Debugging facility:
> @@ -94,7 +93,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
> * trigger #GP:
> */
> xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
> - xfeatures_mask_user;
> + xfeatures_mask_all;
> }
>
> static inline void fpstate_init_fxstate(struct fxregs_state *fx)
> diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
> index 9b382e5157ed..a32dc5f8c963 100644
> --- a/arch/x86/include/asm/fpu/xstate.h
> +++ b/arch/x86/include/asm/fpu/xstate.h
> @@ -19,10 +19,10 @@
> #define XSAVE_YMM_SIZE 256
> #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
>
> -/* System features */
> -#define XFEATURE_MASK_SYSTEM (XFEATURE_MASK_PT)
Previous patch renames it, this patch deletes it. Why do we need all
that unnecessary churn?
Also, this patch is trying to do a couple of things at once and
reviewing it is not trivial. Please split the changes logically.
> diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> index 19f8df54c72a..dd2c561c4544 100644
> --- a/arch/x86/kernel/fpu/xstate.c
> +++ b/arch/x86/kernel/fpu/xstate.c
> @@ -51,13 +51,16 @@ static short xsave_cpuid_features[] __initdata = {
> };
>
> /*
> - * Mask of xstate features supported by the CPU and the kernel:
> + * Mask of xstate features supported by the CPU and the kernel.
> + * This is the result from CPUID query, SUPPORTED_XFEATURES_MASK,
> + * and boot_cpu_has().
> */
This needs to explain what both masks are - user and system. "CPU" and
"kernel" is not "user" and "all".
> u64 xfeatures_mask_user __read_mostly;
> +u64 xfeatures_mask_all __read_mostly;
> @@ -219,30 +222,31 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
> */
> void fpu__init_cpu_xstate(void)
> {
> - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_user)
> + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
> return;
> +
> + cr4_set_bits(X86_CR4_OSXSAVE);
> +
> /*
> - * Make it clear that XSAVES system states are not yet
> - * implemented should anyone expect it to work by changing
> - * bits in XFEATURE_MASK_* macros and XCR0.
> + * XCR_XFEATURE_ENABLED_MASK sets the features that are managed
> + * by XSAVE{C, OPT} and XRSTOR. Only XSAVE user states can be
> + * set here.
> */
> - WARN_ONCE((xfeatures_mask_user & XFEATURE_MASK_SYSTEM),
> - "x86/fpu: XSAVES system states are not yet implemented.\n");
> + xsetbv(XCR_XFEATURE_ENABLED_MASK,
> + xfeatures_mask_user);
No need to break the line here.
Also, you have a couple more places in your patches where you
unnecessarily break lines. Please don't do that, even if it exceeds 80
cols by a couple of chars.
>
> - xfeatures_mask_user &= ~XFEATURE_MASK_SYSTEM;
> -
> - cr4_set_bits(X86_CR4_OSXSAVE);
> - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
> + /*
> + * MSR_IA32_XSS sets which XSAVES system states to be managed by
> + * XSAVES. Only XSAVES system states can be set here.
> + */
> + if (boot_cpu_has(X86_FEATURE_XSAVES))
> + wrmsrl(MSR_IA32_XSS,
> + xfeatures_mask_all & ~xfeatures_mask_user);
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Fri, Sep 21, 2018 at 08:03:43AM -0700, Yu-cheng Yu wrote:
> WRUSS is a new kernel-mode instruction but writes directly
> to user shadow stack memory. This is used to construct
> a return address on the shadow stack for the signal
> handler.
>
> This instruction can fault if the user shadow stack is
> invalid shadow stack memory. In that case, the kernel does
> fixup.
"a fixup"
>
> Signed-off-by: Yu-cheng Yu <[email protected]>
> ---
> arch/x86/include/asm/special_insns.h | 32 ++++++++++++++++++++++++++++
> arch/x86/mm/fault.c | 9 ++++++++
> 2 files changed, 41 insertions(+)
>
> diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
> index 317fc59b512c..c04e68ef47da 100644
> --- a/arch/x86/include/asm/special_insns.h
> +++ b/arch/x86/include/asm/special_insns.h
> @@ -237,6 +237,38 @@ static inline void clwb(volatile void *__p)
> : [pax] "a" (p));
> }
>
> +#ifdef CONFIG_X86_INTEL_CET
> +#if defined(CONFIG_IA32_EMULATION) || defined(CONFIG_X86_X32)
> +static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
> +{
> + asm_volatile_goto("1: wrussd %1, (%0)\n"
> + _ASM_EXTABLE(1b, %l[fail])
> + :: "r" (addr), "r" (val)
> + :: fail);
> + return 0;
> +fail:
> + return -1;
Should it...
> +}
> +#else
> +static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
> +{
> + WARN_ONCE(1, "write_user_shstk_32 used but not supported.\n");
"is/was used"
> + return -EFAULT;
> +}
> +#endif
> +
> +static inline int write_user_shstk_64(unsigned long addr, unsigned long val)
> +{
> + asm_volatile_goto("1: wrussq %1, (%0)\n"
> + _ASM_EXTABLE(1b, %l[fail])
> + :: "r" (addr), "r" (val)
> + :: fail);
> + return 0;
> +fail:
> + return -1;
...and it be -EPERM, if -EFAULT was returned earlier for write_user_shstk_32?
> +}
> +#endif /* CONFIG_X86_INTEL_CET */
> +
> #define nop() asm volatile ("nop")
>
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 7c3877a982f4..4d4ac57a4ba2 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1305,6 +1305,15 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
> error_code |= X86_PF_USER;
> flags |= FAULT_FLAG_USER;
> } else {
> + /*
> + * WRUSS is a kernel instrcution and but writes
"WRUSS is a kernel instruction but writes"
> + * to user shadow stack. When a fault occurs,
> + * both X86_PF_USER and X86_PF_SHSTK are set.
> + * Clear X86_PF_USER here.
> + */
> + if ((error_code & (X86_PF_USER | X86_PF_SHSTK)) ==
> + (X86_PF_USER | X86_PF_SHSTK))
> + error_code &= ~X86_PF_USER;
> if (regs->flags & X86_EFLAGS_IF)
> local_irq_enable();
> }
> --
> 2.17.1
>
On Fri, Sep 21, 2018 at 08:03:48AM -0700, Yu-cheng Yu wrote:
> Create a guard area between VMAs, to detect memory corruption.
Do I understand correctly that with this patch a user space program
will no longer be able to place two mappings back to back? If so,
it will likely break a lot of things; for example, it's a common ring
buffer implementation technique to map the buffer memory twice back
to back in order to avoid special handling of items wrapping its end.
On Tue, Oct 2, 2018 at 9:55 PM Eugene Syromiatnikov <[email protected]> wrote:
>
> On Fri, Sep 21, 2018 at 08:03:48AM -0700, Yu-cheng Yu wrote:
> > Create a guard area between VMAs, to detect memory corruption.
>
> Do I understand correctly that with this patch a user space program
> no longer be able to place two mappings back to back? If it is so,
> it will likely break a lot of things; for example, it's a common ring
> buffer implementations technique, to map buffer memory twice back
> to back in order to avoid special handling of items wrapping its end.
I haven't checked what the patch actually does, but it shouldn't have
any effect on MAP_FIXED or the new no-replace MAP_FIXED variant.
--Andy
On Fri, Sep 21, 2018 at 08:03:30AM -0700, Yu-cheng Yu wrote:
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index e6db475164ed..873765adc244 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -578,6 +578,64 @@ do_general_protection(struct pt_regs *regs, long error_code)
> }
> NOKPROBE_SYMBOL(do_general_protection);
>
> +static const char *control_protection_err[] =
> +{
> + "unknown",
> + "near-ret",
> + "far-ret/iret",
> + "endbranch",
> + "rstorssp",
> + "setssbsy",
> +};
> +
> +/*
> + * When a control protection exception occurs, send a signal
> + * to the responsible application. Currently, control
> + * protection is only enabled for the user mode. This
> + * exception should not come from the kernel mode.
> + */
> +dotraplinkage void
> +do_control_protection(struct pt_regs *regs, long error_code)
> +{
> + struct task_struct *tsk;
> +
> + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> + if (notify_die(DIE_TRAP, "control protection fault", regs,
> + error_code, X86_TRAP_CP, SIGSEGV) == NOTIFY_STOP)
> + return;
> + cond_local_irq_enable(regs);
> +
> + if (!user_mode(regs))
> + die("kernel control protection fault", regs, error_code);
> +
> + if (!static_cpu_has(X86_FEATURE_SHSTK) &&
> + !static_cpu_has(X86_FEATURE_IBT))
> + WARN_ONCE(1, "CET is disabled but got control "
> + "protection fault\n");
> +
> + tsk = current;
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_CP;
> +
> + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
> + printk_ratelimit()) {
> + unsigned int max_err;
> +
> + max_err = ARRAY_SIZE(control_protection_err) - 1;
> + if ((error_code < 0) || (error_code > max_err))
> + error_code = 0;
> + pr_info("%s[%d] control protection ip:%lx sp:%lx error:%lx(%s)",
> + tsk->comm, task_pid_nr(tsk),
> + regs->ip, regs->sp, error_code,
> + control_protection_err[error_code]);
> + print_vma_addr(KERN_CONT " in ", regs->ip);
> + pr_cont("\n");
> + }
> +
> + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
That way, no information is provided to userspace (both application and
debugger), which is rather unfortunate. It would be nice if a new SEGV_*
code was added at least, and CET error (with error code constant provided
in UAPI) is passed via si_errno. (Having ip/sp/*ssp would be even
better, but I'm not exactly sure about ramifications of providing this
kind of information to user space).
On Fri, Sep 21, 2018 at 08:03:34AM -0700, Yu-cheng Yu wrote:
> Update _PAGE_DIRTY to _PAGE_DIRTY_BITS in split_2MB_gtt_entry().
>
> In order to support Control Flow Enforcement (CET), _PAGE_DIRTY
> is now _PAGE_DIRTY_HW or _PAGE_DIRTY_SW.
>
> Signed-off-by: Yu-cheng Yu <[email protected]>
> ---
> drivers/gpu/drm/i915/gvt/gtt.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
> index 00aad8164dec..2d6ba1462dd8 100644
> --- a/drivers/gpu/drm/i915/gvt/gtt.c
> +++ b/drivers/gpu/drm/i915/gvt/gtt.c
> @@ -1170,7 +1170,7 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
> }
>
> /* Clear dirty field. */
> - se->val64 &= ~_PAGE_DIRTY;
> + se->val64 &= ~_PAGE_DIRTY_BITS;
_PAGE_DIRTY_BITS is defined only in "[RFC PATCH v4 11/27] x86/mm:
Introduce _PAGE_DIRTY_SW",
On Fri, Sep 21, 2018 at 08:03:33AM -0700, Yu-cheng Yu wrote:
> We are going to create _PAGE_DIRTY_SW for non-hardware, memory
> management purposes. Rename _PAGE_DIRTY to _PAGE_DIRTY_HW and
> _PAGE_BIT_DIRTY to _PAGE_BIT_DIRTY_HW to make these PTE dirty
> bits more clear. There are no functional changes in this
> patch.
I would like there to be some documentation in this patchset which
explains the difference between PAGE_SOFT_DIRTY and PAGE_DIRTY_SW.
Also, is it really necessary to rename PAGE_DIRTY? It feels like a
lot of churn.
On 10/03/2018 06:38 AM, Matthew Wilcox wrote:
> On Fri, Sep 21, 2018 at 08:03:33AM -0700, Yu-cheng Yu wrote:
>> We are going to create _PAGE_DIRTY_SW for non-hardware, memory
>> management purposes. Rename _PAGE_DIRTY to _PAGE_DIRTY_HW and
>> _PAGE_BIT_DIRTY to _PAGE_BIT_DIRTY_HW to make these PTE dirty
>> bits more clear. There are no functional changes in this
>> patch.
> I would like there to be some documentation in this patchset which
> explains the difference between PAGE_SOFT_DIRTY and PAGE_DIRTY_SW.
>
> Also, is it really necessary to rename PAGE_DIRTY? It feels like a
> lot of churn.
This is a lot of churn? Are we looking at the same patch? :)
arch/x86/include/asm/pgtable.h | 6 +++---
arch/x86/include/asm/pgtable_types.h | 17 +++++++++--------
arch/x86/kernel/relocate_kernel_64.S | 2 +-
arch/x86/kvm/vmx.c | 2 +-
4 files changed, 14 insertions(+), 13 deletions(-)
But, yeah, I think we need to. While it will take a little adjustment
in the brains of us old-timers and a bit of pain when switching from old
kernels to new, this makes it a lot more clear what is going on.
On Fri, Sep 21, 2018 at 08:03:44AM -0700, Yu-cheng Yu wrote:
> When setting up a signal, the kernel creates a shadow stack
> restore token at the current SHSTK address and then stores the
> token's address in the signal frame, right after the FPU state.
> Before restoring a signal, the kernel verifies and then uses the
> restore token to set the SHSTK pointer.
> diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
> index ec256ae27a31..5cc4be6e0982 100644
> --- a/arch/x86/kernel/cet.c
> +++ b/arch/x86/kernel/cet.c
> @@ -46,6 +47,69 @@ static unsigned long get_shstk_addr(void)
> return ptr;
> }
>
> +/*
> + * Verify the restore token at the address of 'ssp' is
> + * valid and then set shadow stack pointer according to the
> + * token.
> + */
> +static int verify_rstor_token(bool ia32, unsigned long ssp,
> + unsigned long *new_ssp)
> +{
> + unsigned long token;
> +
> + *new_ssp = 0;
> +
> + if (!IS_ALIGNED(ssp, 8))
> + return -EINVAL;
> +
> + if (get_user(token, (unsigned long __user *)ssp))
> + return -EFAULT;
> +
> + /* Is 64-bit mode flag correct? */
> + if (ia32 && (token & 3) != 0)
> + return -EINVAL;
> + else if ((token & 3) != 1)
> + return -EINVAL;
It is probably worth adding constant names for these flags; for example,
there's Section 2.4 in the currently available description[1], and
it took some time before I decided to look into other patches
and find the patch with the documentation (or finally notice section 2.7).
[1] https://software.intel.com/sites/default/files/managed/4d/2a/control-flow-enforcement-technology-preview.pdf
> + token &= ~(1UL);
> +
> + if ((!ia32 && !IS_ALIGNED(token, 8)) || !IS_ALIGNED(token, 4))
> + return -EINVAL;
> +
> + if ((ALIGN_DOWN(token, 8) - 8) != ssp)
> + return -EINVAL;
> +
> + *new_ssp = token;
> + return 0;
> +}
> +
> +/*
> + * Create a restore token on the shadow stack.
> + * A token is always 8-byte and aligned to 8.
> + */
> +static int create_rstor_token(bool ia32, unsigned long ssp,
> + unsigned long *new_ssp)
> +{
> + unsigned long addr;
> +
> + *new_ssp = 0;
> +
> + if ((!ia32 && !IS_ALIGNED(ssp, 8)) || !IS_ALIGNED(ssp, 4))
> + return -EINVAL;
Maybe refactor this check into a separate function/macro?
> +
> + addr = ALIGN_DOWN(ssp, 8) - 8;
> +
> + /* Is the token for 64-bit? */
> + if (!ia32)
> + ssp |= 1;
Again, usage of a named constant might document it better.
> +
> + if (write_user_shstk_64(addr, ssp))
This function is defined in "[RFC PATCH v4 19/27] x86/cet/shstk:
Introduce WRUSS instruction"
> + return -EFAULT;
> +
> + *new_ssp = addr;
> + return 0;
> +}
> +
> int cet_setup_shstk(void)
> {
> unsigned long addr, size;
> @@ -107,3 +171,54 @@ void cet_disable_free_shstk(struct task_struct *tsk)
>
> tsk->thread.cet.shstk_enabled = 0;
> }
> +
> +int cet_restore_signal(unsigned long ssp)
> +{
> + unsigned long new_ssp;
> + int err;
> +
> + if (!current->thread.cet.shstk_enabled)
> + return 0;
> +
> + err = verify_rstor_token(in_ia32_syscall(), ssp, &new_ssp);
> +
> + if (err)
> + return err;
> +
> + return set_shstk_ptr(new_ssp);
> +}
> +
> +/*
> + * Setup the shadow stack for the signal handler: first,
> + * create a restore token to keep track of the current ssp,
> + * and then the return address of the signal handler.
> + */
> +int cet_setup_signal(bool ia32, unsigned long rstor_addr,
> + unsigned long *new_ssp)
> +{
> + unsigned long ssp;
> + int err;
> +
> + if (!current->thread.cet.shstk_enabled)
> + return 0;
> +
> + ssp = get_shstk_addr();
> + err = create_rstor_token(ia32, ssp, new_ssp);
> +
> + if (err)
> + return err;
> +
> + if (ia32) {
> + ssp = *new_ssp - sizeof(u32);
> + err = write_user_shstk_32(ssp, (unsigned int)rstor_addr);
> + } else {
> + ssp = *new_ssp - sizeof(u64);
> + err = write_user_shstk_64(ssp, rstor_addr);
> + }
> +
> + if (err)
> + return err;
> +
> + set_shstk_ptr(ssp);
> + return 0;
> +}
> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
> index 92a3b312a53c..e9a85689143f 100644
> --- a/arch/x86/kernel/signal.c
> +++ b/arch/x86/kernel/signal.c
> @@ -46,6 +46,7 @@
>
> #include <asm/sigframe.h>
> #include <asm/signal.h>
> +#include <asm/cet.h>
>
> #define COPY(x) do { \
> get_user_ex(regs->x, &sc->x); \
> @@ -152,6 +153,10 @@ static int restore_sigcontext(struct pt_regs *regs,
>
> err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
>
> +#ifdef CONFIG_X86_64
> + err |= restore_sigcontext_ext(buf);
> +#endif
> +
> force_iret();
>
> return err;
> @@ -266,6 +271,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
> }
>
> if (fpu->initialized) {
> +#ifdef CONFIG_X86_64
> + /* sigcontext extension */
> + if (boot_cpu_has(X86_FEATURE_SHSTK))
> + sp -= sizeof(struct sc_ext) + 8;
> +#endif
> sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
> &buf_fx, &math_size);
That might be refactored in a separate function.
Also, it looks like the possible padding for 8-byte alignment
(copy_ext_{to,from}_user) is not accounted for here.
> *fpstate = (void __user *)sp;
> @@ -493,6 +503,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
> err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
> err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
>
> + if (!err)
> + err = setup_sigcontext_ext(ksig, fp);
> +
Why is this not in setup_sigcontext, for example?
> if (err)
> return -EFAULT;
>
> @@ -576,6 +589,9 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
> regs, set->sig[0]);
> err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
>
> + if (!err)
> + err = setup_sigcontext_ext(ksig, fpstate);
> +
> if (err)
> return -EFAULT;
>
> @@ -707,6 +723,86 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
> }
> }
>
> +#ifdef CONFIG_X86_64
> +static int copy_ext_from_user(struct sc_ext *ext, void __user *fpu)
> +{
> + void __user *p;
> +
> + if (!fpu)
> + return -EINVAL;
> +
> + p = fpu + fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
> + p = (void __user *)ALIGN((unsigned long)p, 8);
> +
> + if (!access_ok(VERIFY_READ, p, sizeof(*ext)))
> + return -EFAULT;
> +
> + if (__copy_from_user(ext, p, sizeof(*ext)))
> + return -EFAULT;
> +
> + if (ext->total_size != sizeof(*ext))
> + return -EINVAL;
> + return 0;
> +}
> +
> +static int copy_ext_to_user(void __user *fpu, struct sc_ext *ext)
> +{
> + void __user *p;
> +
> + if (!fpu)
> + return -EINVAL;
> +
> + if (ext->total_size != sizeof(*ext))
> + return -EINVAL;
> +
> + p = fpu + fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
> + p = (void __user *)ALIGN((unsigned long)p, 8);
> +
> + if (!access_ok(VERIFY_WRITE, p, sizeof(*ext)))
> + return -EFAULT;
> +
> + if (__copy_to_user(p, ext, sizeof(*ext)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +int restore_sigcontext_ext(void __user *fp)
> +{
> + int err = 0;
> +
> + if (boot_cpu_has(X86_FEATURE_SHSTK) && fp) {
> + struct sc_ext ext = {0, 0};
> +
> + err = copy_ext_from_user(&ext, fp);
> +
> + if (!err)
> + err = cet_restore_signal(ext.ssp);
> + }
> +
> + return err;
> +}
> +
> +int setup_sigcontext_ext(struct ksignal *ksig, void __user *fp)
> +{
> + int err = 0;
> +
> + if (boot_cpu_has(X86_FEATURE_SHSTK) && fp) {
> + struct sc_ext ext = {0, 0};
> + unsigned long rstor;
> +
> + rstor = (unsigned long)ksig->ka.sa.sa_restorer;
> + err = cet_setup_signal(is_ia32_frame(ksig), rstor, &ext.ssp);
> + if (!err) {
> + ext.total_size = sizeof(ext);
> + err = copy_ext_to_user(fp, &ext);
> + }
> + }
> +
> + return err;
> +}
> +#endif
> +
> static void
> handle_signal(struct ksignal *ksig, struct pt_regs *regs)
> {
> --
> 2.17.1
>
On Fri, Sep 21, 2018 at 08:03:42AM -0700, Yu-cheng Yu wrote:
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 5ea1d64cb0b4..b20450dde5b7 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -652,6 +652,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
> [ilog2(VM_PKEY_BIT4)] = "",
> #endif
> #endif /* CONFIG_ARCH_HAS_PKEYS */
> +#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
> + [ilog2(VM_SHSTK)] = "ss"
> +#endif
It probably makes sense to have this hunk as a part of "x86/cet/shstk:
Add Kconfig option for user-mode shadow stack", where VM_SHSTK was
initially introduced.
On Wed, 2018-10-03 at 17:08 +0200, Eugene Syromiatnikov wrote:
> On Fri, Sep 21, 2018 at 08:03:42AM -0700, Yu-cheng Yu wrote:
>
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 5ea1d64cb0b4..b20450dde5b7 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -652,6 +652,9 @@ static void show_smap_vma_flags(struct seq_file *m,
> > struct vm_area_struct *vma)
> > [ilog2(VM_PKEY_BIT4)] = "",
> > #endif
> > #endif /* CONFIG_ARCH_HAS_PKEYS */
> > +#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
> > + [ilog2(VM_SHSTK)] = "ss"
> > +#endif
>
> It's probably makes sense to have this hunk as a part of "x86/cet/shstk:
> Add Kconfig option for user-mode shadow stack", where VM_SHSTK was
> initially introduced.
Yes, move it to "mm/Introduce VM_SHSTK for shadow stack memory".
Yu-cheng
On Tue, 2018-10-02 at 22:36 -0700, Andy Lutomirski wrote:
> On Tue, Oct 2, 2018 at 9:55 PM Eugene Syromiatnikov <[email protected]> wrote:
> >
> > On Fri, Sep 21, 2018 at 08:03:48AM -0700, Yu-cheng Yu wrote:
> > > Create a guard area between VMAs, to detect memory corruption.
> >
> > Do I understand correctly that with this patch a user space program
> > no longer be able to place two mappings back to back? If it is so,
> > it will likely break a lot of things; for example, it's a common ring
> > buffer implementations technique, to map buffer memory twice back
> > to back in order to avoid special handling of items wrapping its end.
>
> I haven't checked what the patch actually does, but it shouldn't have
> any affect on MAP_FIXED or the new no-replace MAP_FIXED variant.
>
> --Andy
I did some mmap tests with/without MAP_FIXED, and it works as intended.
In addition to the ring buffer, are there other test cases?
Yu-cheng
On Wed, 2018-10-03 at 06:38 -0700, Matthew Wilcox wrote:
> On Fri, Sep 21, 2018 at 08:03:33AM -0700, Yu-cheng Yu wrote:
> > We are going to create _PAGE_DIRTY_SW for non-hardware, memory
> > management purposes. Rename _PAGE_DIRTY to _PAGE_DIRTY_HW and
> > _PAGE_BIT_DIRTY to _PAGE_BIT_DIRTY_HW to make these PTE dirty
> > bits more clear. There are no functional changes in this
> > patch.
>
> I would like there to be some documentation in this patchset which
> explains the difference between PAGE_SOFT_DIRTY and PAGE_DIRTY_SW.
I will add some comments for the difference between PAGE_SOFT_DIRTY and
PAGE_DIRTY_SW.
Yu-cheng
On Wed, Oct 3, 2018 at 9:06 AM Yu-cheng Yu <[email protected]> wrote:
>
> On Tue, 2018-10-02 at 22:36 -0700, Andy Lutomirski wrote:
> > On Tue, Oct 2, 2018 at 9:55 PM Eugene Syromiatnikov <[email protected]> wrote:
> > >
> > > On Fri, Sep 21, 2018 at 08:03:48AM -0700, Yu-cheng Yu wrote:
> > > > Create a guard area between VMAs, to detect memory corruption.
> > >
> > > Do I understand correctly that with this patch a user space program
> > > no longer be able to place two mappings back to back? If it is so,
> > > it will likely break a lot of things; for example, it's a common ring
> > > buffer implementations technique, to map buffer memory twice back
> > > to back in order to avoid special handling of items wrapping its end.
> >
> > I haven't checked what the patch actually does, but it shouldn't have
> > any affect on MAP_FIXED or the new no-replace MAP_FIXED variant.
> >
> > --Andy
>
> I did some mmap tests with/without MAP_FIXED, and it works as intended.
> In addition to the ring buffer, are there other test cases?
>
Various ELF loaders, perhaps? Do they use MAP_FIXED or do they just
use address hints?
On Wed, 2018-10-03 at 12:39 +0200, Eugene Syromiatnikov wrote:
> On Fri, Sep 21, 2018 at 08:03:30AM -0700, Yu-cheng Yu wrote:
> > +dotraplinkage void
> > +do_control_protection(struct pt_regs *regs, long error_code)
> > +{
> > + struct task_struct *tsk;
> > +
> > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> > + if (notify_die(DIE_TRAP, "control protection fault", regs,
> > + error_code, X86_TRAP_CP, SIGSEGV) == NOTIFY_STOP)
> > + return;
> > + cond_local_irq_enable(regs);
> > +
> > + if (!user_mode(regs))
> > + die("kernel control protection fault", regs, error_code);
> > +
> > + if (!static_cpu_has(X86_FEATURE_SHSTK) &&
> > + !static_cpu_has(X86_FEATURE_IBT))
> > + WARN_ONCE(1, "CET is disabled but got control "
> > + "protection fault\n");
> > +
> > + tsk = current;
> > + tsk->thread.error_code = error_code;
> > + tsk->thread.trap_nr = X86_TRAP_CP;
> > +
> > + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
> > + printk_ratelimit()) {
> > + unsigned int max_err;
> > +
> > + max_err = ARRAY_SIZE(control_protection_err) - 1;
> > + if ((error_code < 0) || (error_code > max_err))
> > + error_code = 0;
> > + pr_info("%s[%d] control protection ip:%lx sp:%lx
> > error:%lx(%s)",
> > + tsk->comm, task_pid_nr(tsk),
> > + regs->ip, regs->sp, error_code,
> > + control_protection_err[error_code]);
> > + print_vma_addr(KERN_CONT " in ", regs->ip);
> > + pr_cont("\n");
> > + }
> > +
> > + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
>
> That way, no information is provided to userspace (both application and
> debugger), which is rather unfortunate. It would be nice if a new SEGV_*
> code was added at least, and CET error (with error code constant provided
> in UAPI) is passed via si_errno. (Having ip/sp/*ssp would be even
> better, but I'm not exactly sure about ramifications of providing this
> kind of information to user space).
Ok, I will add that.
Yu-cheng
On Wed, Oct 03, 2018 at 09:00:04AM -0700, Yu-cheng Yu wrote:
> On Tue, 2018-10-02 at 22:36 -0700, Andy Lutomirski wrote:
> > On Tue, Oct 2, 2018 at 9:55 PM Eugene Syromiatnikov <[email protected]> wrote:
> > >
> > > On Fri, Sep 21, 2018 at 08:03:48AM -0700, Yu-cheng Yu wrote:
> > > > Create a guard area between VMAs, to detect memory corruption.
> > >
> > > Do I understand correctly that with this patch a user space program
> > > no longer be able to place two mappings back to back? If it is so,
> > > it will likely break a lot of things; for example, it's a common ring
> > > buffer implementations technique, to map buffer memory twice back
> > > to back in order to avoid special handling of items wrapping its end.
> >
> > I haven't checked what the patch actually does, but it shouldn't have
> > any affect on MAP_FIXED or the new no-replace MAP_FIXED variant.
> >
> > --Andy
>
> I did some mmap tests with/without MAP_FIXED, and it works as intended.
> In addition to the ring buffer, are there other test cases?
Right, after some more code reading I figured out that it indeed
shouldn't affect MAP_FIXED, thank you for confirmation.
I'm not sure, however, whether such a change that provides no ability
to configure or affect it will go well with all the supported
architectures.
On Wed, 2018-10-03 at 18:32 +0200, Eugene Syromiatnikov wrote:
> On Wed, Oct 03, 2018 at 09:00:04AM -0700, Yu-cheng Yu wrote:
> > On Tue, 2018-10-02 at 22:36 -0700, Andy Lutomirski wrote:
> > > On Tue, Oct 2, 2018 at 9:55 PM Eugene Syromiatnikov <[email protected]>
> > > wrote:
> > > >
> > > > On Fri, Sep 21, 2018 at 08:03:48AM -0700, Yu-cheng Yu wrote:
> > > > > Create a guard area between VMAs, to detect memory corruption.
> > > >
> > > > Do I understand correctly that with this patch a user space program
> > > > no longer be able to place two mappings back to back? If it is so,
> > > > it will likely break a lot of things; for example, it's a common ring
> > > > buffer implementations technique, to map buffer memory twice back
> > > > to back in order to avoid special handling of items wrapping its end.
> > >
> > > I haven't checked what the patch actually does, but it shouldn't have
> > > any affect on MAP_FIXED or the new no-replace MAP_FIXED variant.
> > >
> > > --Andy
> >
> > I did some mmap tests with/without MAP_FIXED, and it works as intended.
> > In addition to the ring buffer, are there other test cases?
>
> Right, after some more code reading I figured out that it indeed
> shouldn't affect MAP_FIXED, thank you for confirmation.
>
> I'm not sure, however, whether such a change that provides no ability
> to configure or affect it will go well with all the supported
> architectures.
Can we do CONFIG_MMAP_GUARD_GAP?
On Fri, Sep 21, 2018 at 5:09 PM Yu-cheng Yu <[email protected]> wrote:
> When setting up a signal, the kernel creates a shadow stack
> restore token at the current SHSTK address and then stores the
> token's address in the signal frame, right after the FPU state.
> Before restoring a signal, the kernel verifies and then uses the
> restore token to set the SHSTK pointer.
[...]
> +#ifdef CONFIG_X86_64
> +static int copy_ext_from_user(struct sc_ext *ext, void __user *fpu)
> +{
> + void __user *p;
> +
> + if (!fpu)
> + return -EINVAL;
> +
> + p = fpu + fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
> + p = (void __user *)ALIGN((unsigned long)p, 8);
> +
> + if (!access_ok(VERIFY_READ, p, sizeof(*ext)))
> + return -EFAULT;
> +
> + if (__copy_from_user(ext, p, sizeof(*ext)))
> + return -EFAULT;
Why do you first manually call access_ok(), then call
__copy_from_user() with the same size? Just use "if
(copy_from_user(ext, p, sizeof(*ext)))" (without underscores) and get
rid of the access_ok().
> + if (ext->total_size != sizeof(*ext))
> + return -EINVAL;
> + return 0;
> +}
> +
> +static int copy_ext_to_user(void __user *fpu, struct sc_ext *ext)
> +{
> + void __user *p;
> +
> + if (!fpu)
> + return -EINVAL;
> +
> + if (ext->total_size != sizeof(*ext))
> + return -EINVAL;
> +
> + p = fpu + fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
> + p = (void __user *)ALIGN((unsigned long)p, 8);
> +
> + if (!access_ok(VERIFY_WRITE, p, sizeof(*ext)))
> + return -EFAULT;
> +
> + if (__copy_to_user(p, ext, sizeof(*ext)))
> + return -EFAULT;
Same as above.
> + return 0;
> +}
On Wed, Oct 3, 2018 at 6:32 PM Eugene Syromiatnikov <[email protected]> wrote:
> On Wed, Oct 03, 2018 at 09:00:04AM -0700, Yu-cheng Yu wrote:
> > On Tue, 2018-10-02 at 22:36 -0700, Andy Lutomirski wrote:
> > > On Tue, Oct 2, 2018 at 9:55 PM Eugene Syromiatnikov <[email protected]> wrote:
> > > >
> > > > On Fri, Sep 21, 2018 at 08:03:48AM -0700, Yu-cheng Yu wrote:
> > > > > Create a guard area between VMAs, to detect memory corruption.
> > > >
> > > > Do I understand correctly that with this patch a user space program
> > > > no longer be able to place two mappings back to back? If it is so,
> > > > it will likely break a lot of things; for example, it's a common ring
> > > > buffer implementations technique, to map buffer memory twice back
> > > > to back in order to avoid special handling of items wrapping its end.
> > >
> > > I haven't checked what the patch actually does, but it shouldn't have
> > > any affect on MAP_FIXED or the new no-replace MAP_FIXED variant.
> > >
> > > --Andy
> >
> > I did some mmap tests with/without MAP_FIXED, and it works as intended.
> > In addition to the ring buffer, are there other test cases?
>
> Right, after some more code reading I figured out that it indeed
> shouldn't affect MAP_FIXED, thank you for confirmation.
>
> I'm not sure, however, whether such a change that provides no ability
> to configure or affect it will go well with all the supported
> architectures.
Is there a concrete reason why you think an architecture might not
like this? As far as I can tell, the virtual address space overhead
should be insignificant even for 32-bit systems.
On Fri, Sep 21, 2018 at 08:03:50AM -0700, Yu-cheng Yu wrote:
> arch_prctl(ARCH_CET_STATUS, unsigned long *addr)
> Return CET feature status.
>
> The parameter 'addr' is a pointer to a user buffer.
> On returning to the caller, the kernel fills the following
> information:
>
> *addr = SHSTK/IBT status
> *(addr + 1) = SHSTK base address
> *(addr + 2) = SHSTK size
The subtle detail here is that x32 binaries will get a 64-bit value, which
is not entirely obvious. I think it might be better to define
a structure type for it as a part of UAPI, for example:
struct user_cet_status {
__u32 struct_size;
__u32 features;
__kernel_ulong_t shstk_base;
__kernel_ulong_t shstk_size;
};
Adding "struct_size" field along with appropriate checks will also
allow for possible extensions, if they ever appear.
> arch_prctl(ARCH_CET_DISABLE, unsigned long features)
> Disable CET features specified in 'features'. Return
> -EPERM if CET is locked.
While x86_64 and x32 will have 64-bit space for feature bits, IA-32 will
have only 32 bits.
> arch_prctl(ARCH_CET_LOCK)
> Lock in CET feature.
>
> arch_prctl(ARCH_CET_ALLOC_SHSTK, unsigned long *addr)
> Allocate a new SHSTK.
>
> The parameter 'addr' is a pointer to a user buffer and indicates
> the desired SHSTK size to allocate. On returning to the caller
> the buffer contains the address of the new SHSTK.
Again, on x32 that will be a pointer to a 64-bit value, which is not
entirely obvious from this description.
It's not clear whether the inability to enable a CET feature at runtime
is by design or by omission; the same goes for setting an (allocated)
shadow stack as the task's shadow stack.
>
> Signed-off-by: H.J. Lu <[email protected]>
> Signed-off-by: Yu-cheng Yu <[email protected]>
> ---
> arch/x86/include/asm/cet.h | 5 ++
> arch/x86/include/uapi/asm/prctl.h | 5 ++
> arch/x86/kernel/Makefile | 2 +-
> arch/x86/kernel/cet.c | 27 +++++++++++
> arch/x86/kernel/cet_prctl.c | 79 +++++++++++++++++++++++++++++++
> arch/x86/kernel/process.c | 5 ++
> 6 files changed, 122 insertions(+), 1 deletion(-)
> create mode 100644 arch/x86/kernel/cet_prctl.c
>
> diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
> index b7b33e1026bb..212bd68e31d3 100644
> --- a/arch/x86/include/asm/cet.h
> +++ b/arch/x86/include/asm/cet.h
> @@ -12,19 +12,24 @@ struct task_struct;
> struct cet_status {
> unsigned long shstk_base;
> unsigned long shstk_size;
> + unsigned int locked:1;
> unsigned int shstk_enabled:1;
> };
>
> #ifdef CONFIG_X86_INTEL_CET
> +int prctl_cet(int option, unsigned long arg2);
> int cet_setup_shstk(void);
> int cet_setup_thread_shstk(struct task_struct *p);
> +int cet_alloc_shstk(unsigned long *arg);
> void cet_disable_shstk(void);
> void cet_disable_free_shstk(struct task_struct *p);
> int cet_restore_signal(unsigned long ssp);
> int cet_setup_signal(bool ia32, unsigned long rstor, unsigned long *new_ssp);
> #else
> +static inline int prctl_cet(int option, unsigned long arg2) { return 0; }
Why 0 and not -EINVAL?
> static inline int cet_setup_shstk(void) { return 0; }
0 here also looks strange.
> static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
And here.
> +static inline int cet_alloc_shstk(unsigned long *arg) { return -EINVAL; }
> static inline void cet_disable_shstk(void) {}
> static inline void cet_disable_free_shstk(struct task_struct *p) {}
> static inline int cet_restore_signal(unsigned long ssp) { return 0; }
> diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
> index 5a6aac9fa41f..3aec1088e01d 100644
> --- a/arch/x86/include/uapi/asm/prctl.h
> +++ b/arch/x86/include/uapi/asm/prctl.h
> @@ -14,4 +14,9 @@
> #define ARCH_MAP_VDSO_32 0x2002
> #define ARCH_MAP_VDSO_64 0x2003
>
> +#define ARCH_CET_STATUS 0x3001
> +#define ARCH_CET_DISABLE 0x3002
> +#define ARCH_CET_LOCK 0x3003
> +#define ARCH_CET_ALLOC_SHSTK 0x3004
> +
> #endif /* _ASM_X86_PRCTL_H */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 36b14ef410c8..b9e6cdc6b4f7 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -139,7 +139,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
> obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
> obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
>
> -obj-$(CONFIG_X86_INTEL_CET) += cet.o
> +obj-$(CONFIG_X86_INTEL_CET) += cet.o cet_prctl.o
>
> obj-$(CONFIG_ARCH_HAS_PROGRAM_PROPERTIES) += elf.o
>
> diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
> index ce0b3b7b1160..1c2689738604 100644
> --- a/arch/x86/kernel/cet.c
> +++ b/arch/x86/kernel/cet.c
> @@ -110,6 +110,33 @@ static int create_rstor_token(bool ia32, unsigned long ssp,
> return 0;
> }
>
> +int cet_alloc_shstk(unsigned long *arg)
> +{
> + unsigned long len = *arg;
> + unsigned long addr;
> + unsigned long token;
> + unsigned long ssp;
> +
> + addr = do_mmap_locked(0, len, PROT_READ,
> + MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK);
> + if (addr >= TASK_SIZE_MAX)
> + return -ENOMEM;
> +
> + /* Restore token is 8 bytes and aligned to 8 bytes */
> + ssp = addr + len;
> + token = ssp;
> +
> + if (!in_ia32_syscall())
> + token |= 1;
This pair of check and bit or'ing definitely asks for a macro or a
wrapper function.
> + ssp -= 8;
> +
> + if (write_user_shstk_64(ssp, token))
> + return -EINVAL;
Shouldn't addr be unmapped on error?
> + *arg = addr;
> + return 0;
> +}
> +
> int cet_setup_shstk(void)
> {
> unsigned long addr, size;
> diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
> new file mode 100644
> index 000000000000..c4b7c19f5040
> --- /dev/null
> +++ b/arch/x86/kernel/cet_prctl.c
> @@ -0,0 +1,79 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/errno.h>
> +#include <linux/uaccess.h>
> +#include <linux/prctl.h>
> +#include <linux/compat.h>
> +#include <asm/processor.h>
> +#include <asm/prctl.h>
> +#include <asm/elf.h>
> +#include <asm/elf_property.h>
> +#include <asm/cet.h>
> +
> +/* See Documentation/x86/intel_cet.txt. */
> +
> +static int handle_get_status(unsigned long arg2)
> +{
> + unsigned int features = 0;
> + unsigned long shstk_base, shstk_size;
> + unsigned long buf[3];
> +
> + if (current->thread.cet.shstk_enabled)
> + features |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
> +
> + shstk_base = current->thread.cet.shstk_base;
> + shstk_size = current->thread.cet.shstk_size;
> +
> + buf[0] = (unsigned long)features;
> + buf[1] = shstk_base;
> + buf[2] = shstk_size;
> + return copy_to_user((unsigned long __user *)arg2, buf,
> + sizeof(buf));
> +}
> +
> +static int handle_alloc_shstk(unsigned long arg2)
> +{
> + int err = 0;
> + unsigned long shstk_size = 0;
> +
> + if (get_user(shstk_size, (unsigned long __user *)arg2))
> + return -EFAULT;
> +
> + err = cet_alloc_shstk(&shstk_size);
> + if (err)
> + return err;
> +
> + if (put_user(shstk_size, (unsigned long __user *)arg2))
Again, leaking allocated stack.
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +int prctl_cet(int option, unsigned long arg2)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> + return -EINVAL;
> +
> + switch (option) {
> + case ARCH_CET_STATUS:
> + return handle_get_status(arg2);
> +
> + case ARCH_CET_DISABLE:
> + if (current->thread.cet.locked)
> + return -EPERM;
> + if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
> + cet_disable_free_shstk(current);
The rest of the bits in arg2 should be required to be 0; otherwise it
won't be possible to extend this interface.
> + return 0;
> +
> + case ARCH_CET_LOCK:
> + current->thread.cet.locked = 1;
> + return 0;
> +
> + case ARCH_CET_ALLOC_SHSTK:
> + return handle_alloc_shstk(arg2);
> +
> + default:
> + return -EINVAL;
> + }
> +}
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 440f012ef925..251b8714f9a3 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -792,6 +792,11 @@ long do_arch_prctl_common(struct task_struct *task, int option,
> return get_cpuid_mode();
> case ARCH_SET_CPUID:
> return set_cpuid_mode(task, cpuid_enabled);
> + case ARCH_CET_STATUS:
> + case ARCH_CET_DISABLE:
> + case ARCH_CET_LOCK:
> + case ARCH_CET_ALLOC_SHSTK:
> + return prctl_cet(option, cpuid_enabled);
It's probably a good opportunity to change the strange name for an argument
of a dispatch call.
On Wed, Oct 03, 2018 at 06:52:40PM +0200, Jann Horn wrote:
> On Wed, Oct 3, 2018 at 6:32 PM Eugene Syromiatnikov <[email protected]> wrote:
> > I'm not sure, however, whether such a change that provides no ability
> > to configure or affect it will go well with all the supported
> > architectures.
>
> Is there a concrete reason why you think an architecture might not
> like this? As far as I can tell, the virtual address space overhead
> should be insignificant even for 32-bit systems.
Not really, and not architectures per se, but judging by some past
experiences with enabling ASLR, I would expect that all kinds of weird
applications may start to behave in all kinds of strange ways.
Not that I have anything more than this doubt, however; but this sort of
change without any ability to tune or revert it still looks unusual to me.
On Fri, Sep 21, 2018 at 08:03:45AM -0700, Yu-cheng Yu wrote:
> Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
> to be enabled for the task.
>
> Signed-off-by: H.J. Lu <[email protected]>
> Signed-off-by: Yu-cheng Yu <[email protected]>
> ---
> arch/x86/Kconfig | 4 +
> arch/x86/include/asm/elf.h | 5 +
> arch/x86/include/uapi/asm/elf_property.h | 15 +
> arch/x86/kernel/Makefile | 2 +
> arch/x86/kernel/elf.c | 340 +++++++++++++++++++++++
> fs/binfmt_elf.c | 15 +
> include/uapi/linux/elf.h | 1 +
> 7 files changed, 382 insertions(+)
> create mode 100644 arch/x86/include/uapi/asm/elf_property.h
> create mode 100644 arch/x86/kernel/elf.c
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 808aa3aecf3c..6377125543cc 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1919,12 +1919,16 @@ config X86_INTEL_CET
> config ARCH_HAS_SHSTK
> def_bool n
>
> +config ARCH_HAS_PROGRAM_PROPERTIES
> + def_bool n
> +
> config X86_INTEL_SHADOW_STACK_USER
> prompt "Intel Shadow Stack for user-mode"
> def_bool n
> depends on CPU_SUP_INTEL && X86_64
> select X86_INTEL_CET
> select ARCH_HAS_SHSTK
> + select ARCH_HAS_PROGRAM_PROPERTIES
> ---help---
> Shadow stack provides hardware protection against program stack
> corruption. Only when all the following are true will an application
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 0d157d2a1e2a..5b5f169c5c07 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -382,4 +382,9 @@ struct va_alignment {
>
> extern struct va_alignment va_align;
> extern unsigned long align_vdso_addr(unsigned long);
> +
> +#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
> +extern int arch_setup_features(void *ehdr, void *phdr, struct file *file,
> + bool interp);
> +#endif
> #endif /* _ASM_X86_ELF_H */
> diff --git a/arch/x86/include/uapi/asm/elf_property.h b/arch/x86/include/uapi/asm/elf_property.h
> new file mode 100644
> index 000000000000..af361207718c
> --- /dev/null
> +++ b/arch/x86/include/uapi/asm/elf_property.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_ASM_X86_ELF_PROPERTY_H
> +#define _UAPI_ASM_X86_ELF_PROPERTY_H
> +
> +/*
> + * pr_type
> + */
> +#define GNU_PROPERTY_X86_FEATURE_1_AND (0xc0000002)
> +
> +/*
> + * Bits for GNU_PROPERTY_X86_FEATURE_1_AND
> + */
> +#define GNU_PROPERTY_X86_FEATURE_1_SHSTK (0x00000002)
Hm, these definitions aren't much different compared to the NT_*
definitions in include/uapi/linux/elf.h; is it expected that those
properties have to be parsed individually for each architecture?
> +
> +#endif /* _UAPI_ASM_X86_ELF_PROPERTY_H */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index fbb2d91fb756..36b14ef410c8 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -141,6 +141,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
>
> obj-$(CONFIG_X86_INTEL_CET) += cet.o
>
> +obj-$(CONFIG_ARCH_HAS_PROGRAM_PROPERTIES) += elf.o
Same thing here, enablement of program properties per se seems rather generic.
> diff --git a/arch/x86/kernel/elf.c b/arch/x86/kernel/elf.c
> new file mode 100644
> index 000000000000..2fddd0bc545b
> --- /dev/null
> +++ b/arch/x86/kernel/elf.c
> @@ -0,0 +1,340 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Look at an ELF file's .note.gnu.property and determine if the file
> + * supports shadow stack and/or indirect branch tracking.
> + * The path from the ELF header to the note section is the following:
> + * elfhdr->elf_phdr->elf_note->property[].
> + */
> +
> +#include <asm/cet.h>
> +#include <asm/elf_property.h>
> +#include <asm/prctl.h>
> +#include <asm/processor.h>
> +#include <uapi/linux/elf-em.h>
> +#include <uapi/linux/prctl.h>
> +#include <linux/binfmts.h>
> +#include <linux/elf.h>
> +#include <linux/slab.h>
> +#include <linux/fs.h>
> +#include <linux/uaccess.h>
> +#include <linux/string.h>
> +#include <linux/compat.h>
> +
> +/*
> + * The .note.gnu.property layout:
> + *
> + * struct elf_note {
> + * u32 n_namesz; --> sizeof(n_name[]); always (4)
> + * u32 n_ndescsz;--> sizeof(property[])
> + * u32 n_type; --> always NT_GNU_PROPERTY_TYPE_0
> + * };
> + * char n_name[4]; --> always 'GNU\0'
> + *
> + * struct {
> + * struct property_x86 {
> + * u32 pr_type;
> + * u32 pr_datasz;
> + * };
> + * u8 pr_data[pr_datasz];
> + * }[];
> + */
> +
> +#define BUF_SIZE (PAGE_SIZE / 4)
> +
> +struct property_x86 {
> + u32 pr_type;
> + u32 pr_datasz;
> +};
> +
> +typedef bool (test_fn)(void *buf, u32 *arg);
> +typedef void *(next_fn)(void *buf, u32 *arg);
> +
> +static inline bool test_note_type_0(void *buf, u32 *arg)
> +{
> + struct elf_note *n = buf;
> +
> + return ((n->n_namesz == 4) && (memcmp(n + 1, "GNU", 4) == 0) &&
> + (n->n_type == NT_GNU_PROPERTY_TYPE_0));
> +}
> +
> +static inline void *next_note(void *buf, u32 *arg)
> +{
> + struct elf_note *n = buf;
> + u32 align = *arg;
> + int size;
> +
> + size = round_up(sizeof(*n) + n->n_namesz, align);
> + size = round_up(size + n->n_descsz, align);
> +
> + if (buf + size < buf)
> + return NULL;
> + else
> + return (buf + size);
> +}
> +
> +static inline bool test_property_x86(void *buf, u32 *arg)
> +{
> + struct property_x86 *pr = buf;
> + u32 max_type = *arg;
> +
> + if (pr->pr_type > max_type)
> + *arg = pr->pr_type;
> +
> + return (pr->pr_type == GNU_PROPERTY_X86_FEATURE_1_AND);
> +}
> +
> +static inline void *next_property(void *buf, u32 *arg)
> +{
> + struct property_x86 *pr = buf;
> + u32 max_type = *arg;
> +
> + if ((buf + sizeof(*pr) + pr->pr_datasz < buf) ||
> + (pr->pr_type > GNU_PROPERTY_X86_FEATURE_1_AND) ||
> + (pr->pr_type > max_type))
> + return NULL;
> + else
> + return (buf + sizeof(*pr) + pr->pr_datasz);
> +}
> +
> +/*
> + * Scan 'buf' for a pattern; return true if found.
> + * *pos is the distance from the beginning of buf to where
> + * the searched item or the next item is located.
> + */
> +static int scan(u8 *buf, u32 buf_size, int item_size,
> + test_fn test, next_fn next, u32 *arg, u32 *pos)
> +{
> + int found = 0;
> + u8 *p, *max;
> +
> + max = buf + buf_size;
> + if (max < buf)
> + return 0;
> +
> + p = buf;
> +
> + while ((p + item_size < max) && (p + item_size > buf)) {
> + if (test(p, arg)) {
> + found = 1;
> + break;
> + }
> +
> + p = next(p, arg);
> + }
> +
> + *pos = (p + item_size <= buf) ? 0 : (u32)(p - buf);
> + return found;
> +}
> +
> +/*
> + * Search a NT_GNU_PROPERTY_TYPE_0 for GNU_PROPERTY_X86_FEATURE_1_AND.
> + */
> +static int find_feature_x86(struct file *file, unsigned long desc_size,
> + loff_t file_offset, u8 *buf, u32 *feature)
> +{
> + u32 buf_pos;
> + unsigned long read_size;
> + unsigned long done;
> + int found = 0;
> + int ret = 0;
> + u32 last_pr = 0;
> +
> + *feature = 0;
> + buf_pos = 0;
> +
> + for (done = 0; done < desc_size; done += buf_pos) {
> + read_size = desc_size - done;
> + if (read_size > BUF_SIZE)
> + read_size = BUF_SIZE;
> +
> + ret = kernel_read(file, buf, read_size, &file_offset);
> +
> + if (ret != read_size)
> + return (ret < 0) ? ret : -EIO;
> +
> + ret = 0;
> + found = scan(buf, read_size, sizeof(struct property_x86),
> + test_property_x86, next_property,
> + &last_pr, &buf_pos);
> +
> + if ((!buf_pos) || found)
> + break;
> +
> + file_offset += buf_pos - read_size;
> + }
> +
> + if (found) {
> + struct property_x86 *pr =
> + (struct property_x86 *)(buf + buf_pos);
> +
> + if (pr->pr_datasz == 4) {
> + u32 *max = (u32 *)(buf + read_size);
> + u32 *data = (u32 *)((u8 *)pr + sizeof(*pr));
> +
> + if (data + 1 <= max) {
> + *feature = *data;
> + } else {
> + file_offset += buf_pos - read_size;
> + file_offset += sizeof(*pr);
> + ret = kernel_read(file, feature, 4,
> + &file_offset);
> + }
> + }
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * Search a PT_NOTE segment for the first NT_GNU_PROPERTY_TYPE_0.
> + */
> +static int find_note_type_0(struct file *file, unsigned long note_size,
> + loff_t file_offset, u32 align, u32 *feature)
> +{
> + u8 *buf;
> + u32 buf_pos;
> + unsigned long read_size;
> + unsigned long done;
> + int found = 0;
> + int ret = 0;
> +
> + buf = kmalloc(BUF_SIZE, GFP_KERNEL);
> + if (!buf)
> + return -ENOMEM;
> +
> + *feature = 0;
> + buf_pos = 0;
> +
> + for (done = 0; done < note_size; done += buf_pos) {
> + read_size = note_size - done;
> + if (read_size > BUF_SIZE)
> + read_size = BUF_SIZE;
> +
> + ret = kernel_read(file, buf, read_size, &file_offset);
> +
> + if (ret != read_size) {
> + ret = (ret < 0) ? ret : -EIO;
> + kfree(buf);
> + return ret;
> + }
> +
> + /*
> + * item_size = sizeof(struct elf_note) + elf_note.n_namesz.
> + * n_namesz is 4 for the note type we look for.
> + */
> + ret = 0;
> + found += scan(buf, read_size, sizeof(struct elf_note) + 4,
> + test_note_type_0, next_note,
> + &align, &buf_pos);
> +
> + file_offset += buf_pos - read_size;
> +
> + if (found == 1) {
> + struct elf_note *n =
> + (struct elf_note *)(buf + buf_pos);
> + u32 start = round_up(sizeof(*n) + n->n_namesz, align);
> + u32 total = round_up(start + n->n_descsz, align);
> +
> + ret = find_feature_x86(file, n->n_descsz,
> + file_offset + start,
> + buf, feature);
> + file_offset += total;
> + buf_pos += total;
> + } else if (!buf_pos) {
> + *feature = 0;
> + break;
> + }
> + }
> +
> + kfree(buf);
> + return ret;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static int check_notes_32(struct file *file, struct elf32_phdr *phdr,
> + int phnum, u32 *feature)
> +{
> + int i;
> + int err = 0;
> +
> + for (i = 0; i < phnum; i++, phdr++) {
> + if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 4))
> + continue;
> +
> + err = find_note_type_0(file, phdr->p_filesz, phdr->p_offset,
> + phdr->p_align, feature);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +#endif
> +
> +#ifdef CONFIG_X86_64
> +static int check_notes_64(struct file *file, struct elf64_phdr *phdr,
> + int phnum, u32 *feature)
> +{
> + int i;
> + int err = 0;
> +
> + for (i = 0; i < phnum; i++, phdr++) {
> + if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 8))
> + continue;
> +
> + err = find_note_type_0(file, phdr->p_filesz, phdr->p_offset,
> + phdr->p_align, feature);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +#endif
> +
> +int arch_setup_features(void *ehdr_p, void *phdr_p,
> + struct file *file, bool interp)
> +{
> + int err = 0;
> + u32 feature = 0;
> +
> + struct elf64_hdr *ehdr64 = ehdr_p;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> + return 0;
> +
> + if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
> + struct elf64_phdr *phdr64 = phdr_p;
> +
> + err = check_notes_64(file, phdr64, ehdr64->e_phnum,
> + &feature);
> + if (err < 0)
> + goto out;
> + } else {
> +#ifdef CONFIG_COMPAT
> + struct elf32_hdr *ehdr32 = ehdr_p;
> +
> + if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
> + struct elf32_phdr *phdr32 = phdr_p;
> +
> + err = check_notes_32(file, phdr32, ehdr32->e_phnum,
> + &feature);
> + if (err < 0)
> + goto out;
> + }
> +#endif
> + }
> +
> + memset(&current->thread.cet, 0, sizeof(struct cet_status));
> +
> + if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
> + if (feature & GNU_PROPERTY_X86_FEATURE_1_SHSTK) {
> + err = cet_setup_shstk();
> + if (err < 0)
> + goto out;
> + }
> + }
> +
> +out:
> + return err;
> +}
There's a lot of similar code in the bpf stackmap .build-id code (commit
v4.17-rc1~148^2~156^2~3^2~1); it might be worth generalising some ELF
traversal routines, since there's a general need for parsing ELF property
segments.
On Tue, 2018-10-02 at 19:15 +0200, Borislav Petkov wrote:
> On Fri, Sep 21, 2018 at 08:03:27AM -0700, Yu-cheng Yu wrote:
> >
> > diff --git a/arch/x86/include/asm/fpu/xstate.h
> > b/arch/x86/include/asm/fpu/xstate.h
> > index 9b382e5157ed..a32dc5f8c963 100644
> > --- a/arch/x86/include/asm/fpu/xstate.h
> > +++ b/arch/x86/include/asm/fpu/xstate.h
> > @@ -19,10 +19,10 @@
> > #define XSAVE_YMM_SIZE 256
> > #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
> >
> > -/* System features */
> > -#define XFEATURE_MASK_SYSTEM (XFEATURE_MASK_PT)
>
> Previous patch renames it, this patch deletes it. Why do we need all
> that unnecessary churn?
>
> Also, this patch is trying to do a couple of things at once and
> reviewing it is not trivial. Please split the changes logically.
Yes, if we leave XFEATURE_MASK_SUPERVISOR unchanged in the previous patch, this
patch becomes much simpler. Perhaps we don't even need to split this one.
> > diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> > index 19f8df54c72a..dd2c561c4544 100644
> > --- a/arch/x86/kernel/fpu/xstate.c
> > +++ b/arch/x86/kernel/fpu/xstate.c
> > @@ -51,13 +51,16 @@ static short xsave_cpuid_features[] __initdata = {
> > };
> >
> > /*
> > - * Mask of xstate features supported by the CPU and the kernel:
> > + * Mask of xstate features supported by the CPU and the kernel.
> > + * This is the result from CPUID query, SUPPORTED_XFEATURES_MASK,
> > + * and boot_cpu_has().
> > */
>
> This needs to explain what both masks are - user and system. "CPU" and
> "kernel" is not "user" and "all".
>
> > u64 xfeatures_mask_user __read_mostly;
> > +u64 xfeatures_mask_all __read_mostly;
The first one is all supported "user" states; the latter is "system" and "user"
states combined. I will put in comments.
Yu-cheng
On Thu, 2018-10-04 at 01:27 +0200, Eugene Syromiatnikov wrote:
> On Fri, Sep 21, 2018 at 08:03:45AM -0700, Yu-cheng Yu wrote:
[...]
> > diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> > index 0d157d2a1e2a..5b5f169c5c07 100644
> > --- a/arch/x86/include/asm/elf.h
> > +++ b/arch/x86/include/asm/elf.h
> > @@ -382,4 +382,9 @@ struct va_alignment {
> >
> > extern struct va_alignment va_align;
> > extern unsigned long align_vdso_addr(unsigned long);
> > +
> > +#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
> > +extern int arch_setup_features(void *ehdr, void *phdr, struct file *file,
> > + bool interp);
> > +#endif
> > #endif /* _ASM_X86_ELF_H */
> > diff --git a/arch/x86/include/uapi/asm/elf_property.h
> > b/arch/x86/include/uapi/asm/elf_property.h
> > new file mode 100644
> > index 000000000000..af361207718c
> > --- /dev/null
> > +++ b/arch/x86/include/uapi/asm/elf_property.h
> > @@ -0,0 +1,15 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _UAPI_ASM_X86_ELF_PROPERTY_H
> > +#define _UAPI_ASM_X86_ELF_PROPERTY_H
> > +
> > +/*
> > + * pr_type
> > + */
> > +#define GNU_PROPERTY_X86_FEATURE_1_AND (0xc0000002)
> > +
> > +/*
> > + * Bits for GNU_PROPERTY_X86_FEATURE_1_AND
> > + */
> > +#define GNU_PROPERTY_X86_FEATURE_1_SHSTK (0x00000002)
>
> Hm, these defeinitions aren't much different comparing to NT_*
> definitions in include/uapi/linux/elf.h, is it expected that those
> properties have to be parsed individually for each architecture?
Yes, we have NT_GNU_PROPERTY_TYPE_0 defined in include/uapi/linux/elf.h.
GNU_PROPERTY_X86_FEATURE_1_xxxx is for X86 only.
[...]
>
> There's a lot of similar code with bpf stackmap .build-id code (commit
> v4.17-rc1~148^2~156^2~3^2~1), it might be worthy generalising some ELF
> traversal routines, since there's general need of parsing ELF property
> segments.
Only a small similarity exists. The routine find_note_type_0() does a lot more
validation. It appears stack_map_get_build_id() does not need that.
Yu-cheng
On Fri, Sep 21, 2018 at 8:03 AM, Yu-cheng Yu <[email protected]> wrote:
> Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
> to be enabled for the task.
Ah, I've been wanting this for other things too (see below).
>
> Signed-off-by: H.J. Lu <[email protected]>
> Signed-off-by: Yu-cheng Yu <[email protected]>
> ---
> arch/x86/Kconfig | 4 +
> arch/x86/include/asm/elf.h | 5 +
> arch/x86/include/uapi/asm/elf_property.h | 15 +
> arch/x86/kernel/Makefile | 2 +
> arch/x86/kernel/elf.c | 340 +++++++++++++++++++++++
> fs/binfmt_elf.c | 15 +
> include/uapi/linux/elf.h | 1 +
> 7 files changed, 382 insertions(+)
> create mode 100644 arch/x86/include/uapi/asm/elf_property.h
> create mode 100644 arch/x86/kernel/elf.c
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 808aa3aecf3c..6377125543cc 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1919,12 +1919,16 @@ config X86_INTEL_CET
> config ARCH_HAS_SHSTK
> def_bool n
>
> +config ARCH_HAS_PROGRAM_PROPERTIES
> + def_bool n
> +
> config X86_INTEL_SHADOW_STACK_USER
> prompt "Intel Shadow Stack for user-mode"
> def_bool n
> depends on CPU_SUP_INTEL && X86_64
> select X86_INTEL_CET
> select ARCH_HAS_SHSTK
> + select ARCH_HAS_PROGRAM_PROPERTIES
> ---help---
> Shadow stack provides hardware protection against program stack
> corruption. Only when all the following are true will an application
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 0d157d2a1e2a..5b5f169c5c07 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -382,4 +382,9 @@ struct va_alignment {
>
> extern struct va_alignment va_align;
> extern unsigned long align_vdso_addr(unsigned long);
> +
> +#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
> +extern int arch_setup_features(void *ehdr, void *phdr, struct file *file,
> + bool interp);
> +#endif
> #endif /* _ASM_X86_ELF_H */
> diff --git a/arch/x86/include/uapi/asm/elf_property.h b/arch/x86/include/uapi/asm/elf_property.h
> new file mode 100644
> index 000000000000..af361207718c
> --- /dev/null
> +++ b/arch/x86/include/uapi/asm/elf_property.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_ASM_X86_ELF_PROPERTY_H
> +#define _UAPI_ASM_X86_ELF_PROPERTY_H
> +
> +/*
> + * pr_type
> + */
> +#define GNU_PROPERTY_X86_FEATURE_1_AND (0xc0000002)
> +
> +/*
> + * Bits for GNU_PROPERTY_X86_FEATURE_1_AND
> + */
> +#define GNU_PROPERTY_X86_FEATURE_1_SHSTK (0x00000002)
> +
> +#endif /* _UAPI_ASM_X86_ELF_PROPERTY_H */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index fbb2d91fb756..36b14ef410c8 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -141,6 +141,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
>
> obj-$(CONFIG_X86_INTEL_CET) += cet.o
>
> +obj-$(CONFIG_ARCH_HAS_PROGRAM_PROPERTIES) += elf.o
> +
> ###
> # 64 bit specific files
> ifeq ($(CONFIG_X86_64),y)
> diff --git a/arch/x86/kernel/elf.c b/arch/x86/kernel/elf.c
> new file mode 100644
> index 000000000000..2fddd0bc545b
> --- /dev/null
> +++ b/arch/x86/kernel/elf.c
> @@ -0,0 +1,340 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Look at an ELF file's .note.gnu.property and determine if the file
> + * supports shadow stack and/or indirect branch tracking.
> + * The path from the ELF header to the note section is the following:
> + * elfhdr->elf_phdr->elf_note->property[].
> + */
> +
> +#include <asm/cet.h>
> +#include <asm/elf_property.h>
> +#include <asm/prctl.h>
> +#include <asm/processor.h>
> +#include <uapi/linux/elf-em.h>
> +#include <uapi/linux/prctl.h>
> +#include <linux/binfmts.h>
> +#include <linux/elf.h>
> +#include <linux/slab.h>
> +#include <linux/fs.h>
> +#include <linux/uaccess.h>
> +#include <linux/string.h>
> +#include <linux/compat.h>
> +
> +/*
> + * The .note.gnu.property layout:
> + *
> + * struct elf_note {
> + * u32 n_namesz; --> sizeof(n_name[]); always (4)
> + * u32 n_ndescsz;--> sizeof(property[])
> + * u32 n_type; --> always NT_GNU_PROPERTY_TYPE_0
> + * };
> + * char n_name[4]; --> always 'GNU\0'
> + *
> + * struct {
> + * struct property_x86 {
> + * u32 pr_type;
> + * u32 pr_datasz;
> + * };
> + * u8 pr_data[pr_datasz];
> + * }[];
> + */
Does NT_GNU_PROPERTY_TYPE_0 only ever contain property_x86 bytes? (I
assume not, since there is a pr_type?)
> +
> +#define BUF_SIZE (PAGE_SIZE / 4)
> +
> +struct property_x86 {
> + u32 pr_type;
> + u32 pr_datasz;
> +};
> +
> +typedef bool (test_fn)(void *buf, u32 *arg);
> +typedef void *(next_fn)(void *buf, u32 *arg);
> +
> +static inline bool test_note_type_0(void *buf, u32 *arg)
> +{
> + struct elf_note *n = buf;
> +
> + return ((n->n_namesz == 4) && (memcmp(n + 1, "GNU", 4) == 0) &&
> + (n->n_type == NT_GNU_PROPERTY_TYPE_0));
Cheaper to test n_type first...
> +}
> +
> +static inline void *next_note(void *buf, u32 *arg)
> +{
> + struct elf_note *n = buf;
> + u32 align = *arg;
> + int size;
> +
> + size = round_up(sizeof(*n) + n->n_namesz, align);
I think this could overflow: n_namesz can be u64 for elf64_note.
> + size = round_up(size + n->n_descsz, align);
Same here. You may want to use check_add_overflow(), etc, and u64 types.
> +
> + if (buf + size < buf)
> + return NULL;
I don't understand this. You want to check size not exceeding the
allocation, which isn't passed into this function. Checking for a full
unsigned address wrap around is not sufficient to detect overflow.
> + else
> + return (buf + size);
> +}
> +
> +static inline bool test_property_x86(void *buf, u32 *arg)
> +{
> + struct property_x86 *pr = buf;
> + u32 max_type = *arg;
> +
> + if (pr->pr_type > max_type)
> + *arg = pr->pr_type;
Why is *arg being updated? I don't see last_pr used outside of here --
are properties required to be pr_type-ordered?
> +
> + return (pr->pr_type == GNU_PROPERTY_X86_FEATURE_1_AND);
> +}
> +
> +static inline void *next_property(void *buf, u32 *arg)
> +{
> + struct property_x86 *pr = buf;
> + u32 max_type = *arg;
> +
> + if ((buf + sizeof(*pr) + pr->pr_datasz < buf) ||
Again, this "< buf" test doesn't look at all correct to me.
> + (pr->pr_type > GNU_PROPERTY_X86_FEATURE_1_AND) ||
> + (pr->pr_type > max_type))
> + return NULL;
> + else
> + return (buf + sizeof(*pr) + pr->pr_datasz);
> +}
> +
> +/*
> + * Scan 'buf' for a pattern; return true if found.
> + * *pos is the distance from the beginning of buf to where
> + * the searched item or the next item is located.
> + */
> +static int scan(u8 *buf, u32 buf_size, int item_size,
> + test_fn test, next_fn next, u32 *arg, u32 *pos)
I'm not a fan of the short "scan", "test" and "next" names, and I
really don't like an arg named "arg". Something slightly more
descriptive for all of these would be nice, please.
> +{
> + int found = 0;
> + u8 *p, *max;
> +
> + max = buf + buf_size;
> + if (max < buf)
> + return 0;
> +
> + p = buf;
> +
> + while ((p + item_size < max) && (p + item_size > buf)) {
These comparisons are safe due to the BUF_SIZE limit of buf_size and
the only used size of item_size, but if this becomes more generic, it
should be more defensive on the size calculations (e.g. make sure that
"item_size < max" and then here "p < max - item_size", etc).
I'd kind of rather this code walked the base type and check each for
the matching feature. What is the general specification for what
NT_GNU_PROPERTY_TYPE_0 contains?
> + if (test(p, arg)) {
> + found = 1;
> + break;
> + }
> +
> + p = next(p, arg);
> + }
> +
> + *pos = (p + item_size <= buf) ? 0 : (u32)(p - buf);
> + return found;
> +}
> +
> +/*
> + * Search a NT_GNU_PROPERTY_TYPE_0 for GNU_PROPERTY_X86_FEATURE_1_AND.
> + */
> +static int find_feature_x86(struct file *file, unsigned long desc_size,
> + loff_t file_offset, u8 *buf, u32 *feature)
> +{
> + u32 buf_pos;
> + unsigned long read_size;
> + unsigned long done;
> + int found = 0;
> + int ret = 0;
> + u32 last_pr = 0;
> +
> + *feature = 0;
> + buf_pos = 0;
> +
> + for (done = 0; done < desc_size; done += buf_pos) {
> + read_size = desc_size - done;
> + if (read_size > BUF_SIZE)
> + read_size = BUF_SIZE;
> +
> + ret = kernel_read(file, buf, read_size, &file_offset);
> +
> + if (ret != read_size)
> + return (ret < 0) ? ret : -EIO;
> +
> + ret = 0;
> + found = scan(buf, read_size, sizeof(struct property_x86),
> + test_property_x86, next_property,
> + &last_pr, &buf_pos);
> +
> + if ((!buf_pos) || found)
> + break;
> +
> + file_offset += buf_pos - read_size;
> + }
> +
> + if (found) {
> + struct property_x86 *pr =
> + (struct property_x86 *)(buf + buf_pos);
> +
> + if (pr->pr_datasz == 4) {
> + u32 *max = (u32 *)(buf + read_size);
> + u32 *data = (u32 *)((u8 *)pr + sizeof(*pr));
> +
> + if (data + 1 <= max) {
> + *feature = *data;
> + } else {
> + file_offset += buf_pos - read_size;
> + file_offset += sizeof(*pr);
> + ret = kernel_read(file, feature, 4,
> + &file_offset);
> + }
> + }
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * Search a PT_NOTE segment for the first NT_GNU_PROPERTY_TYPE_0.
> + */
> +static int find_note_type_0(struct file *file, unsigned long note_size,
> + loff_t file_offset, u32 align, u32 *feature)
> +{
> + u8 *buf;
> + u32 buf_pos;
> + unsigned long read_size;
> + unsigned long done;
> + int found = 0;
> + int ret = 0;
> +
> + buf = kmalloc(BUF_SIZE, GFP_KERNEL);
> + if (!buf)
> + return -ENOMEM;
Why kmalloc over stack variable? (Or, does BUF_SIZE here really need
to be 1024?)
> +
> + *feature = 0;
> + buf_pos = 0;
> +
> + for (done = 0; done < note_size; done += buf_pos) {
> + read_size = note_size - done;
> + if (read_size > BUF_SIZE)
> + read_size = BUF_SIZE;
> +
> + ret = kernel_read(file, buf, read_size, &file_offset);
> +
> + if (ret != read_size) {
> + ret = (ret < 0) ? ret : -EIO;
> + kfree(buf);
> + return ret;
> + }
> +
> + /*
> + * item_size = sizeof(struct elf_note) + elf_note.n_namesz.
> + * n_namesz is 4 for the note type we look for.
> + */
> + ret = 0;
> + found += scan(buf, read_size, sizeof(struct elf_note) + 4,
> + test_note_type_0, next_note,
> + &align, &buf_pos);
> +
> + file_offset += buf_pos - read_size;
> +
> + if (found == 1) {
> + struct elf_note *n =
> + (struct elf_note *)(buf + buf_pos);
> + u32 start = round_up(sizeof(*n) + n->n_namesz, align);
> + u32 total = round_up(start + n->n_descsz, align);
Same overflow notes from earlier...
> +
> + ret = find_feature_x86(file, n->n_descsz,
> + file_offset + start,
> + buf, feature);
> + file_offset += total;
> + buf_pos += total;
> + } else if (!buf_pos) {
> + *feature = 0;
> + break;
> + }
> + }
> +
> + kfree(buf);
> + return ret;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static int check_notes_32(struct file *file, struct elf32_phdr *phdr,
> + int phnum, u32 *feature)
> +{
> + int i;
> + int err = 0;
> +
> + for (i = 0; i < phnum; i++, phdr++) {
> + if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 4))
> + continue;
> +
> + err = find_note_type_0(file, phdr->p_filesz, phdr->p_offset,
> + phdr->p_align, feature);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +#endif
> +
> +#ifdef CONFIG_X86_64
> +static int check_notes_64(struct file *file, struct elf64_phdr *phdr,
> + int phnum, u32 *feature)
> +{
> + int i;
> + int err = 0;
> +
> + for (i = 0; i < phnum; i++, phdr++) {
> + if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 8))
> + continue;
Instead of a separate parser here, wouldn't it be a bit nicer to
attach this to the existing binfmt_elf program header parsing loop:
elf_ppnt = elf_phdata;
for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
case PT_GNU_STACK:
...
case PT_LOPROC ... PT_HIPROC:
...
> +
> + err = find_note_type_0(file, phdr->p_filesz, phdr->p_offset,
> + phdr->p_align, feature);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +#endif
> +
> +int arch_setup_features(void *ehdr_p, void *phdr_p,
> + struct file *file, bool interp)
> +{
> + int err = 0;
> + u32 feature = 0;
> +
> + struct elf64_hdr *ehdr64 = ehdr_p;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> + return 0;
> +
> + if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
> + struct elf64_phdr *phdr64 = phdr_p;
> +
> + err = check_notes_64(file, phdr64, ehdr64->e_phnum,
> + &feature);
> + if (err < 0)
> + goto out;
> + } else {
> +#ifdef CONFIG_COMPAT
> + struct elf32_hdr *ehdr32 = ehdr_p;
> +
> + if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
> + struct elf32_phdr *phdr32 = phdr_p;
> +
> + err = check_notes_32(file, phdr32, ehdr32->e_phnum,
> + &feature);
> + if (err < 0)
> + goto out;
> + }
> +#endif
Should there be an #else error here?
> + }
> +
> + memset(&current->thread.cet, 0, sizeof(struct cet_status));
> +
> + if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
The CPU feature was already tested at arch_setup_features() entry.
> + if (feature & GNU_PROPERTY_X86_FEATURE_1_SHSTK) {
> + err = cet_setup_shstk();
> + if (err < 0)
> + goto out;
> + }
> + }
> +
> +out:
> + return err;
> +}
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index efae2fb0930a..b891aa292b46 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -1081,6 +1081,21 @@ static int load_elf_binary(struct linux_binprm *bprm)
> goto out_free_dentry;
> }
>
> +#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
> + if (interpreter) {
> + retval = arch_setup_features(&loc->interp_elf_ex,
> + interp_elf_phdata,
> + interpreter, true);
> + } else {
> + retval = arch_setup_features(&loc->elf_ex,
> + elf_phdata,
> + bprm->file, false);
> + }
> +
> + if (retval < 0)
> + goto out_free_dentry;
> +#endif
> +
> if (elf_interpreter) {
> unsigned long interp_map_addr = 0;
>
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index c5358e0ae7c5..5ef25a565e88 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -372,6 +372,7 @@ typedef struct elf64_shdr {
> #define NT_PRFPREG 2
> #define NT_PRPSINFO 3
> #define NT_TASKSTRUCT 4
> +#define NT_GNU_PROPERTY_TYPE_0 5
> #define NT_AUXV 6
> /*
> * Note to userspace developers: size of NT_SIGINFO note may increase
> --
> 2.17.1
>
I'd like to be using this code for a few other cases too (not just
x86-specific). For example, for marking KASan binaries as needing a
"legacy" memory layout[1]. Others might be setting things like
no_new_privs at exec time, etc.
-Kees
[1] https://lkml.kernel.org/r/CAGXu5jL1HRG7Dn9vraw8Hu7LF+69k3EDpztt1Ju7ijEzmvRdhA@mail.gmail.com
--
Kees Cook
Pixel Security
On Mon, 2018-10-15 at 16:40 -0700, Kees Cook wrote:
> On Fri, Sep 21, 2018 at 8:03 AM, Yu-cheng Yu <[email protected]> wrote:
> > Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
> > to be enabled for the task.
[...]
> > +/*
> > + * The .note.gnu.property layout:
> > + *
> > + * struct elf_note {
> > + * u32 n_namesz; --> sizeof(n_name[]); always (4)
> > + * u32 n_ndescsz;--> sizeof(property[])
> > + * u32 n_type; --> always NT_GNU_PROPERTY_TYPE_0
> > + * };
> > + * char n_name[4]; --> always 'GNU\0'
> > + *
> > + * struct {
> > + * struct property_x86 {
> > + * u32 pr_type;
> > + * u32 pr_datasz;
> > + * };
> > + * u8 pr_data[pr_datasz];
> > + * }[];
> > + */
>
> Does NT_GNU_PROPERTY_TYPE_0 only ever contain property_x86 bytes? (I
> assume not, since there is a pr_type?)
There are other property types, but we only look for NT_GNU_PROPERTY_TYPE_0.
> > +
> > +#define BUF_SIZE (PAGE_SIZE / 4)
> > +
> > +struct property_x86 {
> > + u32 pr_type;
> > + u32 pr_datasz;
> > +};
> > +
> > +typedef bool (test_fn)(void *buf, u32 *arg);
> > +typedef void *(next_fn)(void *buf, u32 *arg);
> > +
> > +static inline bool test_note_type_0(void *buf, u32 *arg)
> > +{
> > + struct elf_note *n = buf;
> > +
> > + return ((n->n_namesz == 4) && (memcmp(n + 1, "GNU", 4) == 0) &&
> > + (n->n_type == NT_GNU_PROPERTY_TYPE_0));
>
> Cheaper to test n_type first...
Yes, Thanks!
>
> > +}
> > +
> > +static inline void *next_note(void *buf, u32 *arg)
> > +{
> > + struct elf_note *n = buf;
> > + u32 align = *arg;
> > + int size;
> > +
> > + size = round_up(sizeof(*n) + n->n_namesz, align);
>
> I think this could overflow: n_namesz can be u64 for elf64_note.
>
> > + size = round_up(size + n->n_descsz, align);
>
> Same here. You may want to use check_add_overflow(), etc, an u64 types.
Note->n_namesz is always a four-byte field. I should have used u32.
>
> > +
> > + if (buf + size < buf)
> > + return NULL;
>
> I don't understand this. You want to check size not exceeding the
> allocation, which isn't passed into this function. Checking for a full
> unsigned address wrap around is not sufficient to detect overflow.
Here we only detect the wrap-around. After this returns, we then check other
types of overflow in scan().
>
> > + else
> > + return (buf + size);
> > +}
> > +
> > +static inline bool test_property_x86(void *buf, u32 *arg)
> > +{
> > + struct property_x86 *pr = buf;
> > + u32 max_type = *arg;
> > +
> > + if (pr->pr_type > max_type)
> > + *arg = pr->pr_type;
>
> Why is *arg being updated? I don't see last_pr used outside of here --
> are properties required to be pr_type-ordered?
Yes, they need to be in ascending order.
>
> > +
> > + return (pr->pr_type == GNU_PROPERTY_X86_FEATURE_1_AND);
> > +}
> > +
> > +static inline void *next_property(void *buf, u32 *arg)
> > +{
> > + struct property_x86 *pr = buf;
> > + u32 max_type = *arg;
> > +
> > + if ((buf + sizeof(*pr) + pr->pr_datasz < buf) ||
>
> Again, this "< buf" test doesn't look at all correct to me.
>
> > + (pr->pr_type > GNU_PROPERTY_X86_FEATURE_1_AND) ||
> > + (pr->pr_type > max_type))
> > + return NULL;
> > + else
> > + return (buf + sizeof(*pr) + pr->pr_datasz);
> > +}
> > +
> > +/*
> > + * Scan 'buf' for a pattern; return true if found.
> > + * *pos is the distance from the beginning of buf to where
> > + * the searched item or the next item is located.
> > + */
> > +static int scan(u8 *buf, u32 buf_size, int item_size,
> > + test_fn test, next_fn next, u32 *arg, u32 *pos)
>
> I'm not a fan of the short "scan", "test" and "next" names, and I
> really don't like an arg named "arg". Something slightly more
> descriptive for all of these would be nice, please.
I need to work on that :-) What would you suggest?
>
> > +{
> > + int found = 0;
> > + u8 *p, *max;
> > +
> > + max = buf + buf_size;
> > + if (max < buf)
> > + return 0;
> > +
> > + p = buf;
> > +
> > + while ((p + item_size < max) && (p + item_size > buf)) {
>
> These comparisons are safe due to the BUF_SIZE limit of buf_size and
> the only used size of item_size, but if this becomes more generic, it
> should be more defensive on the size calculations (e.g. make sure than
> "item_size < max" and then here "p < max - item_size", etc).
>
> I'd kind of rather this code walked the base type and check each for
> the matching feature. What is the general specification for what
> NT_GNU_PROPERTY_TYPE_0 contains?
There are other property types, but the kernel does not look at most of them.
If the kernel needs to look at others, we need to rewrite this.
[...]
> > +
> > +/*
> > + * Search a PT_NOTE segment for the first NT_GNU_PROPERTY_TYPE_0.
> > + */
> > +static int find_note_type_0(struct file *file, unsigned long note_size,
> > + loff_t file_offset, u32 align, u32 *feature)
> > +{
> > + u8 *buf;
> > + u32 buf_pos;
> > + unsigned long read_size;
> > + unsigned long done;
> > + int found = 0;
> > + int ret = 0;
> > +
> > + buf = kmalloc(BUF_SIZE, GFP_KERNEL);
> > + if (!buf)
> > + return -ENOMEM;
>
> Why kmalloc over stack variable? (Or, does BUF_SIZE here really need
> to be 1024?)
BUF_SIZE can be smaller, for example 64. If it is too small, we need to do
kernel_read() too often.
>
> > +
> > + *feature = 0;
> > + buf_pos = 0;
> > +
> > + for (done = 0; done < note_size; done += buf_pos) {
> > + read_size = note_size - done;
> > + if (read_size > BUF_SIZE)
> > + read_size = BUF_SIZE;
> > +
> > + ret = kernel_read(file, buf, read_size, &file_offset);
> > +
> > + if (ret != read_size) {
> > + ret = (ret < 0) ? ret : -EIO;
> > + kfree(buf);
> > + return ret;
> > + }
> > +
> > + /*
> > + * item_size = sizeof(struct elf_note) + elf_note.n_namesz.
> > + * n_namesz is 4 for the note type we look for.
> > + */
> > + ret = 0;
> > + found += scan(buf, read_size, sizeof(struct elf_note) + 4,
> > + test_note_type_0, next_note,
> > + &align, &buf_pos);
> > +
> > + file_offset += buf_pos - read_size;
> > +
> > + if (found == 1) {
> > + struct elf_note *n =
> > + (struct elf_note *)(buf + buf_pos);
> > + u32 start = round_up(sizeof(*n) + n->n_namesz,
> > align);
> > + u32 total = round_up(start + n->n_descsz, align);
>
> Same overflow notes from earlier...
>
> > +
> > + ret = find_feature_x86(file, n->n_descsz,
> > + file_offset + start,
> > + buf, feature);
> > + file_offset += total;
> > + buf_pos += total;
> > + } else if (!buf_pos) {
> > + *feature = 0;
> > + break;
> > + }
> > + }
> > +
> > + kfree(buf);
> > + return ret;
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static int check_notes_32(struct file *file, struct elf32_phdr *phdr,
> > + int phnum, u32 *feature)
> > +{
> > + int i;
> > + int err = 0;
> > +
> > + for (i = 0; i < phnum; i++, phdr++) {
> > + if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 4))
> > + continue;
> > +
> > + err = find_note_type_0(file, phdr->p_filesz, phdr->p_offset,
> > + phdr->p_align, feature);
> > + if (err)
> > + return err;
> > + }
> > +
> > + return 0;
> > +}
> > +#endif
> > +
> > +#ifdef CONFIG_X86_64
> > +static int check_notes_64(struct file *file, struct elf64_phdr *phdr,
> > + int phnum, u32 *feature)
> > +{
> > + int i;
> > + int err = 0;
> > +
> > + for (i = 0; i < phnum; i++, phdr++) {
> > + if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 8))
> > + continue;
>
> Instead of a separate parser here, wouldn't it be a bit nicer to
> attach this to the existing binfmt_elf program header parsing loop:
We need to wait until SET_PERSONALITY2() is done.
[...]
> > +int arch_setup_features(void *ehdr_p, void *phdr_p,
> > + struct file *file, bool interp)
> > +{
> > + int err = 0;
> > + u32 feature = 0;
> > +
> > + struct elf64_hdr *ehdr64 = ehdr_p;
> > +
> > + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> > + return 0;
> > +
> > + if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
> > + struct elf64_phdr *phdr64 = phdr_p;
> > +
> > + err = check_notes_64(file, phdr64, ehdr64->e_phnum,
> > + &feature);
> > + if (err < 0)
> > + goto out;
> > + } else {
> > +#ifdef CONFIG_COMPAT
> > + struct elf32_hdr *ehdr32 = ehdr_p;
> > +
> > + if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
> > + struct elf32_phdr *phdr32 = phdr_p;
> > +
> > + err = check_notes_32(file, phdr32, ehdr32->e_phnum,
> > + &feature);
> > + if (err < 0)
> > + goto out;
> > + }
> > +#endif
>
> Should there be an #else error here?
Yes, thanks.
> I'd like to be using this code for a few other cases too (not just
> x86-specific). For example, for marking KASan binaries as needing a
> "legacy" memory layouts[1]. Others might be setting things like
> no_new_privs at exec time, etc.
If the item is a bit of GNU_PROPERTY_X86_FEATURE_1_AND, then this code would
work. Has it been finalized?
Yu-cheng