Version 2 of the speculative execution mitigation for s390. Changes since v1:
* The KVM patch to add the guest bpb feature already went upstream.
* Dropped the patch to introduce the gmb barrier to defend against spectre
variant 1 until the bikeshedding in regard to the naming is done.
* Switched from a system call to the PR_ISOLATE_BP process control to run
user space tasks with branch prediction isolation.
My main question is whether the prctl(PR_ISOLATE_BP) approach makes sense.
Martin Schwidefsky (5):
prctl: add PR_ISOLATE_BP process control
s390/alternative: use a copy of the facility bit mask
s390: add options to change branch prediction behaviour for the kernel
s390: define ISOLATE_BP to run tasks with modified branch prediction
s390: scrub registers on kernel entry and KVM exit
arch/s390/Kconfig | 17 +++++
arch/s390/include/asm/facility.h | 18 +++++
arch/s390/include/asm/lowcore.h | 3 +-
arch/s390/include/asm/processor.h | 4 ++
arch/s390/include/asm/thread_info.h | 4 ++
arch/s390/kernel/alternative.c | 26 ++++++-
arch/s390/kernel/early.c | 5 ++
arch/s390/kernel/entry.S | 134 +++++++++++++++++++++++++++++++++++-
arch/s390/kernel/ipl.c | 1 +
arch/s390/kernel/processor.c | 8 +++
arch/s390/kernel/setup.c | 4 +-
arch/s390/kernel/smp.c | 6 +-
include/uapi/linux/prctl.h | 8 +++
kernel/sys.c | 6 ++
14 files changed, 238 insertions(+), 6 deletions(-)
--
2.7.4
Add the PPA instruction to the system entry and exit path to switch
the kernel to a different branch prediction behaviour. The instructions
are added via CPU alternatives and can be disabled with the "nospec"
or "nobp=0" kernel parameters. If the default selected with
CONFIG_KERNEL_NOBP is "n", the "nobp=1" parameter can be used to
enable the modified kernel branch prediction.
Signed-off-by: Martin Schwidefsky <[email protected]>
---
arch/s390/Kconfig | 17 +++++++++++++
arch/s390/include/asm/processor.h | 1 +
arch/s390/kernel/alternative.c | 23 ++++++++++++++++++
arch/s390/kernel/early.c | 2 ++
arch/s390/kernel/entry.S | 50 ++++++++++++++++++++++++++++++++++++++-
arch/s390/kernel/ipl.c | 1 +
arch/s390/kernel/smp.c | 2 ++
7 files changed, 95 insertions(+), 1 deletion(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 829c679..7697fe2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -541,6 +541,23 @@ config ARCH_RANDOM
If unsure, say Y.
+config KERNEL_NOBP
+ def_bool n
+ prompt "Enable modified branch prediction for the kernel by default"
+ help
+ If this option is selected the kernel will switch to a modified
+ branch prediction mode if the firmware interface is available.
+ The modified branch prediction mode improves the behaviour in
+ regard to speculative execution.
+
+ With the option enabled the kernel parameter "nobp=0" or "nospec"
+ can be used to run the kernel in the normal branch prediction mode.
+
+ With the option disabled the modified branch prediction mode is
+ enabled with the "nobp=1" kernel parameter.
+
+ If unsure, say N.
+
endmenu
menu "Memory setup"
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index bfbfad4..5f37f9c 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -91,6 +91,7 @@ void cpu_detect_mhz_feature(void);
extern const struct seq_operations cpuinfo_op;
extern int sysctl_ieee_emulation_warnings;
extern void execve_tail(void);
+extern void __bpon(void);
/*
* User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 1abf4f3..2247613 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -15,6 +15,29 @@ static int __init disable_alternative_instructions(char *str)
early_param("noaltinstr", disable_alternative_instructions);
+static int __init nobp_setup_early(char *str)
+{
+ bool enabled;
+ int rc;
+
+ rc = kstrtobool(str, &enabled);
+ if (rc)
+ return rc;
+ if (enabled && test_facility(82))
+ __set_facility(82, S390_lowcore.alt_stfle_fac_list);
+ else
+ __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ return 0;
+}
+early_param("nobp", nobp_setup_early);
+
+static int __init nospec_setup_early(char *str)
+{
+ __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ return 0;
+}
+early_param("nospec", nospec_setup_early);
+
struct brcl_insn {
u16 opc;
s32 disp;
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 510f218..ac707a9 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -196,6 +196,8 @@ static noinline __init void setup_facility_list(void)
memcpy(S390_lowcore.alt_stfle_fac_list,
S390_lowcore.stfle_fac_list,
sizeof(S390_lowcore.alt_stfle_fac_list));
+ if (!IS_ENABLED(CONFIG_KERNEL_NOBP))
+ __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
}
static __init void detect_diag9c(void)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 9e5f6cd..dab716b 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -159,6 +159,34 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
tm off+\addr, \mask
.endm
+ .macro BPOFF
+ .pushsection .altinstr_replacement, "ax"
+660: .long 0xb2e8c000
+ .popsection
+661: .long 0x47000000
+ .pushsection .altinstructions, "a"
+ .long 661b - .
+ .long 660b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
+ .macro BPON
+ .pushsection .altinstr_replacement, "ax"
+662: .long 0xb2e8d000
+ .popsection
+663: .long 0x47000000
+ .pushsection .altinstructions, "a"
+ .long 663b - .
+ .long 662b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
.section .kprobes.text, "ax"
.Ldummy:
/*
@@ -171,6 +199,11 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
*/
nop 0
+ENTRY(__bpon)
+ .globl __bpon
+ BPON
+ br %r14
+
/*
* Scheduler resume function, called by switch_to
* gpr2 = (task_struct *) prev
@@ -226,8 +259,11 @@ ENTRY(sie64a)
jnz .Lsie_skip
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lsie_skip # exit if fp/vx regs changed
+ BPON
.Lsie_entry:
sie 0(%r14)
+.Lsie_exit:
+ BPOFF
.Lsie_skip:
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
@@ -273,6 +309,7 @@ ENTRY(system_call)
stpt __LC_SYNC_ENTER_TIMER
.Lsysc_stmg:
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
+ BPOFF
lg %r12,__LC_CURRENT
lghi %r13,__TASK_thread
lghi %r14,_PIF_SYSCALL
@@ -317,6 +354,7 @@ ENTRY(system_call)
jnz .Lsysc_work # check for work
TSTMSK __LC_CPU_FLAGS,_CIF_WORK
jnz .Lsysc_work
+ BPON
.Lsysc_restore:
lg %r14,__LC_VDSO_PER_CPU
lmg %r0,%r10,__PT_R0(%r11)
@@ -522,6 +560,7 @@ ENTRY(kernel_thread_starter)
ENTRY(pgm_check_handler)
stpt __LC_SYNC_ENTER_TIMER
+ BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
lg %r10,__LC_LAST_BREAK
lg %r12,__LC_CURRENT
@@ -620,6 +659,7 @@ ENTRY(pgm_check_handler)
ENTRY(io_int_handler)
STCK __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
lg %r12,__LC_CURRENT
larl %r13,cleanup_critical
@@ -660,9 +700,13 @@ ENTRY(io_int_handler)
lg %r14,__LC_VDSO_PER_CPU
lmg %r0,%r10,__PT_R0(%r11)
mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
+ tm __PT_PSW+1(%r11),0x01 # returning to user ?
+ jno .Lio_exit_kernel
+ BPON
.Lio_exit_timer:
stpt __LC_EXIT_TIMER
mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
+.Lio_exit_kernel:
lmg %r11,%r15,__PT_R11(%r11)
lpswe __LC_RETURN_PSW
.Lio_done:
@@ -833,6 +877,7 @@ ENTRY(io_int_handler)
ENTRY(ext_int_handler)
STCK __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
lg %r12,__LC_CURRENT
larl %r13,cleanup_critical
@@ -871,6 +916,7 @@ ENTRY(psw_idle)
.Lpsw_idle_stcctm:
#endif
oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT
+ BPON
STCK __CLOCK_IDLE_ENTER(%r2)
stpt __TIMER_IDLE_ENTER(%r2)
.Lpsw_idle_lpsw:
@@ -971,6 +1017,7 @@ load_fpu_regs:
*/
ENTRY(mcck_int_handler)
STCK __LC_MCCK_CLOCK
+ BPOFF
la %r1,4095 # validate r1
spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer
sckc __LC_CLOCK_COMPARATOR # validate comparator
@@ -1071,6 +1118,7 @@ ENTRY(mcck_int_handler)
mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
jno 0f
+ BPON
stpt __LC_EXIT_TIMER
mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
0: lmg %r11,%r15,__PT_R11(%r11)
@@ -1385,7 +1433,7 @@ cleanup_critical:
.Lsie_crit_mcck_start:
.quad .Lsie_entry
.Lsie_crit_mcck_length:
- .quad .Lsie_skip - .Lsie_entry
+ .quad .Lsie_exit - .Lsie_entry
#endif
.section .rodata, "a"
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 8ecb872..443b9b8 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -547,6 +547,7 @@ static struct kset *ipl_kset;
static void __ipl_run(void *unused)
{
+ __bpon();
diag308(DIAG308_LOAD_CLEAR, NULL);
if (MACHINE_IS_VM)
__cpcmd("IPL", NULL, 0, NULL);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 4ce68ca..9cd1696 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -319,6 +319,7 @@ static void pcpu_delegate(struct pcpu *pcpu, void (*func)(void *),
mem_assign_absolute(lc->restart_fn, (unsigned long) func);
mem_assign_absolute(lc->restart_data, (unsigned long) data);
mem_assign_absolute(lc->restart_source, source_cpu);
+ __bpon();
asm volatile(
"0: sigp 0,%0,%2 # sigp restart to target cpu\n"
" brc 2,0b # busy, try again\n"
@@ -903,6 +904,7 @@ void __cpu_die(unsigned int cpu)
void __noreturn cpu_die(void)
{
idle_task_exit();
+ __bpon();
pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0);
for (;;) ;
}
--
2.7.4
Add the PR_ISOLATE_BP operation to prctl. The effect of the process
control is to make all branch prediction entries created by the execution
of the user space code of this task not applicable to kernel code or the
code of any other task.
This can be achieved by the architecture-specific implementation
in different ways, e.g. by limiting the branch prediction for the task,
by clearing the branch prediction tables on each context switch, or
by tagging the branch prediction entries in a suitable way.
The architecture code needs to define the ISOLATE_BP macro to implement
the hardware specific details of the branch prediction isolation.
The control cannot be removed from a task once it is activated, and it
is inherited by all children of the task.
The user space wrapper to start a program with the isolated branch
prediction:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_ISOLATE_BP
#define PR_ISOLATE_BP 52	/* from the new uapi <linux/prctl.h> */
#endif

int main(int argc, char *argv[], char *envp[])
{
	int rc;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s <file-to-exec> <arguments>\n",
			argv[0]);
		exit(EXIT_FAILURE);
	}
	rc = prctl(PR_ISOLATE_BP);
	if (rc) {
		perror("PR_ISOLATE_BP");
		exit(EXIT_FAILURE);
	}
	execve(argv[1], argv + 1, envp);
	perror("execve");
	exit(EXIT_FAILURE);
}
Signed-off-by: Martin Schwidefsky <[email protected]>
---
include/uapi/linux/prctl.h | 8 ++++++++
kernel/sys.c | 6 ++++++
2 files changed, 14 insertions(+)
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index af5f8c2..e7b84c9 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -207,4 +207,12 @@ struct prctl_mm_map {
# define PR_SVE_VL_LEN_MASK 0xffff
# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
+/*
+ * Prevent branch prediction entries created by the execution of
+ * user space code of this task to be used in any other context.
+ * This makes it impossible for malicious user space code to train
+ * a branch in the kernel code or in another task to be mispredicted.
+ */
+#define PR_ISOLATE_BP 52
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 83ffd7d..e41cb2f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -117,6 +117,9 @@
#ifndef SVE_GET_VL
# define SVE_GET_VL() (-EINVAL)
#endif
+#ifndef ISOLATE_BP
+# define ISOLATE_BP() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -2398,6 +2401,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SVE_GET_VL:
error = SVE_GET_VL();
break;
+ case PR_ISOLATE_BP:
+ error = ISOLATE_BP();
+ break;
default:
error = -EINVAL;
break;
--
2.7.4
Define the ISOLATE_BP macro to enable the use of the PR_ISOLATE_BP process
control to switch a task from the standard branch prediction to a modified,
more secure but slower behaviour.
Signed-off-by: Martin Schwidefsky <[email protected]>
---
arch/s390/include/asm/processor.h | 3 +++
arch/s390/include/asm/thread_info.h | 4 +++
arch/s390/kernel/entry.S | 51 +++++++++++++++++++++++++++++++++----
arch/s390/kernel/processor.c | 8 ++++++
4 files changed, 61 insertions(+), 5 deletions(-)
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 5f37f9c..99ee222 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -378,6 +378,9 @@ extern void memcpy_absolute(void *, void *, size_t);
memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \
} while (0)
+extern int s390_isolate_bp(void);
+#define ISOLATE_BP s390_isolate_bp
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_S390_PROCESSOR_H */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 0880a37..301b4f7 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -60,6 +60,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */
#define TIF_PATCH_PENDING 5 /* pending live patching update */
#define TIF_PGSTE 6 /* New mm's will use 4K page tables */
+#define TIF_ISOLATE_BP 8 /* Run process with isolated BP */
+#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */
#define TIF_31BIT 16 /* 32bit process */
#define TIF_MEMDIE 17 /* is terminating due to OOM killer */
@@ -80,6 +82,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
#define _TIF_UPROBE _BITUL(TIF_UPROBE)
#define _TIF_GUARDED_STORAGE _BITUL(TIF_GUARDED_STORAGE)
#define _TIF_PATCH_PENDING _BITUL(TIF_PATCH_PENDING)
+#define _TIF_ISOLATE_BP _BITUL(TIF_ISOLATE_BP)
+#define _TIF_ISOLATE_BP_GUEST _BITUL(TIF_ISOLATE_BP_GUEST)
#define _TIF_31BIT _BITUL(TIF_31BIT)
#define _TIF_SINGLE_STEP _BITUL(TIF_SINGLE_STEP)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index dab716b..07e4e46 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -107,6 +107,7 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
j 3f
1: UPDATE_VTIME %r14,%r15,\timer
+ BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
2: lg %r15,__LC_ASYNC_STACK # load async stack
3: la %r11,STACK_FRAME_OVERHEAD(%r15)
.endm
@@ -187,6 +188,40 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
.popsection
.endm
+ .macro BPENTER tif_ptr,tif_mask
+ .pushsection .altinstr_replacement, "ax"
+662: .word 0xc004, 0x0000, 0x0000 # 6 byte nop
+ .word 0xc004, 0x0000, 0x0000 # 6 byte nop
+ .popsection
+664: TSTMSK \tif_ptr,\tif_mask
+ jz . + 8
+ .long 0xb2e8d000
+ .pushsection .altinstructions, "a"
+ .long 664b - .
+ .long 662b - .
+ .word 82
+ .byte 12
+ .byte 12
+ .popsection
+ .endm
+
+ .macro BPEXIT tif_ptr,tif_mask
+ TSTMSK \tif_ptr,\tif_mask
+ .pushsection .altinstr_replacement, "ax"
+662: jnz . + 8
+ .long 0xb2e8d000
+ .popsection
+664: jz . + 8
+ .long 0xb2e8c000
+ .pushsection .altinstructions, "a"
+ .long 664b - .
+ .long 662b - .
+ .word 82
+ .byte 8
+ .byte 8
+ .popsection
+ .endm
+
.section .kprobes.text, "ax"
.Ldummy:
/*
@@ -240,9 +275,11 @@ ENTRY(__switch_to)
*/
ENTRY(sie64a)
stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
+ lg %r12,__LC_CURRENT
stg %r2,__SF_EMPTY(%r15) # save control block pointer
stg %r3,__SF_EMPTY+8(%r15) # save guest register save area
xc __SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # reason code = 0
+ mvc __SF_EMPTY+24(8,%r15),__TI_flags(%r12) # copy thread flags
TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ?
jno .Lsie_load_guest_gprs
brasl %r14,load_fpu_regs # load guest fp/vx regs
@@ -259,11 +296,12 @@ ENTRY(sie64a)
jnz .Lsie_skip
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lsie_skip # exit if fp/vx regs changed
- BPON
+ BPEXIT __SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
.Lsie_entry:
sie 0(%r14)
.Lsie_exit:
BPOFF
+ BPENTER __SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
.Lsie_skip:
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
@@ -318,6 +356,7 @@ ENTRY(system_call)
la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
.Lsysc_vtime:
UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
+ BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
stmg %r0,%r7,__PT_R0(%r11)
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW
@@ -354,7 +393,7 @@ ENTRY(system_call)
jnz .Lsysc_work # check for work
TSTMSK __LC_CPU_FLAGS,_CIF_WORK
jnz .Lsysc_work
- BPON
+ BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
.Lsysc_restore:
lg %r14,__LC_VDSO_PER_CPU
lmg %r0,%r10,__PT_R0(%r11)
@@ -589,6 +628,7 @@ ENTRY(pgm_check_handler)
aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
j 4f
2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER
+ BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
lg %r15,__LC_KERNEL_STACK
lgr %r14,%r12
aghi %r14,__TASK_thread # pointer to thread_struct
@@ -702,7 +742,7 @@ ENTRY(io_int_handler)
mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
tm __PT_PSW+1(%r11),0x01 # returning to user ?
jno .Lio_exit_kernel
- BPON
+ BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
.Lio_exit_timer:
stpt __LC_EXIT_TIMER
mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
@@ -1118,7 +1158,7 @@ ENTRY(mcck_int_handler)
mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
jno 0f
- BPON
+ BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
stpt __LC_EXIT_TIMER
mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
0: lmg %r11,%r15,__PT_R11(%r11)
@@ -1245,7 +1285,8 @@ cleanup_critical:
clg %r9,BASED(.Lsie_crit_mcck_length)
jh 1f
oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-1: lg %r9,__SF_EMPTY(%r15) # get control block pointer
+1: BPENTER __SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+ lg %r9,__SF_EMPTY(%r15) # get control block pointer
ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
larl %r9,sie_exit # skip forward to sie_exit
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 5362fd8..5159636 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -197,3 +197,11 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
+
+int s390_isolate_bp(void)
+{
+ if (!test_facility(82))
+ return -EOPNOTSUPP;
+ set_thread_flag(TIF_ISOLATE_BP);
+ return 0;
+}
--
2.7.4
Clear all user space registers on entry to the kernel and all KVM guest
registers on KVM guest exit if the register does not contain either a
parameter or a result value.
Signed-off-by: Martin Schwidefsky <[email protected]>
---
arch/s390/kernel/entry.S | 41 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 07e4e46..0ee9408 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -322,6 +322,12 @@ ENTRY(sie64a)
sie_exit:
lg %r14,__SF_EMPTY+8(%r15) # load guest register save area
stmg %r0,%r13,0(%r14) # save guest gprs 0-13
+ xgr %r0,%r0 # clear guest registers
+ xgr %r1,%r1
+ xgr %r2,%r2
+ xgr %r3,%r3
+ xgr %r4,%r4
+ xgr %r5,%r5
lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers
lg %r2,__SF_EMPTY+16(%r15) # return exit reason code
br %r14
@@ -358,6 +364,7 @@ ENTRY(system_call)
UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
stmg %r0,%r7,__PT_R0(%r11)
+ xgr %r0,%r0
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW
mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC
@@ -640,6 +647,14 @@ ENTRY(pgm_check_handler)
4: lgr %r13,%r11
la %r11,STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r7,__PT_R0(%r11)
+ xgr %r0,%r0 # clear user space registers
+ xgr %r1,%r1
+ xgr %r2,%r2
+ xgr %r3,%r3
+ xgr %r4,%r4
+ xgr %r5,%r5
+ xgr %r6,%r6
+ xgr %r7,%r7
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
stmg %r8,%r9,__PT_PSW(%r11)
mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC
@@ -706,6 +721,15 @@ ENTRY(io_int_handler)
lmg %r8,%r9,__LC_IO_OLD_PSW
SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
stmg %r0,%r7,__PT_R0(%r11)
+ xgr %r0,%r0 # clear user space registers
+ xgr %r1,%r1
+ xgr %r2,%r2
+ xgr %r3,%r3
+ xgr %r4,%r4
+ xgr %r5,%r5
+ xgr %r6,%r6
+ xgr %r7,%r7
+ xgr %r10,%r10
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
stmg %r8,%r9,__PT_PSW(%r11)
mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
@@ -924,6 +948,15 @@ ENTRY(ext_int_handler)
lmg %r8,%r9,__LC_EXT_OLD_PSW
SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
stmg %r0,%r7,__PT_R0(%r11)
+ xgr %r0,%r0 # clear user space registers
+ xgr %r1,%r1
+ xgr %r2,%r2
+ xgr %r3,%r3
+ xgr %r4,%r4
+ xgr %r5,%r5
+ xgr %r6,%r6
+ xgr %r7,%r7
+ xgr %r10,%r10
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
stmg %r8,%r9,__PT_PSW(%r11)
lghi %r1,__LC_EXT_PARAMS2
@@ -1133,6 +1166,14 @@ ENTRY(mcck_int_handler)
.Lmcck_skip:
lghi %r14,__LC_GPREGS_SAVE_AREA+64
stmg %r0,%r7,__PT_R0(%r11)
+ xgr %r0,%r0 # clear user space registers
+ xgr %r2,%r2
+ xgr %r3,%r3
+ xgr %r4,%r4
+ xgr %r5,%r5
+ xgr %r6,%r6
+ xgr %r7,%r7
+ xgr %r10,%r10
mvc __PT_R8(64,%r11),0(%r14)
stmg %r8,%r9,__PT_PSW(%r11)
xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
--
2.7.4
To be able to switch off specific CPU alternatives with kernel parameters
make a copy of the facility bit mask provided by STFLE and use the copy
for the decision to apply an alternative.
Signed-off-by: Martin Schwidefsky <[email protected]>
---
arch/s390/include/asm/facility.h | 18 ++++++++++++++++++
arch/s390/include/asm/lowcore.h | 3 ++-
arch/s390/kernel/alternative.c | 3 ++-
arch/s390/kernel/early.c | 3 +++
arch/s390/kernel/setup.c | 4 +++-
arch/s390/kernel/smp.c | 4 +++-
6 files changed, 31 insertions(+), 4 deletions(-)
diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index f040644..2d58478 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -15,6 +15,24 @@
#define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8)
+static inline void __set_facility(unsigned long nr, void *facilities)
+{
+ unsigned char *ptr = (unsigned char *) facilities;
+
+ if (nr >= MAX_FACILITY_BIT)
+ return;
+ ptr[nr >> 3] |= 0x80 >> (nr & 7);
+}
+
+static inline void __clear_facility(unsigned long nr, void *facilities)
+{
+ unsigned char *ptr = (unsigned char *) facilities;
+
+ if (nr >= MAX_FACILITY_BIT)
+ return;
+ ptr[nr >> 3] &= ~(0x80 >> (nr & 7));
+}
+
static inline int __test_facility(unsigned long nr, void *facilities)
{
unsigned char *ptr;
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index ec6592e..c63986a 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -151,7 +151,8 @@ struct lowcore {
__u8 pad_0x0e20[0x0f00-0x0e20]; /* 0x0e20 */
/* Extended facility list */
- __u64 stfle_fac_list[32]; /* 0x0f00 */
+ __u64 stfle_fac_list[16]; /* 0x0f00 */
+ __u64 alt_stfle_fac_list[16]; /* 0x0f80 */
__u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */
/* Pointer to the machine check extended save area */
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 574e776..1abf4f3 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -75,7 +75,8 @@ static void __init_or_module __apply_alternatives(struct alt_instr *start,
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- if (!test_facility(a->facility))
+ if (!__test_facility(a->facility,
+ S390_lowcore.alt_stfle_fac_list))
continue;
if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 497a920..510f218 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -193,6 +193,9 @@ static noinline __init void setup_facility_list(void)
{
stfle(S390_lowcore.stfle_fac_list,
ARRAY_SIZE(S390_lowcore.stfle_fac_list));
+ memcpy(S390_lowcore.alt_stfle_fac_list,
+ S390_lowcore.stfle_fac_list,
+ sizeof(S390_lowcore.alt_stfle_fac_list));
}
static __init void detect_diag9c(void)
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 793da97..bcd2a4a 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -340,7 +340,9 @@ static void __init setup_lowcore(void)
lc->preempt_count = S390_lowcore.preempt_count;
lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- MAX_FACILITY_BIT/8);
+ sizeof(lc->stfle_fac_list));
+ memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
+ sizeof(lc->alt_stfle_fac_list));
nmi_alloc_boot_cpu(lc);
vdso_alloc_boot_cpu(lc);
lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b8c1a85..4ce68ca 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -266,7 +266,9 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
__ctl_store(lc->cregs_save_area, 0, 15);
save_access_regs((unsigned int *) lc->access_regs_save_area);
memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- MAX_FACILITY_BIT/8);
+ sizeof(lc->stfle_fac_list));
+ memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
+ sizeof(lc->alt_stfle_fac_list));
arch_spin_lock_setup(cpu);
}
--
2.7.4
On 01/23/2018 02:07 PM, Martin Schwidefsky wrote:
> Clear all user space registers on entry to the kernel and all KVM guest
> registers on KVM guest exit if the register does not contain either a
> parameter or a result value.
>
> Signed-off-by: Martin Schwidefsky <[email protected]>
you dropped my reviewed-by. Has this patch changed?
On Tue, 23 Jan 2018 14:07:02 +0100
Martin Schwidefsky <[email protected]> wrote:
> To be able to switch off specific CPU alternatives with kernel parameters
> make a copy of the facility bit mask provided by STFLE and use the copy
> for the decision to apply an alternative.
>
> Signed-off-by: Martin Schwidefsky <[email protected]>
> ---
> arch/s390/include/asm/facility.h | 18 ++++++++++++++++++
> arch/s390/include/asm/lowcore.h | 3 ++-
> arch/s390/kernel/alternative.c | 3 ++-
> arch/s390/kernel/early.c | 3 +++
> arch/s390/kernel/setup.c | 4 +++-
> arch/s390/kernel/smp.c | 4 +++-
> 6 files changed, 31 insertions(+), 4 deletions(-)
You have dropped various r-bs (including mine). Has this patch been
changed? (Doesn't look like it.)
Paolo, Radim,
this patch not only allows us to isolate a userspace process, it also allows us
to add a new interface for KVM that would let us isolate a KVM guest CPU so that
it can no longer inject branches into the host or other guests (while QEMU and
the host kernel continue to run at full power).
We just have to set the TIF bit TIF_ISOLATE_BP_GUEST for the thread that runs a
given guest CPU. This would certainly be an addon patch on top of this one at a
later point in time.
Do you think something similar would be useful for other architectures as well?
In that case we should try to come up with a cross-architecture interface to enable
that.
Christian
On 01/23/2018 02:07 PM, Martin Schwidefsky wrote:
> Define the ISOLATE_BP macro to enable the use of the PR_ISOLATE_BP process
> control to switch a task from the standard branch prediction to a modified,
> more secure but slower behaviour.
>
> Signed-off-by: Martin Schwidefsky <[email protected]>
> ---
> arch/s390/include/asm/processor.h | 3 +++
> arch/s390/include/asm/thread_info.h | 4 +++
> arch/s390/kernel/entry.S | 51 +++++++++++++++++++++++++++++++++----
> arch/s390/kernel/processor.c | 8 ++++++
> 4 files changed, 61 insertions(+), 5 deletions(-)
>
> diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
> index 5f37f9c..99ee222 100644
> --- a/arch/s390/include/asm/processor.h
> +++ b/arch/s390/include/asm/processor.h
> @@ -378,6 +378,9 @@ extern void memcpy_absolute(void *, void *, size_t);
> memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \
> } while (0)
>
> +extern int s390_isolate_bp(void);
> +#define ISOLATE_BP s390_isolate_bp
> +
> #endif /* __ASSEMBLY__ */
>
> #endif /* __ASM_S390_PROCESSOR_H */
> diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
> index 0880a37..301b4f7 100644
> --- a/arch/s390/include/asm/thread_info.h
> +++ b/arch/s390/include/asm/thread_info.h
> @@ -60,6 +60,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
> #define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */
> #define TIF_PATCH_PENDING 5 /* pending live patching update */
> #define TIF_PGSTE 6 /* New mm's will use 4K page tables */
> +#define TIF_ISOLATE_BP 8 /* Run process with isolated BP */
> +#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */
>
> #define TIF_31BIT 16 /* 32bit process */
> #define TIF_MEMDIE 17 /* is terminating due to OOM killer */
> @@ -80,6 +82,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
> #define _TIF_UPROBE _BITUL(TIF_UPROBE)
> #define _TIF_GUARDED_STORAGE _BITUL(TIF_GUARDED_STORAGE)
> #define _TIF_PATCH_PENDING _BITUL(TIF_PATCH_PENDING)
> +#define _TIF_ISOLATE_BP _BITUL(TIF_ISOLATE_BP)
> +#define _TIF_ISOLATE_BP_GUEST _BITUL(TIF_ISOLATE_BP_GUEST)
>
> #define _TIF_31BIT _BITUL(TIF_31BIT)
> #define _TIF_SINGLE_STEP _BITUL(TIF_SINGLE_STEP)
> diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
> index dab716b..07e4e46 100644
> --- a/arch/s390/kernel/entry.S
> +++ b/arch/s390/kernel/entry.S
> @@ -107,6 +107,7 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
> aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
> j 3f
> 1: UPDATE_VTIME %r14,%r15,\timer
> + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
> 2: lg %r15,__LC_ASYNC_STACK # load async stack
> 3: la %r11,STACK_FRAME_OVERHEAD(%r15)
> .endm
> @@ -187,6 +188,40 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
> .popsection
> .endm
>
> + .macro BPENTER tif_ptr,tif_mask
> + .pushsection .altinstr_replacement, "ax"
> +662: .word 0xc004, 0x0000, 0x0000 # 6 byte nop
> + .word 0xc004, 0x0000, 0x0000 # 6 byte nop
> + .popsection
> +664: TSTMSK \tif_ptr,\tif_mask
> + jz . + 8
> + .long 0xb2e8d000
> + .pushsection .altinstructions, "a"
> + .long 664b - .
> + .long 662b - .
> + .word 82
> + .byte 12
> + .byte 12
> + .popsection
> + .endm
> +
> + .macro BPEXIT tif_ptr,tif_mask
> + TSTMSK \tif_ptr,\tif_mask
> + .pushsection .altinstr_replacement, "ax"
> +662: jnz . + 8
> + .long 0xb2e8d000
> + .popsection
> +664: jz . + 8
> + .long 0xb2e8c000
> + .pushsection .altinstructions, "a"
> + .long 664b - .
> + .long 662b - .
> + .word 82
> + .byte 8
> + .byte 8
> + .popsection
> + .endm
> +
> .section .kprobes.text, "ax"
> .Ldummy:
> /*
> @@ -240,9 +275,11 @@ ENTRY(__switch_to)
> */
> ENTRY(sie64a)
> stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
> + lg %r12,__LC_CURRENT
> stg %r2,__SF_EMPTY(%r15) # save control block pointer
> stg %r3,__SF_EMPTY+8(%r15) # save guest register save area
> xc __SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # reason code = 0
> + mvc __SF_EMPTY+24(8,%r15),__TI_flags(%r12) # copy thread flags
> TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ?
> jno .Lsie_load_guest_gprs
> brasl %r14,load_fpu_regs # load guest fp/vx regs
> @@ -259,11 +296,12 @@ ENTRY(sie64a)
> jnz .Lsie_skip
> TSTMSK __LC_CPU_FLAGS,_CIF_FPU
> jo .Lsie_skip # exit if fp/vx regs changed
> - BPON
> + BPEXIT __SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
> .Lsie_entry:
> sie 0(%r14)
> .Lsie_exit:
> BPOFF
> + BPENTER __SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
> .Lsie_skip:
> ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
> lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
> @@ -318,6 +356,7 @@ ENTRY(system_call)
> la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
> .Lsysc_vtime:
> UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
> + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
> stmg %r0,%r7,__PT_R0(%r11)
> mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
> mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW
> @@ -354,7 +393,7 @@ ENTRY(system_call)
> jnz .Lsysc_work # check for work
> TSTMSK __LC_CPU_FLAGS,_CIF_WORK
> jnz .Lsysc_work
> - BPON
> + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
> .Lsysc_restore:
> lg %r14,__LC_VDSO_PER_CPU
> lmg %r0,%r10,__PT_R0(%r11)
> @@ -589,6 +628,7 @@ ENTRY(pgm_check_handler)
> aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
> j 4f
> 2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER
> + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
> lg %r15,__LC_KERNEL_STACK
> lgr %r14,%r12
> aghi %r14,__TASK_thread # pointer to thread_struct
> @@ -702,7 +742,7 @@ ENTRY(io_int_handler)
> mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
> tm __PT_PSW+1(%r11),0x01 # returning to user ?
> jno .Lio_exit_kernel
> - BPON
> + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
> .Lio_exit_timer:
> stpt __LC_EXIT_TIMER
> mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
> @@ -1118,7 +1158,7 @@ ENTRY(mcck_int_handler)
> mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
> tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
> jno 0f
> - BPON
> + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
> stpt __LC_EXIT_TIMER
> mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
> 0: lmg %r11,%r15,__PT_R11(%r11)
> @@ -1245,7 +1285,8 @@ cleanup_critical:
> clg %r9,BASED(.Lsie_crit_mcck_length)
> jh 1f
> oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
> -1: lg %r9,__SF_EMPTY(%r15) # get control block pointer
> +1: BPENTER __SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
> + lg %r9,__SF_EMPTY(%r15) # get control block pointer
> ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
> lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
> larl %r9,sie_exit # skip forward to sie_exit
> diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
> index 5362fd8..5159636 100644
> --- a/arch/s390/kernel/processor.c
> +++ b/arch/s390/kernel/processor.c
> @@ -197,3 +197,11 @@ const struct seq_operations cpuinfo_op = {
> .stop = c_stop,
> .show = show_cpuinfo,
> };
> +
> +int s390_isolate_bp(void)
> +{
> + if (!test_facility(82))
> + return -EOPNOTSUPP;
> + set_thread_flag(TIF_ISOLATE_BP);
> + return 0;
> +}
>
On Tue, 23 Jan 2018 14:09:50 +0100
Christian Borntraeger <[email protected]> wrote:
> On 01/23/2018 02:07 PM, Martin Schwidefsky wrote:
> > Clear all user space registers on entry to the kernel and all KVM guest
> > registers on KVM guest exit if the register does not contain either a
> > parameter or a result value.
> >
> > Signed-off-by: Martin Schwidefsky <[email protected]>
>
> you dropped my reviewed-by. Has this patch changed?
To drop something I have to add it in the first place..
But the patch did not change and your r-b tag is added now.
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Tue, 23 Jan 2018 14:59:47 +0100
Cornelia Huck <[email protected]> wrote:
> On Tue, 23 Jan 2018 14:07:02 +0100
> Martin Schwidefsky <[email protected]> wrote:
>
> > To be able to switch off specific CPU alternatives with kernel parameters
> > make a copy of the facility bit mask provided by STFLE and use the copy
> > for the decision to apply an alternative.
> >
> > Signed-off-by: Martin Schwidefsky <[email protected]>
> > ---
> > arch/s390/include/asm/facility.h | 18 ++++++++++++++++++
> > arch/s390/include/asm/lowcore.h | 3 ++-
> > arch/s390/kernel/alternative.c | 3 ++-
> > arch/s390/kernel/early.c | 3 +++
> > arch/s390/kernel/setup.c | 4 +++-
> > arch/s390/kernel/smp.c | 4 +++-
> > 6 files changed, 31 insertions(+), 4 deletions(-)
>
> You have dropped various r-bs (including mine). Has this patch been
> changed? (Doesn't look like it.)
Added your and David's r-b to patch #2, as well as your a-b to patch #4.
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Tue, 23 Jan 2018 15:40:06 +0100
Martin Schwidefsky <[email protected]> wrote:
> On Tue, 23 Jan 2018 14:59:47 +0100
> Cornelia Huck <[email protected]> wrote:
>
> > On Tue, 23 Jan 2018 14:07:02 +0100
> > Martin Schwidefsky <[email protected]> wrote:
> >
> > > To be able to switch off specific CPU alternatives with kernel parameters
> > > make a copy of the facility bit mask provided by STFLE and use the copy
> > > for the decision to apply an alternative.
> > >
> > > Signed-off-by: Martin Schwidefsky <[email protected]>
> > > ---
> > > arch/s390/include/asm/facility.h | 18 ++++++++++++++++++
> > > arch/s390/include/asm/lowcore.h | 3 ++-
> > > arch/s390/kernel/alternative.c | 3 ++-
> > > arch/s390/kernel/early.c | 3 +++
> > > arch/s390/kernel/setup.c | 4 +++-
> > > arch/s390/kernel/smp.c | 4 +++-
> > > 6 files changed, 31 insertions(+), 4 deletions(-)
> >
> > You have dropped various r-bs (including mine). Has this patch been
> > changed? (Doesn't look like it.)
>
> Added your and Davids r-b to patch #2, as well as your a-b to patch #4.
>
OK, thx.
On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
> Add the PR_ISOLATE_BP operation to prctl. The effect of the process
> control is to make all branch prediction entries created by the execution
> of the user space code of this task not applicable to kernel code or the
> code of any other task.
What is the rationale for requiring a per-process *opt-in* for this added
protection?
For KPTI on x86, the exact opposite approach is being discussed (see, e.g.
http://lkml.kernel.org/r/[email protected] ): By
default, play it safe, with KPTI enabled. But for "trusted" processes, one
may opt out using prctl.
Thanks,
Dominik
2018-01-23 15:21+0100, Christian Borntraeger:
> Paolo, Radim,
>
> this patch not only allows to isolate a userspace process, it also allows us
> to add a new interface for KVM that would allow us to isolate a KVM guest CPU
> to no longer being able to inject branches in any host or other guests. (while
> at the same time QEMU and host kernel can run with full power).
> We just have to set the TIF bit TIF_ISOLATE_BP_GUEST for the thread that runs a
> given CPU. This would certainly be an addon patch on top of this patch at a later
> point in time.
I think that the default should be secure, so userspace will be
breaking the isolation instead of setting it up and having just one
place to screw up would be better -- the prctl could decide which
isolation mode to pick.
Maybe we can change the conditions and break logical connection between
TIF_ISOLATE_BP and TIF_ISOLATE_BP_GUEST, to make a separate KVM
interface useful.
> Do you think something similar would be useful for other architectures as well?
It goes against my idea of virtualization, but there probably are users
that don't care about isolation and still use virtual machines ...
I expect most architectures to have a fairly similar resolution of
branch prediction leaks, so the idea should be easily abstractable on
all levels. (At least x86 is.)
> In that case we should try to come up with a cross-architecture interface to enable
> that.
Makes me think of a generic VM control "prefer performance over
security", which would also take care of future problems and let arches
decide what is worth the code.
A main drawback is that this will introduce dynamic branches to the
code, which are going to slow down the common case to speed up a niche.
On Tue, 23 Jan 2018 18:07:19 +0100
Dominik Brodowski <[email protected]> wrote:
> On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
> > Add the PR_ISOLATE_BP operation to prctl. The effect of the process
> > control is to make all branch prediction entries created by the execution
> > of the user space code of this task not applicable to kernel code or the
> > code of any other task.
>
> What is the rationale for requiring a per-process *opt-in* for this added
> protection?
>
> For KPTI on x86, the exact opposite approach is being discussed (see, e.g.
> http://lkml.kernel.org/r/[email protected] ): By
> default, play it safe, with KPTI enabled. But for "trusted" processes, one
> may opt out using prctrl.
The rationale is that there are cases where you got code from *somewhere*
and want to run it in an isolated context. Think: a docker container that
runs under KVM. But with spectre this is still not really safe. So you
include a wrapper program in the docker container to use the trap door
prctl to start the potentially malicious program. Now you should be good, no?
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Tue, 23 Jan 2018 21:32:24 +0100
Radim Krčmář <[email protected]> wrote:
> 2018-01-23 15:21+0100, Christian Borntraeger:
> > Paolo, Radim,
> >
> > this patch not only allows to isolate a userspace process, it also allows us
> > to add a new interface for KVM that would allow us to isolate a KVM guest CPU
> > to no longer being able to inject branches in any host or other guests. (while
> > at the same time QEMU and host kernel can run with full power).
> > We just have to set the TIF bit TIF_ISOLATE_BP_GUEST for the thread that runs a
> > given CPU. This would certainly be an addon patch on top of this patch at a later
> > point in time.
>
> I think that the default should be secure, so userspace will be
> breaking the isolation instead of setting it up and having just one
> place to screw up would be better -- the prctl could decide which
> isolation mode to pick.
The prctl is one direction only. Once a task is "secured" there is no way back.
If we start with a default of secure then *all* tasks will run with limited
branch prediction.
> Maybe we can change the conditions and break logical connection between
> TIF_ISOLATE_BP and TIF_ISOLATE_BP_GUEST, to make a separate KVM
> interface useful.
The thinking here is that you use TIF_ISOLATE_BP to make user space secure,
but you need to close the loophole that you could use a KVM guest to get out of
the secured mode. That is why you need to run the guest with isolated BP if
TIF_ISOLATE_BP is set. But if you want to run qemu as usual and only the
KVM guest with isolated BP, you need a second bit, thus TIF_ISOLATE_BP_GUEST.
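To make that addon concrete, here is a rough sketch of what such a helper could
look like, mirroring s390_isolate_bp() in arch/s390/kernel/processor.c from this
series. The function name and how it would be wired up to a KVM ioctl or
capability are assumptions, not part of this series:

	/*
	 * Hypothetical sketch only: mark the current vCPU thread so that
	 * the sie64a entry/exit code (BPEXIT/BPENTER testing
	 * _TIF_ISOLATE_BP_GUEST) runs the guest with isolated branch
	 * prediction while qemu and the host keep normal prediction.
	 */
	int s390_isolate_bp_guest(void)
	{
		if (!test_facility(82))
			return -EOPNOTSUPP;
		set_thread_flag(TIF_ISOLATE_BP_GUEST);
		return 0;
	}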
> > Do you think something similar would be useful for other architectures as well?
>
> It goes against my idea of virtualization, but there probably are users
> that don't care about isolation and still use virtual machines ...
> I expect most architectures to have a fairly similar resolution of
> branch prediction leaks, so the idea should be easily abstractable on
> all levels. (At least x86 is.)
Yes.
> > In that case we should try to come up with a cross-architecture interface to enable
> > that.
>
> Makes me think of a generic VM control "prefer performance over
> security", which would also take care of future problems and let arches
> decide what is worth the code.
VM as in virtual machine or VM as in virtual memory?
> A main drawback is that this will introduce dynamic branches to the
> code, which are going to slow down the common case to speed up a niche.
Where would you place these additional branches? I don't quite get the idea.
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On 01/23/2018 06:07 PM, Dominik Brodowski wrote:
> On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
>> Add the PR_ISOLATE_BP operation to prctl. The effect of the process
>> control is to make all branch prediction entries created by the execution
>> of the user space code of this task not applicable to kernel code or the
>> code of any other task.
>
> What is the rationale for requiring a per-process *opt-in* for this added
> protection?
>
> For KPTI on x86, the exact opposite approach is being discussed (see, e.g.
> http://lkml.kernel.org/r/[email protected] ): By
> default, play it safe, with KPTI enabled. But for "trusted" processes, one
> may opt out using prctrl.
FWIW, this is not about KPTI. s390 always has the kernel in a separate address
space. It's only about potential spectre-like attacks.
The idea is to be able to isolate code in controlled environments, e.g. if you have
only one thread with untrusted code (e.g. jitting remote code). The property of
the branch prediction mode on s390 is that it protects in two ways: against
being attacked, but also against being able to attack via the BTB.
On Wed, Jan 24, 2018 at 07:29:53AM +0100, Martin Schwidefsky wrote:
> On Tue, 23 Jan 2018 18:07:19 +0100
> Dominik Brodowski <[email protected]> wrote:
>
> > On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
> > > Add the PR_ISOLATE_BP operation to prctl. The effect of the process
> > > control is to make all branch prediction entries created by the execution
> > > of the user space code of this task not applicable to kernel code or the
> > > code of any other task.
> >
> > What is the rationale for requiring a per-process *opt-in* for this added
> > protection?
> >
> > For KPTI on x86, the exact opposite approach is being discussed (see, e.g.
> > http://lkml.kernel.org/r/[email protected] ): By
> > default, play it safe, with KPTI enabled. But for "trusted" processes, one
> > may opt out using prctrl.
>
> The rationale is that there are cases where you got code from *somewhere*
> and want to run it in an isolated context. Think: a docker container that
> runs under KVM. But with spectre this is still not really safe. So you
> include a wrapper program in the docker container to use the trap door
> prctl to start the potential malicious program. Now you should be good, no?
Well, partly. It may be that s390 and its use cases are special -- but as I
understand it, this uapi question goes beyond this question:
To my understanding, Linux traditionally tried to aim for the security goal
of avoiding information leaks *between* users[+], probably even between
processes of the same user. It wasn't a guarantee, and there always were
(and will be) information leaks -- and that is where additional safeguards
such as seccomp come into play, which reduce the attack surface against
unknown or unresolved security-related bugs. And everyone knew (or should
have known) that allowing "untrusted" code to be run (be it by an user, be
it JavaScript, etc.) is more risky. But still, avoiding information leaks
between users and between processes was (to my understanding) at least a
goal.[§]
In recent days however, the outlook on this issue seems to have shifted:
- Your proposal would mean to trust all userspace code, unless it is
specifically marked as untrusted. As I understand it, this would mean that
by default, spectre isn't fully mitigated cross-user and cross-process,
though the kernel could. And rogue user-run code may make use of that,
unless it is run with a special wrapper.
- Concerning x86 and IBPB, the current proposal is to limit the protection
offered by IBPB to non-dumpable processes. As I understand it, this would
mean that other processes are left hanging out to dry.[~]
- Concerning x86 and STIBP, David mentioned that "[t]here's an argument that
there are so many other information leaks between HT siblings that we
might not care"; in the last couple of hours, a proposal emerged to limit
the protection offered by STIBP to non-dumpable processes as well. To my
understanding, this would mean that many processes are left hanging out to
dry again.
I am a bit worried whether this is a sign for a shift in the security goals.
I fully understand that there might be processes (e.g. some[?] kernel
threads) and users (root) which you need to trust anyway, as they can
already access anything. Disabling additional, costly safeguards for
those special cases then seems OK. Opting out of additional protections for
single-user or single-use systems (haproxy?) might make sense as well. But
the kernel[*] not offering full[#] spectre mitigation by default for regular
users and their processes? I'm not so sure.
Thanks,
Dominik
[+] root is different.
[§] Whether such goals and their pursuit may have legal relevance -- e.g.
concerning the criminal law protection against unlawful access to data -- is
a related, fascinating topic.
[~] For example, I doubt that mutt sets the non-dumpable flag. But I
wouldn't want other users to be able to read my mail.
[#] Well, at least the best the kernel can currently and reasonably manage.
[*] Whether CPUs should enable full mitigation (IBRS_ALL) by default in future
has been discussed on this list as well.
On Wed, 2018-01-24 at 09:37 +0100, Dominik Brodowski wrote:
> On Wed, Jan 24, 2018 at 07:29:53AM +0100, Martin Schwidefsky wrote:
> >
> > On Tue, 23 Jan 2018 18:07:19 +0100
> > Dominik Brodowski <[email protected]> wrote:
> >
> > >
> > > On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
> > > >
> > > > Add the PR_ISOLATE_BP operation to prctl. The effect of the process
> > > > control is to make all branch prediction entries created by the execution
> > > > of the user space code of this task not applicable to kernel code or the
> > > > code of any other task.
> > >
> > > What is the rationale for requiring a per-process *opt-in* for this added
> > > protection?
> > >
> > > For KPTI on x86, the exact opposite approach is being discussed (see, e.g.
> > > http://lkml.kernel.org/r/[email protected] ): By
> > > default, play it safe, with KPTI enabled. But for "trusted" processes, one
> > > may opt out using prctrl.
> >
> > The rationale is that there are cases where you got code from *somewhere*
> > and want to run it in an isolated context. Think: a docker container that
> > runs under KVM. But with spectre this is still not really safe. So you
> > include a wrapper program in the docker container to use the trap door
> > prctl to start the potential malicious program. Now you should be good, no?
>
> Well, partly. It may be that s390 and its use cases are special -- but as I
> understand it, this uapi question goes beyond this question:
>
> To my understanding, Linux traditionally tried to aim for the security goal
> of avoiding information leaks *between* users[+], probably even between
> processes of the same user. It wasn't a guarantee, and there always were
> (and will be) information leaks -- and that is where additional safeguards
> such as seccomp come into play, which reduce the attack surface against
> unknown or unresolved security-related bugs. And everyone knew (or should
> have known) that allowing "untrusted" code to be run (be it by an user, be
> it JavaScript, etc.) is more risky. But still, avoiding information leaks
> between users and between processes was (to my understanding) at least a
> goal.[§]
>
> In recent days however, the outlook on this issue seems to have shifted:
>
> - Your proposal would mean to trust all userspace code, unless it is
> specifically marked as untrusted. As I understand it, this would mean that
> by default, spectre isn't fully mitigated cross-user and cross-process,
> though the kernel could. And rogue user-run code may make use of that,
> unless it is run with a special wrapper.
>
> - Concerning x86 and IPBP, the current proposal is to limit the protection
> offered by IPBP to non-dumpable processes. As I understand it, this would
> mean that other processes are left hanging out to dry.[~]
>
> - Concerning x86 and STIBP, David mentioned that "[t]here's an argument that
> there are so many other information leaks between HT siblings that we
> might not care"; in the last couple of hours, a proposal emerged to limit
> the protection offered by STIBP to non-dumpable processes as well. To my
> understanding, this would mean that many processes are left hanging out to
> dry again.
>
> I am a bit worried whether this is a sign for a shift in the security goals.
> I fully understand that there might be processes (e.g. some[?] kernel
> threads) and users (root) which you need to trust anyway, as they can
> already access anything. Disabling additional, costly safeguards for
> those special cases then seems OK. Opting out of additional protections for
> single-user or single-use systems (haproxy?) might make sense as well. But
> the kernel[*] not offering full[#] spectre mitigation by default for regular
> users and their processes? I'm not so sure.
Note that for STIBP/IBPB the operation of the flag is different in
another way. We're using it as a "protect this process from others"
flag, not a "protect others from this process" flag.
I'm not sure this is a fundamental shift in overall security goals;
more a recognition that on *current* hardware the cost of 100%
protection against an attack that was fairly unlikely in the first
place is fairly prohibitive. For a process to make itself non-dumpable
is a simple enough way to opt in. And *maybe* we could contemplate a
command line option for 'IBPB always' but I'm *really* wary of exposing
too much of that stuff, rather than simply trying to Do The Right
Thing.
> [*] Whether CPUs should enable full mitigation (IBRS_ALL) by default
> in future has been discussed on this list as well.
The kernel will do that; it's just not implemented yet because it's
slightly non-trivial and can't be fully tested yet. We *will* want to
ALTERNATIVE away the retpolines and just set IBRS_ALL because it'll be
faster to do so.
For IBRS_ALL, note that we still need the same IBPB flushes on context
switch; just not STIBP. That's because IBRS_ALL, as Linus so eloquently
reminded us, is *still* a stop-gap measure and not actually a fix.
Reading between the lines, I think tagging predictions with the ring
(and HT sibling?) they came from is the best they could slip into the
next generation without having to stop the fabs for two years while
they go back to the drawing board.
A real fix will *hopefully* come later, but unfortunately Intel haven't
even defined the bit in IA32_ARCH_CAPABILITIES which advertises "you
don't have to do any of this shit any more; we fixed it", analogous to
their RDCL_NO bit for "no more Meltdown". I'm *hoping* that's just an
oversight in preparing the doc and not looking far enough ahead, rather
than an actual *intent* to never fix it properly as Linus inferred.
Hi!
On Wed 2018-01-24 09:37:05, Dominik Brodowski wrote:
> On Wed, Jan 24, 2018 at 07:29:53AM +0100, Martin Schwidefsky wrote:
> > On Tue, 23 Jan 2018 18:07:19 +0100
> > Dominik Brodowski <[email protected]> wrote:
> >
> > > On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
> > > > Add the PR_ISOLATE_BP operation to prctl. The effect of the process
> > > > control is to make all branch prediction entries created by the execution
> > > > of the user space code of this task not applicable to kernel code or the
> > > > code of any other task.
> > >
> > > What is the rationale for requiring a per-process *opt-in* for this added
> > > protection?
> > >
> > > For KPTI on x86, the exact opposite approach is being discussed (see, e.g.
> > > http://lkml.kernel.org/r/[email protected] ): By
> > > default, play it safe, with KPTI enabled. But for "trusted" processes, one
> > > may opt out using prctrl.
> >
> > The rationale is that there are cases where you got code from *somewhere*
> > and want to run it in an isolated context. Think: a docker container that
> > runs under KVM. But with spectre this is still not really safe. So you
> > include a wrapper program in the docker container to use the trap door
> > prctl to start the potential malicious program. Now you should be good, no?
>
> Well, partly. It may be that s390 and its use cases are special -- but as I
> understand it, this uapi question goes beyond this question:
>
> To my understanding, Linux traditionally tried to aim for the security goal
> of avoiding information leaks *between* users[+], probably even between
> processes of the same user. It wasn't a guarantee, and there always
It used to be a guarantee. It still is, on non-buggy CPUs.
Leaks between users need to be prevented.
Leaks within one user should be prevented, too. There are various
ways to restrict the user these days, and for example a sandboxed
chromium process should not be able to read my ~/.ssh.
can_ptrace() is closer to "can allow leaks between these two". Still
not quite there, as code might be running in a process that
can_ptrace(), but the code has been audited by a JIT or something not to
do syscalls.
> (and will be) information leaks -- and that is where additional safeguards
> such as seccomp come into play, which reduce the attack surface against
> unknown or unresolved security-related bugs. And everyone knew (or should
> have known) that allowing "untrusted" code to be run (be it by an user, be
> it JavaScript, etc.) is more risky. But still, avoiding information leaks
> between users and between processes was (to my understanding) at least a
> goal.[§]
>
> In recent days however, the outlook on this issue seems to have shifted:
>
> - Your proposal would mean to trust all userspace code, unless it is
> specifically marked as untrusted. As I understand it, this would mean that
> by default, spectre isn't fully mitigated cross-user and cross-process,
> though the kernel could. And rogue user-run code may make use of that,
> unless it is run with a special wrapper.
Yeah, well, that proposal does not fly, then.
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
2018-01-24 07:36+0100, Martin Schwidefsky:
> On Tue, 23 Jan 2018 21:32:24 +0100
> Radim Krčmář <[email protected]> wrote:
>
> > 2018-01-23 15:21+0100, Christian Borntraeger:
> > > Paolo, Radim,
> > >
> > > this patch not only allows us to isolate a userspace process, it also allows us
> > > to add a new interface for KVM that would allow us to isolate a KVM guest CPU
> > > so that it can no longer inject branches into the host or other guests (while
> > > at the same time QEMU and the host kernel can run with full power).
> > > We just have to set the TIF bit TIF_ISOLATE_BP_GUEST for the thread that runs a
> > > given CPU. This would certainly be an addon patch on top of this patch at a later
> > > point in time.
> >
> > I think that the default should be secure, so userspace will be
> > breaking the isolation instead of setting it up, and having just one
> > place to screw up would be better -- the prctl could decide which
> > isolation mode to pick.
>
> The prctl is one direction only. Once a task is "secured" there is no way back.
Good point, I was thinking of reversing the direction and having
TIF_NOT_ISOLATE_BP_GUEST prctl, but allowing tasks to subvert security
would be even worse.
> If we start with a default of secure then *all* tasks will run with limited
> branch prediction.
Right, because all of them are untrusted. What is the performance
impact of BP isolation?
This design seems very fragile to me -- we're forcing userspace to care
about some arcane hardware implementation detail, and isolation in the
system is broken if a task running malicious code never asked for it,
for whatever reason.
> > Maybe we can change the conditions and break logical connection between
> > TIF_ISOLATE_BP and TIF_ISOLATE_BP_GUEST, to make a separate KVM
> > interface useful.
>
> The thinking here is that you use TIF_ISOLATE_BP to make user space secure,
> but you need to close the loophole that you can use a KVM guest to get out of
> the secured mode. That is why you need to run the guest with isolated BP if
> TIF_ISOLATE_BP is set. But if you want to run qemu as always and only the
> KVM guest with isolated BP you need a second bit, thus TIF_ISOLATE_GUEST_BP.
I understand, I was following the misguided idea where we have reversed
logic and then use just TIF_NOT_ISOLATE_GUEST_BP for sie switches.
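To spell out the interaction Martin describes, a C-level sketch (not taken
from the patches; the real check would sit in the SIE entry path in entry.S,
TIF_ISOLATE_BP_GUEST is the bit proposed above and TIF_ISOLATE_BP the one
from the ISOLATE_BP patch):

#include <linux/sched.h>

/* Sketch: which tasks have to enter the guest with isolated branch
 * prediction. */
static bool guest_needs_isolated_bp(struct task_struct *tsk)
{
	/* The task opted in via prctl(PR_ISOLATE_BP); the guest must not
	 * become a loophole out of the secured mode. */
	if (test_tsk_thread_flag(tsk, TIF_ISOLATE_BP))
		return true;

	/* QEMU itself keeps full prediction; only the vcpu thread is
	 * marked so that the guest runs isolated. */
	return test_tsk_thread_flag(tsk, TIF_ISOLATE_BP_GUEST);
}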
> > > Do you think something similar would be useful for other architectures as well?
> >
> > It goes against my idea of virtualization, but there probably are users
> > that don't care about isolation and still use virtual machines ...
> > I expect most architectures to have a fairly similar resolution of
> > branch prediction leaks, so the idea should be easily abstractable on
> > all levels. (At least x86 is.)
>
> Yes.
>
> > > In that case we should try to come up with a cross-architecture interface to enable
> > > that.
> >
> > Makes me think of a generic VM control "prefer performance over
> > security", which would also take care of future problems and let arches
> > decide what is worth the code.
>
> VM as in virtual machine or VM as in virtual memory?
Virtual machine. (But could be anywhere really, especially the
kernel/user split slowed applications down for too long already. :])
> > A main drawback is that this will introduce dynamic branches to the
> > code, which are going to slow down the common case to speed up a niche.
>
> Where would you place these additional branches? I don't quite get the idea.
The BP* macros contain a branch in them -- avoidable if we only had
isolated virtual machines.
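Concretely, roughly this in C pseudocode (the real BP* macros are assembly
in entry.S built from alternatives; bp_to_isolated()/bp_to_normal() are
invented stand-ins for the PPA sequences):

#include <linux/sched.h>

/* Invented stand-ins for the PPA-based prediction mode switches. */
static inline void bp_to_isolated(void) { }
static inline void bp_to_normal(void) { }

static void bp_switch_on_return_to_user(struct task_struct *tsk)
{
	/* Runs on every exit to user space, whether or not any task ever
	 * asked for isolation -- this is the dynamic branch that slows
	 * down the common case. */
	if (test_tsk_thread_flag(tsk, TIF_ISOLATE_BP))
		bp_to_isolated();
	else
		bp_to_normal();
}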
Thanks.
On Wed, 24 Jan 2018 12:15:53 +0100
Pavel Machek <[email protected]> wrote:
> Hi!
>
> On Wed 2018-01-24 09:37:05, Dominik Brodowski wrote:
> > On Wed, Jan 24, 2018 at 07:29:53AM +0100, Martin Schwidefsky wrote:
> > > On Tue, 23 Jan 2018 18:07:19 +0100
> > > Dominik Brodowski <[email protected]> wrote:
> > >
> > > > On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
> > > > > Add the PR_ISOLATE_BP operation to prctl. The effect of the process
> > > > > control is to make all branch prediction entries created by the execution
> > > > > of the user space code of this task not applicable to kernel code or the
> > > > > code of any other task.
> > > >
> > > > What is the rationale for requiring a per-process *opt-in* for this added
> > > > protection?
> > > >
> > > > For KPTI on x86, the exact opposite approach is being discussed (see, e.g.
> > > > http://lkml.kernel.org/r/[email protected] ): By
> > > > default, play it safe, with KPTI enabled. But for "trusted" processes, one
> > > > may opt out using prctl.
> > >
> > > The rationale is that there are cases where you got code from *somewhere*
> > > and want to run it in an isolated context. Think: a docker container that
> > > runs under KVM. But with spectre this is still not really safe. So you
> > > include a wrapper program in the docker container to use the trap door
> > > prctl to start the potentially malicious program. Now you should be good, no?
> >
> > Well, partly. It may be that s390 and its use cases are special -- but as I
> > understand it, this uapi question goes beyond this question:
> >
> > To my understanding, Linux traditionally tried to aim for the security goal
> > of avoiding information leaks *between* users[+], probably even between
> > processes of the same user. It wasn't a guarantee, and there always
>
> It used to be a guarantee. It still is, on non-buggy CPUs.
In a perfect world none of this would have ever happened.
But reality begs to differ.
> Leaks between users need to be prevented.
>
> Leaks between processes of the same user should be prevented, too. There are
> various ways to restrict the user these days, and for example a sandboxed
> chromium process should not be able to read my ~/.ssh.
Interesting that you mention the use case of a sandboxed browser process.
Why do you sandbox it in the first place? Because you do not trust it,
as it might download malicious JavaScript code which uses some form of
attack to read the content of your ~/.ssh files. That is the use case for
the new prctl: limit this piece of code you *identified* as untrusted.
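As an illustration of that use case (not part of the series), a minimal
wrapper that marks itself before exec'ing the code it does not trust; the
PR_ISOLATE_BP value below is a placeholder, the real number is whatever the
prctl patch puts into include/uapi/linux/prctl.h:

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_ISOLATE_BP
#define PR_ISOLATE_BP	48	/* placeholder, see the prctl patch */
#endif

int main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s program [args...]\n", argv[0]);
		return 1;
	}
	/* One-way switch; the wrapper approach relies on the mark
	 * surviving the execvp() below.  From here on, branch prediction
	 * entries created by this task are not applicable to the kernel
	 * or to any other task. */
	if (prctl(PR_ISOLATE_BP, 0, 0, 0, 0) < 0) {
		perror("prctl(PR_ISOLATE_BP)");
		return 1;
	}
	execvp(argv[1], &argv[1]);
	perror("execvp");
	return 1;
}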
> can_ptrace() is closer to "can allow leaks between these two". Still
> not quite there, as code might be running in a process that
> can_ptrace(), but the code has been audited by a JIT or something not to
> do syscalls.
>
> > (and will be) information leaks -- and that is where additional safeguards
> > such as seccomp come into play, which reduce the attack surface against
> > unknown or unresolved security-related bugs. And everyone knew (or should
> > have known) that allowing "untrusted" code to be run (be it by an user, be
> > it JavaScript, etc.) is more risky. But still, avoiding information leaks
> > between users and between processes was (to my understanding) at least a
> > goal.[§]
> >
> > In recent days however, the outlook on this issue seems to have shifted:
> >
> > - Your proposal would mean to trust all userspace code, unless it is
> > specifically marked as untrusted. As I understand it, this would mean that
> > by default, spectre isn't fully mitigated cross-user and cross-process,
> > though the kernel could. And rogue user-run code may make use of that,
> > unless it is run with a special wrapper.
>
> Yeah, well, that proposal does not fly, then.
It does not fly as a solution for the general case of cross-process attacks.
But for the special case where you can identify all of the potentially untrusted
code in your setup it should work just fine, no?
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Wed, 24 Jan 2018 09:37:05 +0100
Dominik Brodowski <[email protected]> wrote:
> To my understanding, Linux traditionally tried to aim for the security goal
> of avoiding information leaks *between* users[+], probably even between
> processes of the same user. It wasn't a guarantee, and there always were
Not between processes of the same user in general (see ptrace or use gdb).
> (and will be) information leaks -- and that is where additional safeguards
> such as seccomp come into play, which reduce the attack surface against
seccomp is irrelevant on many processors (see the ARMageddon paper). You
can (given willing partners) transfer data into and out of a seccomp
process at quite a respectable rate depending upon your hardware features.
> I am a bit worried whether this is a sign for a shift in the security goals.
> I fully understand that there might be processes (e.g. some[?] kernel
> threads) and users (root) which you need to trust anyway, as they can
dumpable is actually very useful but only in a specific way. The point is:
if process A is dumpable by process B then there is no meaningful
protection between them and you don't need to do any work. Likewise if A
and B can dump each other and are both running on the same ht pair you
don't have to worry about them attacking one another. In all those cases
they can do it with ptrace already.
[There's a corner case here of using BPF filters to block ptrace]
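For what it's worth, the closest existing building block for that test is
ptrace_may_access() in kernel/ptrace.c, which only ever checks against
*current*; a symmetric "A and B can dump each other" variant would have to
be added. A rough one-directional sketch:

#include <linux/ptrace.h>
#include <linux/sched.h>

/* Sketch only: if the current task could already ptrace/dump the
 * target, inserting a branch prediction barrier between the two adds
 * no meaningful protection. */
static bool bp_isolation_worthwhile(struct task_struct *target)
{
	return !ptrace_may_access(target, PTRACE_MODE_READ_REALCREDS);
}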
Alan
Hi!
> > On Wed 2018-01-24 09:37:05, Dominik Brodowski wrote:
> > > On Wed, Jan 24, 2018 at 07:29:53AM +0100, Martin Schwidefsky wrote:
> > > > On Tue, 23 Jan 2018 18:07:19 +0100
> > > > Dominik Brodowski <[email protected]> wrote:
> > > >
> > > > > On Tue, Jan 23, 2018 at 02:07:01PM +0100, Martin Schwidefsky wrote:
> > > Well, partly. It may be that s390 and its use cases are special -- but as I
> > > understand it, this uapi question goes beyond this question:
> > >
> > > To my understanding, Linux traditionally tried to aim for the security goal
> > > of avoiding information leaks *between* users[+], probably even between
> > > processes of the same user. It wasn't a guarantee, and there always
> >
> > It used to be a guarantee. It still is, on non-buggy CPUs.
>
> In a perfect world none of this would have ever happened.
> But reality begs to differ.
Ok, so: "Linux traditionally guarantees lack of information leaks
between PIDs". Yes, you can use ptrace, but that should be it.
> > Leaks between users need to be prevented.
> >
> > Leaks between one user should be prevented, too. There are various
> > ways to restrict the user these days, and for example sandboxed
> > chromium process should not be able to read my ~/.ssh.
>
> Interesting that you mention the use case of a sandboxed browser process.
> Why do you sandbox it in the first place? Because you do not trust it,
> as it might download malicious JavaScript code which uses some form of
> attack to read the content of your ~/.ssh files. That is the use case for
> the new prctl: limit this piece of code you *identified* as
> untrusted.
See Alan Cox's replies.
Anyway. There's more than one way to mark a process as untrusted
(setuid nobody, seccomp, chroot nowhere, ptrace jail, ...). Do not
attempt to add prctl() to the list.
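Purely to illustrate the existing knobs on that list (not a complete
sandbox): a userspace snippet that marks itself as untrusted the traditional
way; the uid/gid 65534 and the chroot directory are arbitrary examples, and
error handling is reduced to bailing out:

#include <stdlib.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>

static void drop_to_untrusted(void)
{
	/* "chroot nowhere": an empty directory, needs root to do. */
	if (chroot("/var/empty") || chdir("/"))
		exit(1);
	/* "setuid nobody": drop the group first, then the uid. */
	if (setgid(65534) || setuid(65534))
		exit(1);
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		exit(1);
	/* strict seccomp: from here on only read/write/_exit/sigreturn
	 * are allowed, anything else kills the task. */
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT))
		exit(1);
}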
> > > In recent days however, the outlook on this issue seems to have shifted:
> > >
> > > - Your proposal would mean to trust all userspace code, unless it is
> > > specifically marked as untrusted. As I understand it, this would mean that
> > > by default, spectre isn't fully mitigated cross-user and cross-process,
> > > though the kernel could. And rogue user-run code may make use of that,
> > > unless it is run with a special wrapper.
> >
> > Yeah, well, that proposal does not fly, then.
>
> It does not fly as a solution for the general case of cross-process attacks.
> But for the special case where you can identify all of the potentially untrusted
> code in your setup it should work just fine, no?
Well.. you can identify all of the untrusted code. Anything that does
not have CAP_HW_ACCESS is untrusted :-).
Anyway, no need to add prctl(): if A can ptrace B and B can ptrace A,
leaking info between them should not be a big deal. You can probably
find existing macros doing the necessary checks.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
> Anyway, no need to add prctl(): if A can ptrace B and B can ptrace A,
> leaking info between them should not be a big deal. You can probably
> find existing macros doing the necessary checks.
Until one of them is security managed so it shouldn't be able to ptrace
the other, or (and this is the nasty one) when a process is executing
code it wants to protect from the rest of the same process (eg an
untrusted jvm, javascript or probably nastiest of all webassembly)
We don't need a prctl for trusted/untrusted IMHO but we do eventually
need to think about APIs for "this lot is me but I don't trust
it" (flatpak, docker, etc) and for what JIT engines need to do.
Alan
On Wed 2018-01-24 20:46:22, Alan Cox wrote:
> > Anyway, no need to add prctl(): if A can ptrace B and B can ptrace A,
> > leaking info between them should not be a big deal. You can probably
> > find existing macros doing the necessary checks.
>
> Until one of them is security managed so it shouldn't be able to ptrace
> the other, or (and this is the nasty one) when a process is executing
> code it wants to protect from the rest of the same process (eg an
> untrusted jvm, javascript or probably nastiest of all webassembly)
>
> We don't need a prctl for trusted/untrusted IMHO but we do eventually
> need to think about APIs for "this lot is me but I don't trust
> it" (flatpak, docker, etc) and for what JIT engines need to do.
Agreed.
And yes, JITs are interesting, and given the latest
rowhammer/sidechannel attacks, something we may want to limit in
future...
It sounds nice on paper but is just risky.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Mon, 29 Jan 2018 14:14:46 +0100
Pavel Machek <[email protected]> wrote:
> On Wed 2018-01-24 20:46:22, Alan Cox wrote:
> > > Anyway, no need to add prctl(): if A can ptrace B and B can ptrace A,
> > > leaking info between them should not be a big deal. You can probably
> > > find existing macros doing the necessary checks.
> >
> > Until one of them is security managed so it shouldn't be able to ptrace
> > the other, or (and this is the nasty one) when a process is executing
> > code it wants to protect from the rest of the same process (eg an
> > untrusted jvm, javascript or probably nastiest of all webassembly)
> >
> > We don't need a prctl for trusted/untrusted IMHO but we do eventually
> > need to think about APIs for "this lot is me but I don't trust
> > it" (flatpak, docker, etc) and for what JIT engines need to do.
>
> Agreed.
>
> And yes, JITs are interesting, and given the latest
> rowhammer/sidechannel attacks, something we may want to limit in
> future...
>
> It sounds nice on paper but is just risky.
I don't think java, javascript, webassembly (and for some
implementations truetype, pdf, postscript, ... and more) are going away
in a hurry.
Alan