Hi,
The first version of this series[1] was briefly in linux-next but had to be
reverted due to a bug where schedule() would end up being called while
user_access was active[2].
After clarifications[3], rescheduling while in a user_access region is not
allowed.
* Patches 1 and 2 implement the unsafe accessors for arm64
* Patches 3 and 4 clarify this restriction in the API and attempt to
check for violations of the restriction.
Changes since v2[4]:
- Rebase on v5.0-rc2
- access_ok() is now done in user_access_begin(), so rework accessors
so access_ok() is not called in unsafe_get/put_user()
- Split addition of unsafe accessors and the user_access_region check
into separate patches
- Avoid reading SCTLR_EL1 in user_access_region check
- Add build option for user_access_region checking
- Reword clarifications on unsafe accessors API
Changes since v1[1]:
- Add a way to detect code calling schedule within a user_access region
- Make sure put_user/get_user arguments are evaluated before disabling PAN
[1] https://www.spinics.net/lists/arm-kernel/msg674925.html
[2] https://patchwork.kernel.org/patch/10634783/
[3] https://lkml.org/lkml/2018/11/28/50
[4] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-December/617080.html
Cheers,
Julien
-->
Julien Thierry (4):
arm64: uaccess: Cleanup get/put_user()
arm64: uaccess: Implement unsafe accessors
uaccess: Check no rescheduling function is called in unsafe region
arm64: uaccess: Implement user_access_region_active
arch/arm64/include/asm/sysreg.h | 2 +
arch/arm64/include/asm/uaccess.h | 123 ++++++++++++++++++++++++++-------------
include/linux/kernel.h | 11 +++-
include/linux/uaccess.h | 13 +++++
kernel/sched/core.c | 22 +++++++
lib/Kconfig.debug | 8 +++
6 files changed, 135 insertions(+), 44 deletions(-)
--
1.9.1
The current implementation of get/put_user_unsafe defaults to get/put_user,
which toggles PAN before each access, despite the caller having indicated
that multiple accesses to user memory are about to happen.
Provide implementations for user_access_begin/end to turn PAN off/on and
implement unsafe accessors that assume PAN was already turned off.
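
For illustration, a minimal usage sketch of the resulting API (a hypothetical
caller, not part of the patch; unsafe_get_user()'s third argument is a goto
label taken on fault):

	/* Hypothetical helper: read two words with a single PAN toggle. */
	static int read_two_words(u64 __user *src, u64 *a, u64 *b)
	{
		if (!user_access_begin(src, 2 * sizeof(u64)))	/* access_ok() + PAN off */
			return -EFAULT;

		unsafe_get_user(*a, &src[0], efault);	/* no PAN toggle per access */
		unsafe_get_user(*b, &src[1], efault);
		user_access_end();			/* PAN back on */
		return 0;

	efault:
		user_access_end();
		return -EFAULT;
	}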
Signed-off-by: Julien Thierry <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Will Deacon <[email protected]>
---
arch/arm64/include/asm/uaccess.h | 79 ++++++++++++++++++++++++++++++----------
1 file changed, 59 insertions(+), 20 deletions(-)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 8e40808..6a70c75 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -270,31 +270,26 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
#define __raw_get_user(x, ptr, err) \
do { \
- unsigned long __gu_val; \
- __chk_user_ptr(ptr); \
- uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
- __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \
+ __get_user_asm("ldrb", "ldtrb", "%w", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
case 2: \
- __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \
+ __get_user_asm("ldrh", "ldtrh", "%w", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
case 4: \
- __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \
+ __get_user_asm("ldr", "ldtr", "%w", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
case 8: \
- __get_user_asm("ldr", "ldtr", "%x", __gu_val, (ptr), \
+ __get_user_asm("ldr", "ldtr", "%x", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
default: \
BUILD_BUG(); \
} \
- uaccess_disable_not_uao(); \
- (x) = (__force __typeof__(*(ptr)))__gu_val; \
} while (0)
#define __get_user_error(x, ptr, err) \
@@ -302,8 +297,13 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
__typeof__(*(ptr)) __user *__p = (ptr); \
might_fault(); \
if (access_ok(__p, sizeof(*__p))) { \
+ unsigned long __gu_val; \
+ __chk_user_ptr(__p); \
__p = uaccess_mask_ptr(__p); \
- __raw_get_user((x), __p, (err)); \
+ uaccess_enable_not_uao(); \
+ __raw_get_user(__gu_val, __p, (err)); \
+ uaccess_disable_not_uao(); \
+ (x) = (__force __typeof__(*__p)) __gu_val; \
} else { \
(x) = 0; (err) = -EFAULT; \
} \
@@ -334,30 +334,26 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
#define __raw_put_user(x, ptr, err) \
do { \
- __typeof__(*(ptr)) __pu_val = (x); \
- __chk_user_ptr(ptr); \
- uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
- __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \
+ __put_user_asm("strb", "sttrb", "%w", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
case 2: \
- __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \
+ __put_user_asm("strh", "sttrh", "%w", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
case 4: \
- __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \
+ __put_user_asm("str", "sttr", "%w", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
case 8: \
- __put_user_asm("str", "sttr", "%x", __pu_val, (ptr), \
+ __put_user_asm("str", "sttr", "%x", (x), (ptr), \
(err), ARM64_HAS_UAO); \
break; \
default: \
BUILD_BUG(); \
} \
- uaccess_disable_not_uao(); \
} while (0)
#define __put_user_error(x, ptr, err) \
@@ -365,9 +361,13 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
__typeof__(*(ptr)) __user *__p = (ptr); \
might_fault(); \
if (access_ok(__p, sizeof(*__p))) { \
+ __typeof__(*(ptr)) __pu_val = (x); \
+ __chk_user_ptr(__p); \
__p = uaccess_mask_ptr(__p); \
- __raw_put_user((x), __p, (err)); \
- } else { \
+ uaccess_enable_not_uao(); \
+ __raw_put_user(__pu_val, __p, (err)); \
+ uaccess_disable_not_uao(); \
+ } else { \
(err) = -EFAULT; \
} \
} while (0)
@@ -381,6 +381,45 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
#define put_user __put_user
+static __must_check inline bool user_access_begin(const void __user *ptr,
+ size_t len)
+{
+ if (unlikely(!access_ok(ptr, len)))
+ return false;
+
+ uaccess_enable_not_uao();
+ return true;
+}
+#define user_access_begin(ptr, len) user_access_begin(ptr, len)
+#define user_access_end() uaccess_disable_not_uao()
+
+#define unsafe_get_user(x, ptr, err) \
+do { \
+ __typeof__(*(ptr)) __user *__p = (ptr); \
+ unsigned long __gu_val; \
+ int __gu_err = 0; \
+ might_fault(); \
+ __chk_user_ptr(__p); \
+ __p = uaccess_mask_ptr(__p); \
+ __raw_get_user(__gu_val, __p, __gu_err); \
+ (x) = (__force __typeof__(*__p)) __gu_val; \
+ if (__gu_err != 0) \
+ goto err; \
+} while (0)
+
+#define unsafe_put_user(x, ptr, err) \
+do { \
+ __typeof__(*(ptr)) __user *__p = (ptr); \
+ __typeof__(*(ptr)) __pu_val = (x); \
+ int __pu_err = 0; \
+ might_fault(); \
+ __chk_user_ptr(__p); \
+ __p = uaccess_mask_ptr(__p); \
+ __raw_put_user(__pu_val, __p, __pu_err); \
+ if (__pu_err != 0) \
+ goto err; \
+} while (0)
+
extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
#define raw_copy_from_user(to, from, n) \
({ \
--
1.9.1
The __get/put_user_check() macros are made to return a value, but this is
never used. Get rid of them and use __get/put_user_error() directly as
a statement, reducing macro indirection.
Also, take this opportunity to rename __get/put_user_err(), as it gets
a bit confusing having it alongside __get/put_user_error().
Signed-off-by: Julien Thierry <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Will Deacon <[email protected]>
---
arch/arm64/include/asm/uaccess.h | 36 ++++++++++++------------------------
1 file changed, 12 insertions(+), 24 deletions(-)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 547d7a0..8e40808 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -268,7 +268,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
: "+r" (err), "=&r" (x) \
: "r" (addr), "i" (-EFAULT))
-#define __get_user_err(x, ptr, err) \
+#define __raw_get_user(x, ptr, err) \
do { \
unsigned long __gu_val; \
__chk_user_ptr(ptr); \
@@ -297,28 +297,22 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
(x) = (__force __typeof__(*(ptr)))__gu_val; \
} while (0)
-#define __get_user_check(x, ptr, err) \
-({ \
+#define __get_user_error(x, ptr, err) \
+do { \
__typeof__(*(ptr)) __user *__p = (ptr); \
might_fault(); \
if (access_ok(__p, sizeof(*__p))) { \
__p = uaccess_mask_ptr(__p); \
- __get_user_err((x), __p, (err)); \
+ __raw_get_user((x), __p, (err)); \
} else { \
(x) = 0; (err) = -EFAULT; \
} \
-})
-
-#define __get_user_error(x, ptr, err) \
-({ \
- __get_user_check((x), (ptr), (err)); \
- (void)0; \
-})
+} while (0)
#define __get_user(x, ptr) \
({ \
int __gu_err = 0; \
- __get_user_check((x), (ptr), __gu_err); \
+ __get_user_error((x), (ptr), __gu_err); \
__gu_err; \
})
@@ -338,7 +332,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
: "+r" (err) \
: "r" (x), "r" (addr), "i" (-EFAULT))
-#define __put_user_err(x, ptr, err) \
+#define __raw_put_user(x, ptr, err) \
do { \
__typeof__(*(ptr)) __pu_val = (x); \
__chk_user_ptr(ptr); \
@@ -366,28 +360,22 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
uaccess_disable_not_uao(); \
} while (0)
-#define __put_user_check(x, ptr, err) \
-({ \
+#define __put_user_error(x, ptr, err) \
+do { \
__typeof__(*(ptr)) __user *__p = (ptr); \
might_fault(); \
if (access_ok(__p, sizeof(*__p))) { \
__p = uaccess_mask_ptr(__p); \
- __put_user_err((x), __p, (err)); \
+ __raw_put_user((x), __p, (err)); \
} else { \
(err) = -EFAULT; \
} \
-})
-
-#define __put_user_error(x, ptr, err) \
-({ \
- __put_user_check((x), (ptr), (err)); \
- (void)0; \
-})
+} while (0)
#define __put_user(x, ptr) \
({ \
int __pu_err = 0; \
- __put_user_check((x), (ptr), __pu_err); \
+ __put_user_error((x), (ptr), __pu_err); \
__pu_err; \
})
--
1.9.1
Provide the arm64 implementation of user_access_region_active,
checking the status of PAN or SW PAN as needed.
Signed-off-by: Julien Thierry <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Will Deacon <[email protected]>
---
arch/arm64/include/asm/sysreg.h | 2 ++
arch/arm64/include/asm/uaccess.h | 12 ++++++++++++
2 files changed, 14 insertions(+)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 72dc4c0..f0b2f32 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -114,6 +114,8 @@
#define SYS_DC_CSW sys_insn(1, 0, 7, 10, 2)
#define SYS_DC_CISW sys_insn(1, 0, 7, 14, 2)
+#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3)
+
#define SYS_OSDTRRX_EL1 sys_reg(2, 0, 0, 0, 2)
#define SYS_MDCCINT_EL1 sys_reg(2, 0, 0, 2, 0)
#define SYS_MDSCR_EL1 sys_reg(2, 0, 0, 2, 2)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 6a70c75..1a8acc9 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -225,6 +225,18 @@ static inline void uaccess_enable_not_uao(void)
__uaccess_enable(ARM64_ALT_PAN_NOT_UAO);
}
+#define unsafe_user_region_active uaccess_region_active
+static inline bool uaccess_region_active(void)
+{
+ if (system_uses_ttbr0_pan()) {
+ return read_sysreg(ttbr1_el1) & TTBR_ASID_MASK;
+ } else if (cpus_have_const_cap(ARM64_ALT_PAN_NOT_UAO) &&
+ alternatives_applied) {
+ return !read_sysreg_s(SYS_PSTATE_PAN);
+ }
+ return false;
+}
+
/*
* Sanitise a uaccess pointer such that it becomes NULL if above the
* current addr_limit.
--
1.9.1
Rescheduling while running in a user_access region is not supported.
Add an overridable primitive to indicate whether a user_access region is
active, and check that this is not the case when calling rescheduling
functions.
These checks are only performed when DEBUG_UACCESS_SLEEP is selected.
Also, add a comment clarifying the behaviour of user_access regions.
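
For illustration, a minimal sketch (hypothetical code, not from this patch) of
the kind of misuse the new check is intended to catch:

	if (user_access_begin(uptr, len)) {
		/* ... unsafe_get/put_user() accesses ... */
		mutex_lock(&lock);	/* may sleep/schedule inside the region */
		mutex_unlock(&lock);
		user_access_end();
	}

With DEBUG_UACCESS_SLEEP enabled, the might_resched()/schedule_debug() hooks
added below should warn about the mutex_lock() call, since it may end up in
schedule() while the user access region is still active.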
Signed-off-by: Julien Thierry <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
include/linux/kernel.h | 11 +++++++++--
include/linux/uaccess.h | 13 +++++++++++++
kernel/sched/core.c | 22 ++++++++++++++++++++++
lib/Kconfig.debug | 8 ++++++++
4 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8f0e68e..73f1f82 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -237,11 +237,18 @@
struct pt_regs;
struct user;
+#ifdef CONFIG_DEBUG_UACCESS_SLEEP
+extern void __might_resched(const char *file, int line);
+#else
+#define __might_resched(file, line) do { } while (0)
+#endif
+
#ifdef CONFIG_PREEMPT_VOLUNTARY
extern int _cond_resched(void);
-# define might_resched() _cond_resched()
+# define might_resched() \
+ do { __might_resched(__FILE__, __LINE__); _cond_resched(); } while (0)
#else
-# define might_resched() do { } while (0)
+# define might_resched() __might_resched(__FILE__, __LINE__)
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 37b226e..2c0c39e 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -263,6 +263,15 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
#define probe_kernel_address(addr, retval) \
probe_kernel_read(&retval, addr, sizeof(retval))
+/*
+ * user_access_begin() and user_access_end() define a region where
+ * unsafe user accessors can be used. Exceptions and interrupt shall exit the
+ * user_access region and re-enter it when returning to the interrupted context.
+ *
+ * No sleeping function should get called during a user_access region - we rely
+ * on exception handling to take care of the user_access status for us, but that
+ * doesn't happen when directly calling schedule().
+ */
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
@@ -270,6 +279,10 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
#endif
+#ifndef unsafe_user_region_active
+#define unsafe_user_region_active() false
+#endif
+
#ifdef CONFIG_HARDENED_USERCOPY
void usercopy_warn(const char *name, const char *detail, bool to_user,
unsigned long offset, unsigned long len);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a674c7db..b1bb7e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
}
+
+ if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
+ unlikely(unsafe_user_region_active())) {
+ printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+ dump_stack();
+ }
+
rcu_sleep_check();
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6151,6 +6159,20 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
EXPORT_SYMBOL(___might_sleep);
#endif
+#ifdef CONFIG_DEBUG_UACCESS_SLEEP
+void __might_resched(const char *file, int line)
+{
+ if (!unsafe_user_region_active())
+ return;
+
+ printk(KERN_ERR
+ "BUG: rescheduling function called from user access context at %s:%d\n",
+ file, line);
+ dump_stack();
+}
+EXPORT_SYMBOL(__might_resched);
+#endif
+
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d4df5b2..d030e31 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2069,6 +2069,14 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_UACCESS_SLEEP
+ bool "Check sleep inside a user access region"
+ depends on DEBUG_KERNEL
+ help
+ If you say Y here, various routines which may sleep will become very
+ noisy if they are called inside a user access region (i.e. between
+ a user_access_begin() and a user_access_end())
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
--
1.9.1
Hi Julien,
On Tue, Jan 15, 2019 at 01:58:25PM +0000, Julien Thierry wrote:
> Julien Thierry (4):
> arm64: uaccess: Cleanup get/put_user()
> arm64: uaccess: Implement unsafe accessors
> uaccess: Check no rescheduling function is called in unsafe region
> arm64: uaccess: Implement user_access_region_active
I queued the first two patches for 5.1. It would be nice to upstream the
other two, but patch 3 needs an ack from (or to be merged by) the scheduler
folk.
Thanks.
--
Catalin
Hi,
Gentle ping for patches 3 and 4.
Thanks,
Julien
On 15/01/2019 13:58, Julien Thierry wrote:
> Hi,
>
> First version of this series[1] was briefly in linux-next but had to be
> reverted due to a bug where schedule would end up being called while
> user_access was active[2].
>
> After clarifications[3], rescheduling while in a user_access region is not
> allowed.
>
> * Patches 1 and 2 implement the unsafe accessors for arm64
> * Patches 3 and 4 clarify this restriction in the API and attempts to
> check against violations of the restriction.
>
> Changes since v2[4]:
> - Rebase on v5.0-rc2
> - access_ok() is now done in user_access_begin(), so rework accessors
> so access_ok() is not called in unsafe_get/put_user()
> - Split addition of unsafe accessors and the user_access_region check
> into separate patches
> - Avoid reading SCTLR_EL1 in user_access_region check
> - Add build option for user_access_region checking
> - Reword clarifications on unsafe accessors API
>
> Changes since v1[1]:
> - Add a way to detect code calling schedule within a user_access region
> - Make sure put_user/get_user arguments are evaluated before disabling PAN
>
> [1] https://www.spinics.net/lists/arm-kernel/msg674925.html
> [2] https://patchwork.kernel.org/patch/10634783/
> [3] https://lkml.org/lkml/2018/11/28/50
> [4] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-December/617080.html
>
> Cheers,
>
> Julien
>
> -->
>
> Julien Thierry (4):
> arm64: uaccess: Cleanup get/put_user()
> arm64: uaccess: Implement unsafe accessors
> uaccess: Check no rescheduling function is called in unsafe region
> arm64: uaccess: Implement user_access_region_active
>
> arch/arm64/include/asm/sysreg.h | 2 +
> arch/arm64/include/asm/uaccess.h | 123 ++++++++++++++++++++++++++-------------
> include/linux/kernel.h | 11 +++-
> include/linux/uaccess.h | 13 +++++
> kernel/sched/core.c | 22 +++++++
> lib/Kconfig.debug | 8 +++
> 6 files changed, 135 insertions(+), 44 deletions(-)
>
> --
> 1.9.1
>
--
Julien Thierry
On 15/01/2019 13:58, Julien Thierry wrote:
[...]> @@ -6151,6 +6159,20 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
> EXPORT_SYMBOL(___might_sleep);
> #endif
>
> +#ifdef CONFIG_DEBUG_UACCESS_SLEEP
> +void __might_resched(const char *file, int line)
> +{
> + if (!unsafe_user_region_active())
> + return;
> +
> + printk(KERN_ERR
> + "BUG: rescheduling function called from user access context at %s:%d\n",
> + file, line);
> + dump_stack();
Since I've been staring intensely at ___might_sleep() lately, I was thinking
we could "copy" it a bit more closely (sorry for going back on what I said
earlier).
Coming back to the double warnings (__might_resched() + schedule_debug()),
it might be better to drop the schedule_debug() warning and just have the
one in __might_resched() - if something goes wrong, there'll already be a
"BUG" in the log.
> +}
> +EXPORT_SYMBOL(__might_resched);
> +#endif
> +
> #ifdef CONFIG_MAGIC_SYSRQ
> void normalize_rt_tasks(void)
> {
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index d4df5b2..d030e31 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -2069,6 +2069,14 @@ config IO_STRICT_DEVMEM
>
> If in doubt, say Y.
>
> +config DEBUG_UACCESS_SLEEP
> + bool "Check sleep inside a user access region"
> + depends on DEBUG_KERNEL
> + help
> + If you say Y here, various routines which may sleep will become very
> + noisy if they are called inside a user access region (i.e. between
> + a user_access_begin() and a user_access_end())
If it does get noisy, we should go for some ratelimiting - it's probably
good practice even if it is not noisy actually.
___might_sleep() has this:
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
> +
> source "arch/$(SRCARCH)/Kconfig.debug"
>
> endmenu # Kernel hacking
> --
> 1.9.1
>
On 30/01/2019 16:58, Valentin Schneider wrote:
> On 15/01/2019 13:58, Julien Thierry wrote:
> [...]> @@ -6151,6 +6159,20 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
>> EXPORT_SYMBOL(___might_sleep);
>> #endif
>>
>> +#ifdef CONFIG_DEBUG_UACCESS_SLEEP
>> +void __might_resched(const char *file, int line)
>> +{
>> + if (!unsafe_user_region_active())
>> + return;
>> +
>> + printk(KERN_ERR
>> + "BUG: rescheduling function called from user access context at %s:%d\n",
>> + file, line);
>> + dump_stack();
>
> Since I've been staring intensely at ___might_sleep() lately, I was thinking
> we could "copy" it a bit more closely (sorry for going back on what I said
> earlier).
>
> Coming back to the double warnings (__might_resched() + schedule_debug()),
> it might be better to drop the schedule_debug() warning and just have the
> one in __might_resched() - if something goes wrong, there'll already be a
> "BUG" in the log.
>
My only worry with that approach is that if someone has a function that
reschedules but does not include the __might_resched() annotation, we'd
miss the fact that something went wrong.
But that might be a lot of "if"s, since it assumes that something does
go wrong.
Could I have a maintainer's opinion on this to know whether I should respin it?
>> +}
>> +EXPORT_SYMBOL(__might_resched);
>> +#endif
>> +
>> #ifdef CONFIG_MAGIC_SYSRQ
>> void normalize_rt_tasks(void)
>> {
>> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
>> index d4df5b2..d030e31 100644
>> --- a/lib/Kconfig.debug
>> +++ b/lib/Kconfig.debug
>> @@ -2069,6 +2069,14 @@ config IO_STRICT_DEVMEM
>>
>> If in doubt, say Y.
>>
>> +config DEBUG_UACCESS_SLEEP
>> + bool "Check sleep inside a user access region"
>> + depends on DEBUG_KERNEL
>> + help
>> + If you say Y here, various routines which may sleep will become very
>> + noisy if they are called inside a user access region (i.e. between
>> + a user_access_begin() and a user_access_end())
>
> If it does get noisy, we should go for some ratelimiting - it's probably
> good practice even if it is not noisy actually.
>
> ___might_sleep() has this:
>
> if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
> return;
> prev_jiffy = jiffies;
>
I guess the noisiness could depend on the arch-specific handling of user
accesses, so yes, I guess it might be a good idea to add this.
Thanks,
--
Julien Thierry
* Julien Thierry <[email protected]> wrote:
> While running a user_access regions, it is not supported to reschedule.
> Add an overridable primitive to indicate whether a user_access region is
> active and check that this is not the case when calling rescheduling
> functions.
>
> These checks are only performed when DEBUG_UACCESS_SLEEP is selected.
>
> Also, add a comment clarifying the behaviour of user_access regions.
>
> Signed-off-by: Julien Thierry <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
>
> ---
> include/linux/kernel.h | 11 +++++++++--
> include/linux/uaccess.h | 13 +++++++++++++
> kernel/sched/core.c | 22 ++++++++++++++++++++++
> lib/Kconfig.debug | 8 ++++++++
> 4 files changed, 52 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index 8f0e68e..73f1f82 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -237,11 +237,18 @@
> struct pt_regs;
> struct user;
>
> +#ifdef CONFIG_DEBUG_UACCESS_SLEEP
> +extern void __might_resched(const char *file, int line);
> +#else
> +#define __might_resched(file, line) do { } while (0)
> +#endif
> +
> #ifdef CONFIG_PREEMPT_VOLUNTARY
> extern int _cond_resched(void);
> -# define might_resched() _cond_resched()
> +# define might_resched() \
> + do { __might_resched(__FILE__, __LINE__); _cond_resched(); } while (0)
> #else
> -# define might_resched() do { } while (0)
> +# define might_resched() __might_resched(__FILE__, __LINE__)
> #endif
>
> #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index 37b226e..2c0c39e 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -263,6 +263,15 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
> #define probe_kernel_address(addr, retval) \
> probe_kernel_read(&retval, addr, sizeof(retval))
>
> +/*
> + * user_access_begin() and user_access_end() define a region where
> + * unsafe user accessors can be used. Exceptions and interrupt shall exit the
> + * user_access region and re-enter it when returning to the interrupted context.
> + *
> + * No sleeping function should get called during a user_access region - we rely
> + * on exception handling to take care of the user_access status for us, but that
> + * doesn't happen when directly calling schedule().
> + */
> #ifndef user_access_begin
> #define user_access_begin(ptr,len) access_ok(ptr, len)
> #define user_access_end() do { } while (0)
> @@ -270,6 +279,10 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
> #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
> #endif
>
> +#ifndef unsafe_user_region_active
> +#define unsafe_user_region_active() false
> +#endif
> +
> #ifdef CONFIG_HARDENED_USERCOPY
> void usercopy_warn(const char *name, const char *detail, bool to_user,
> unsigned long offset, unsigned long len);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a674c7db..b1bb7e9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
> __schedule_bug(prev);
> preempt_count_set(PREEMPT_DISABLED);
> }
> +
> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
> + unlikely(unsafe_user_region_active())) {
> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
> + prev->comm, prev->pid, preempt_count());
> + dump_stack();
> + }
> +
> rcu_sleep_check();
>
> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> @@ -6151,6 +6159,20 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
> EXPORT_SYMBOL(___might_sleep);
> #endif
>
> +#ifdef CONFIG_DEBUG_UACCESS_SLEEP
> +void __might_resched(const char *file, int line)
> +{
> + if (!unsafe_user_region_active())
> + return;
Could you please more clearly explain why you want/need an exception from
the __might_resched() debug warning?
Thanks,
Ingo
On Mon, Feb 11, 2019 at 02:45:27PM +0100, Ingo Molnar wrote:
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index a674c7db..b1bb7e9 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
> > __schedule_bug(prev);
> > preempt_count_set(PREEMPT_DISABLED);
> > }
> > +
> > + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
> > + unlikely(unsafe_user_region_active())) {
> > + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
> > + prev->comm, prev->pid, preempt_count());
> > + dump_stack();
> > + }
> > +
> > rcu_sleep_check();
> >
> > profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> > @@ -6151,6 +6159,20 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
> > EXPORT_SYMBOL(___might_sleep);
> > #endif
> >
> > +#ifdef CONFIG_DEBUG_UACCESS_SLEEP
> > +void __might_resched(const char *file, int line)
> > +{
> > + if (!unsafe_user_region_active())
> > + return;
>
> Could you please more clearly explain why you want/need an exception from
> the __might_resched() debug warning?
In specific; how is the addition in schedule_debug() not triggering on
PREEMPT=y kernels?
If code is preemptible, you can (get) schedule(d). If it is not
preemptible; you do not need these additional tests.
On 11/02/2019 13:51, Peter Zijlstra wrote:
> On Mon, Feb 11, 2019 at 02:45:27PM +0100, Ingo Molnar wrote:
>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>> index a674c7db..b1bb7e9 100644
>>> --- a/kernel/sched/core.c
>>> +++ b/kernel/sched/core.c
>>> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
>>> __schedule_bug(prev);
>>> preempt_count_set(PREEMPT_DISABLED);
>>> }
>>> +
>>> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
>>> + unlikely(unsafe_user_region_active())) {
>>> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
>>> + prev->comm, prev->pid, preempt_count());
>>> + dump_stack();
>>> + }
>>> +
>>> rcu_sleep_check();
>>>
>>> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
>>> @@ -6151,6 +6159,20 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
>>> EXPORT_SYMBOL(___might_sleep);
>>> #endif
>>>
>>> +#ifdef CONFIG_DEBUG_UACCESS_SLEEP
>>> +void __might_resched(const char *file, int line)
>>> +{
>>> + if (!unsafe_user_region_active())
>>> + return;
>>
>> Could you please more clearly explain why you want/need an exception from
>> the __might_resched() debug warning?
So, the scenarios I'm trying to avoid are of the following flavour:
if (user_access_begin(ptr, size)) {
[...]
// Calling a function that might call schedule()
[...]
user_access_end();
}
The thing is, as I understand it, not all functions that call schedule() are
annotated with might_resched(), and on the other hand, a function that calls
might_resched() will not necessarily call schedule() every time.
Now with Peter's remark I think I might have been overzealous.
>
> In specific; how is the addition in schedule_debug() not triggering on
> PREEMPT=y kernels?
>
> If code is preemptible, you can (get) schedule(d). If it is not
> preemptible; you do not need these additional tests.
>
Yes, that sounds right: might_resched() only potentially reschedules if called
in a suitable context, so in the best case I issue two warnings, and in the
worst case I warn even though the caller took care to disable preemption or
interrupts before calling might_resched().
I guess I got a bit confused with might_sleep(), which is "if you call
this in the wrong context I warn", whereas might_resched() is just "if
you call this in a preemptible context, let's resched".
I guess I'll drop the might_resched() part of this patch if that sounds
alright.
Thanks,
--
Julien Thierry
* Julien Thierry <[email protected]> wrote:
> I guess I'll drop the might_resched() part of this patch if that sounds
> alright.
That sounds perfect - that bit was pretty much the only problem I had
with the series.
Thanks,
Ingo
On Tue, Feb 12, 2019 at 09:15:13AM +0000, Julien Thierry wrote:
> >>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >>> index a674c7db..b1bb7e9 100644
> >>> --- a/kernel/sched/core.c
> >>> +++ b/kernel/sched/core.c
> >>> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
> >>> __schedule_bug(prev);
> >>> preempt_count_set(PREEMPT_DISABLED);
> >>> }
> >>> +
> >>> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
> >>> + unlikely(unsafe_user_region_active())) {
> >>> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
> >>> + prev->comm, prev->pid, preempt_count());
> >>> + dump_stack();
> >>> + }
> >>> +
> >>> rcu_sleep_check();
> >>>
> >>> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> I guess I'll drop the might_resched() part of this patch if that sounds
> alright.
I'm still confused by the schedule_debug() part. How is that not broken?
On 13/02/2019 10:35, Peter Zijlstra wrote:
> On Tue, Feb 12, 2019 at 09:15:13AM +0000, Julien Thierry wrote:
>
>>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>>>> index a674c7db..b1bb7e9 100644
>>>>> --- a/kernel/sched/core.c
>>>>> +++ b/kernel/sched/core.c
>>>>> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
>>>>> __schedule_bug(prev);
>>>>> preempt_count_set(PREEMPT_DISABLED);
>>>>> }
>>>>> +
>>>>> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
>>>>> + unlikely(unsafe_user_region_active())) {
>>>>> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
>>>>> + prev->comm, prev->pid, preempt_count());
>>>>> + dump_stack();
>>>>> + }
>>>>> +
>>>>> rcu_sleep_check();
>>>>>
>>>>> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
>
>> I guess I'll drop the might_resched() part of this patch if that sounds
>> alright.
>
> I'm still confused by the schedule_debug() part. How is that not broken?
Hmmm, I am not exactly sure which part you expect to be broken, I guess
it's because of the nature of the uaccess unsafe accessor usage.
Basically, the following is a definite no:
if (user_access_begin(ptr, size)) {
[...]
//something that calls schedule
[...]
user_access_end();
}
However the following is fine:
- user_access_begin(ptr, size)
- taking irq/exception
- get preempted
- get resumed at some point in time
- restore state + eret
- user_access_end()
That's because exceptions/irq implicitly "suspend" the user access
region. (That's what I'm trying to clarify with the comment)
So, unsafe_user_region_active() should return false in an irq/exception
context.
Is this what you were concerned about? Or there still something that
might be broken?
Thanks,
--
Julien Thierry
On Wed, Feb 13, 2019 at 10:50:21AM +0000, Julien Thierry wrote:
> On 13/02/2019 10:35, Peter Zijlstra wrote:
> > On Tue, Feb 12, 2019 at 09:15:13AM +0000, Julien Thierry wrote:
> >
> >>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >>>>> index a674c7db..b1bb7e9 100644
> >>>>> --- a/kernel/sched/core.c
> >>>>> +++ b/kernel/sched/core.c
> >>>>> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
> >>>>> __schedule_bug(prev);
> >>>>> preempt_count_set(PREEMPT_DISABLED);
> >>>>> }
> >>>>> +
> >>>>> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
> >>>>> + unlikely(unsafe_user_region_active())) {
> >>>>> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
> >>>>> + prev->comm, prev->pid, preempt_count());
> >>>>> + dump_stack();
> >>>>> + }
> >>>>> +
> >>>>> rcu_sleep_check();
> >>>>>
> >>>>> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> >
> >> I guess I'll drop the might_resched() part of this patch if that sounds
> >> alright.
> >
> > I'm still confused by the schedule_debug() part. How is that not broken?
>
> Hmmm, I am not exactly sure which part you expect to be broken, I guess
> it's because of the nature of the uaccess unsafe accessor usage.
>
> Basically, the following is a definite no:
> if (user_access_begin(ptr, size)) {
>
> [...]
>
> //something that calls schedule
>
> [...]
>
> user_access_end();
> }
>
>
> However the following is fine:
>
> - user_access_begin(ptr, size)
> - taking irq/exception
> - get preempted
This; how is getting preempted fundamentally different from scheduling
ourselves?
> - get resumed at some point in time
> - restore state + eret
> - user_access_end()
>
> That's because exceptions/irq implicitly "suspend" the user access
> region. (That's what I'm trying to clarify with the comment)
> So, unsafe_user_region_active() should return false in a irq/exception
> context.
>
> Is this what you were concerned about? Or there still something that
> might be broken?
I really hate the asymetry introduced between preemptible and being able
to schedule. Both end up calling __schedule() and there really should
not be a difference.
On Wed, Feb 13, 2019 at 02:17:20PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 10:50:21AM +0000, Julien Thierry wrote:
> > On 13/02/2019 10:35, Peter Zijlstra wrote:
> > > On Tue, Feb 12, 2019 at 09:15:13AM +0000, Julien Thierry wrote:
> > >
> > >>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > >>>>> index a674c7db..b1bb7e9 100644
> > >>>>> --- a/kernel/sched/core.c
> > >>>>> +++ b/kernel/sched/core.c
> > >>>>> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
> > >>>>> __schedule_bug(prev);
> > >>>>> preempt_count_set(PREEMPT_DISABLED);
> > >>>>> }
> > >>>>> +
> > >>>>> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
> > >>>>> + unlikely(unsafe_user_region_active())) {
> > >>>>> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
> > >>>>> + prev->comm, prev->pid, preempt_count());
> > >>>>> + dump_stack();
> > >>>>> + }
> > >>>>> +
> > >>>>> rcu_sleep_check();
> > >>>>>
> > >>>>> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> > >
> > >> I guess I'll drop the might_resched() part of this patch if that sounds
> > >> alright.
> > >
> > > I'm still confused by the schedule_debug() part. How is that not broken?
> >
> > Hmmm, I am not exactly sure which part you expect to be broken, I guess
> > it's because of the nature of the uaccess unsafe accessor usage.
> >
> > Basically, the following is a definite no:
> > if (user_access_begin(ptr, size)) {
> >
> > [...]
> >
> > //something that calls schedule
> >
> > [...]
> >
> > user_access_end();
> > }
> >
> >
> > However the following is fine:
> >
> > - user_access_begin(ptr, size)
> > - taking irq/exception
> > - get preempted
>
> This; how is getting preempted fundamentally different from scheduling
> ourselves?
This is also the thing that PREEMPT_VOLUNTARY hinges on; it inserts
'random' reschedule points through might_sleep() and cond_resched().
If you're preemptible; you must be able to schedule and vice-versa.
You're breaking that.
> > - get resumed at some point in time
> > - restore state + eret
> > - user_access_end()
> >
> > That's because exceptions/irq implicitly "suspend" the user access
> > region. (That's what I'm trying to clarify with the comment)
> > So, unsafe_user_region_active() should return false in a irq/exception
> > context.
> >
> > Is this what you were concerned about? Or there still something that
> > might be broken?
>
> I really hate the asymetry introduced between preemptible and being able
> to schedule. Both end up calling __schedule() and there really should
> not be a difference.
Hi Peter,
On Wed, Feb 13, 2019 at 02:17:20PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 10:50:21AM +0000, Julien Thierry wrote:
> > On 13/02/2019 10:35, Peter Zijlstra wrote:
> > > On Tue, Feb 12, 2019 at 09:15:13AM +0000, Julien Thierry wrote:
> > >
> > >>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > >>>>> index a674c7db..b1bb7e9 100644
> > >>>>> --- a/kernel/sched/core.c
> > >>>>> +++ b/kernel/sched/core.c
> > >>>>> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
> > >>>>> __schedule_bug(prev);
> > >>>>> preempt_count_set(PREEMPT_DISABLED);
> > >>>>> }
> > >>>>> +
> > >>>>> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
> > >>>>> + unlikely(unsafe_user_region_active())) {
> > >>>>> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
> > >>>>> + prev->comm, prev->pid, preempt_count());
> > >>>>> + dump_stack();
> > >>>>> + }
> > >>>>> +
> > >>>>> rcu_sleep_check();
> > >>>>>
> > >>>>> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> > >
> > >> I guess I'll drop the might_resched() part of this patch if that sounds
> > >> alright.
> > >
> > > I'm still confused by the schedule_debug() part. How is that not broken?
> >
> > Hmmm, I am not exactly sure which part you expect to be broken, I guess
> > it's because of the nature of the uaccess unsafe accessor usage.
> >
> > Basically, the following is a definite no:
> > if (user_access_begin(ptr, size)) {
> >
> > [...]
> >
> > //something that calls schedule
> >
> > [...]
> >
> > user_access_end();
> > }
> >
> >
> > However the following is fine:
> >
> > - user_access_begin(ptr, size)
> > - taking irq/exception
> > - get preempted
>
> This; how is getting preempted fundamentally different from scheduling
> ourselves?
The difference is because getting preempted in the sequence above is
triggered off the back of an interrupt. On arm64, and I think also on x86,
the user access state (SMAP or PAN) is saved and restored across exceptions
but not across context switch. Consequently, taking an irq in a
user_access_{begin,end} section and then scheduling is fine, but calling
schedule directly within such a section is not.
Julien -- please yell if I've missed some crucial detail, but I think that's
the gist of what we're trying to describe here.
Will
On 13/02/2019 14:00, Will Deacon wrote:
> Hi Peter,
>
> On Wed, Feb 13, 2019 at 02:17:20PM +0100, Peter Zijlstra wrote:
>> On Wed, Feb 13, 2019 at 10:50:21AM +0000, Julien Thierry wrote:
>>> On 13/02/2019 10:35, Peter Zijlstra wrote:
>>>> On Tue, Feb 12, 2019 at 09:15:13AM +0000, Julien Thierry wrote:
>>>>
>>>>>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>>>>>>> index a674c7db..b1bb7e9 100644
>>>>>>>> --- a/kernel/sched/core.c
>>>>>>>> +++ b/kernel/sched/core.c
>>>>>>>> @@ -3289,6 +3289,14 @@ static inline void schedule_debug(struct task_struct *prev)
>>>>>>>> __schedule_bug(prev);
>>>>>>>> preempt_count_set(PREEMPT_DISABLED);
>>>>>>>> }
>>>>>>>> +
>>>>>>>> + if (IS_ENABLED(CONFIG_DEBUG_UACCESS_SLEEP) &&
>>>>>>>> + unlikely(unsafe_user_region_active())) {
>>>>>>>> + printk(KERN_ERR "BUG: scheduling while user_access enabled: %s/%d/0x%08x\n",
>>>>>>>> + prev->comm, prev->pid, preempt_count());
>>>>>>>> + dump_stack();
>>>>>>>> + }
>>>>>>>> +
>>>>>>>> rcu_sleep_check();
>>>>>>>>
>>>>>>>> profile_hit(SCHED_PROFILING, __builtin_return_address(0));
>>>>
>>>>> I guess I'll drop the might_resched() part of this patch if that sounds
>>>>> alright.
>>>>
>>>> I'm still confused by the schedule_debug() part. How is that not broken?
>>>
>>> Hmmm, I am not exactly sure which part you expect to be broken, I guess
>>> it's because of the nature of the uaccess unsafe accessor usage.
>>>
>>> Basically, the following is a definite no:
>>> if (user_access_begin(ptr, size)) {
>>>
>>> [...]
>>>
>>> //something that calls schedule
>>>
>>> [...]
>>>
>>> user_access_end();
>>> }
>>>
>>>
>>> However the following is fine:
>>>
>>> - user_access_begin(ptr, size)
>>> - taking irq/exception
>>> - get preempted
>>
>> This; how is getting preempted fundamentally different from scheduling
>> ourselves?
>
> The difference is because getting preempted in the sequence above is
> triggered off the back of an interrupt. On arm64, and I think also on x86,
> the user access state (SMAP or PAN) is saved and restored across exceptions
> but not across context switch. Consequently, taking an irq in a
> user_access_{begin,end} section and then scheduling is fine, but calling
> schedule directly within such a section is not.
>
> Julien -- please yell if I've missed some crucial detail, but I think that's
> the gist of what we're trying to describe here.
>
Yes, this summarizes things correctly. Thanks!
I should also stress that this limitation already exists for x86
(and is in the arm64 patches picked by Catalin for 5.1), as was
discussed here:
https://lkml.org/lkml/2018/11/23/430
So this patch is not introducing new semantics, it is only making
existing ones explicit.
If the current state is not good, we need to re-discuss the semantics of
user_access regions.
Thanks,
--
Julien Thierry
On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
> > This; how is getting preempted fundamentally different from scheduling
> > ourselves?
>
> The difference is because getting preempted in the sequence above is
> triggered off the back of an interrupt. On arm64, and I think also on x86,
> the user access state (SMAP or PAN) is saved and restored across exceptions
> but not across context switch. Consequently, taking an irq in a
> user_access_{begin,end} section and then scheduling is fine, but calling
> schedule directly within such a section is not.
So how's this then:
if (user_access_begin()) {
preempt_disable();
<IRQ>
set_need_resched();
</IRQ no preempt>
preempt_enable()
__schedule();
user_access_end();
}
That _should_ work just fine but explodes with the proposed nonsense.
On 13/02/2019 14:17, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
>>> This; how is getting preempted fundamentally different from scheduling
>>> ourselves?
>>
>> The difference is because getting preempted in the sequence above is
>> triggered off the back of an interrupt. On arm64, and I think also on x86,
>> the user access state (SMAP or PAN) is saved and restored across exceptions
>> but not across context switch. Consequently, taking an irq in a
>> user_access_{begin,end} section and then scheduling is fine, but calling
>> schedule directly within such a section is not.
>
> So how's this then:
>
> if (user_access_begin()) {
>
> preempt_disable();
>
> <IRQ>
> set_need_resched();
> </IRQ no preempt>
>
> preempt_enable()
> __schedule();
>
> user_access_end();
> }
>
> That _should_ work just fine but explodes with the proposed nonsense.
AFAICT, this does not work properly because when you schedule out this
task, you won't be saving the EFLAGS.AF/PSTATE.PAN bit on the stack, and
next time you schedule the task back in, it might no longer have the
correct flag value (so an unsafe_get/put_user() will fail even though
you haven't reached user_access_end()).
One solution is to deal with this in task switching code, but so far
I've been told that calling schedule() in such a context is not expected
to be supported.
Cheers,
--
Julien Thierry
On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
> The difference is because getting preempted in the sequence above is
> triggered off the back of an interrupt. On arm64, and I think also on x86,
> the user access state (SMAP or PAN) is saved and restored across exceptions
> but not across context switch.
A quick reading of the SDM seems to suggest the SMAP state is part of
EFLAGS, which is context switched just fine AFAIK.
SMAP {ab,re}uses the EFLAGS.AC bit.
> Consequently, taking an irq in a
> user_access_{begin,end} section and then scheduling is fine, but calling
> schedule directly within such a section is not.
On Wed, Feb 13, 2019 at 02:24:34PM +0000, Julien Thierry wrote:
> On 13/02/2019 14:17, Peter Zijlstra wrote:
> > On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
> >>> This; how is getting preempted fundamentally different from scheduling
> >>> ourselves?
> >>
> >> The difference is because getting preempted in the sequence above is
> >> triggered off the back of an interrupt. On arm64, and I think also on x86,
> >> the user access state (SMAP or PAN) is saved and restored across exceptions
> >> but not across context switch. Consequently, taking an irq in a
> >> user_access_{begin,end} section and then scheduling is fine, but calling
> >> schedule directly within such a section is not.
> >
> > So how's this then:
> >
> > if (user_access_begin()) {
> >
> > preempt_disable();
> >
> > <IRQ>
> > set_need_resched();
> > </IRQ no preempt>
> >
> > preempt_enable()
> > __schedule();
> >
> > user_access_end();
> > }
> >
> > That _should_ work just fine but explodes with the proposed nonsense.
>
> AFAICT, This does not work properly because when you schedule out this
> task, you won't be saving the EFLAGS.AF/PSTATE.PAN bit on the stack, and
EFLAGS.AC, but yes.
> next time you schedule the task back in, it might no longer have the
> correct flag value (so an unsafe_get/put_user() will fail even though
> you haven't reached user_access_end()).
/me looks at __switch_to_asm() and there is indeed a distinct lack of
pushing and popping EFLAGS :/
> One solution is to deal with this in task switching code, but so far
> I've been told that calling schedule() in such a context is not expected
> to be supported.
Well, per the above it breaks the preemption model. And I hates that.
And the added WARN doesn't even begin to cover it, since you'd have to
actually hit the preempt_enable() reschedule for it to trigger.
So far, all 6 in-tree users are indeed free of dodgy code, but *groan*.
Hi Peter,
On 13/02/2019 14:25, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
>> The difference is because getting preempted in the sequence above is
>> triggered off the back of an interrupt. On arm64, and I think also on x86,
>> the user access state (SMAP or PAN) is saved and restored across exceptions
>> but not across context switch.
>
> A quick reading of the SDM seems to suggest the SMAP state is part of
> EFLAGS, which is context switched just fine AFAIK.
>
I fail to see where this is happening when looking at the switch_to()
logic in x86_64.
And Peter A. didn't seem to suggest that this transfer of the eflags was
happening without them being saved on the stack through exception handling.
What am I missing?
Thanks,
--
Julien Thierry
On Wed, Feb 13, 2019 at 02:39:22PM +0000, Julien Thierry wrote:
> Hi Peter,
>
> On 13/02/2019 14:25, Peter Zijlstra wrote:
> > On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
> >> The difference is because getting preempted in the sequence above is
> >> triggered off the back of an interrupt. On arm64, and I think also on x86,
> >> the user access state (SMAP or PAN) is saved and restored across exceptions
> >> but not across context switch.
> >
> > A quick reading of the SDM seems to suggest the SMAP state is part of
> > EFLAGS, which is context switched just fine AFAIK.
> >
> I fail to see where this is happening when looking at the switch_to()
> logic in x86_64.
Yeah, me too.. we obviously preserve EFLAGS for user context, but for
kernel-kernel switches we do not seem to preserve it :-(
On Wed, Feb 13, 2019 at 03:40:00PM +0100, Peter Zijlstra wrote:
> So far, all 6 in-tree users are indeed free of dodgy code, but *groan*.
because of this, there must also not be tracepoints (even implicit ones
like function-trace) between user_access_{begin,end}().
And while that is unlikely with the current code; it is not guaranteed
afaict.
What a ff'ing mess.
On Wed, Feb 13, 2019 at 03:41:45PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 02:39:22PM +0000, Julien Thierry wrote:
> > Hi Peter,
> >
> > On 13/02/2019 14:25, Peter Zijlstra wrote:
> > > On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
> > >> The difference is because getting preempted in the sequence above is
> > >> triggered off the back of an interrupt. On arm64, and I think also on x86,
> > >> the user access state (SMAP or PAN) is saved and restored across exceptions
> > >> but not across context switch.
> > >
> > > A quick reading of the SDM seems to suggest the SMAP state is part of
> > > EFLAGS, which is context switched just fine AFAIK.
> > >
> > I fail to see where this is happening when looking at the switch_to()
> > logic in x86_64.
>
> Yeah, me too.. we obviously preserve EFLAGS for user context, but for
> kernel-kernel switches we do not seem to preserve it :-(
So I dug around the context switch code a little, and I think we lost it
here:
0100301bfdf5 ("sched/x86: Rewrite the switch_to() code")
Before that, x86_64 switch_to() read like (much simplified):
asm volatile ( /* do RSP twiddle */
: /* output */
: /* input */
: "memory", "cc", .... "flags");
(see __EXTRA_CLOBBER)
Which I suppose means that GCC generates the PUSHF/POPF to preserve the
EFLAGS, since we mark those explicitly clobbered.
Before that:
f05e798ad4c0 ("Disintegrate asm/system.h for X86")
We had explicit PUSHF / POPF in SAVE_CONTEXT / RESTORE_CONTEXT resp.
Now I cannot see how the current code preserves EFLAGS (if indeed it
does), and the changelog doesn't mention this change _AT_ALL_.
For a little bit of context; it turns out that user_access_begin() /
user_access_end() sets EFLAGS.AC and scheduling in between there wrecks
that because we're apparently not saving that anymore.
Now, I'm tempted to add the PUSHF / POPF right back because of this, but
first I suppose we need to figure out if that change was on purpose and
why that went missing from the Changelog.
On Wed, Feb 13, 2019 at 04:45:32PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 03:41:45PM +0100, Peter Zijlstra wrote:
> > On Wed, Feb 13, 2019 at 02:39:22PM +0000, Julien Thierry wrote:
> > > Hi Peter,
> > >
> > > On 13/02/2019 14:25, Peter Zijlstra wrote:
> > > > On Wed, Feb 13, 2019 at 02:00:26PM +0000, Will Deacon wrote:
> > > >> The difference is because getting preempted in the sequence above is
> > > >> triggered off the back of an interrupt. On arm64, and I think also on x86,
> > > >> the user access state (SMAP or PAN) is saved and restored across exceptions
> > > >> but not across context switch.
> > > >
> > > > A quick reading of the SDM seems to suggest the SMAP state is part of
> > > > EFLAGS, which is context switched just fine AFAIK.
> > > >
> > > I fail to see where this is happening when looking at the switch_to()
> > > logic in x86_64.
> >
> > Yeah, me too.. we obviously preserve EFLAGS for user context, but for
> > kernel-kernel switches we do not seem to preserve it :-(
>
> So I dug around the context switch code a little, and I think we lost it
> here:
>
> 0100301bfdf5 ("sched/x86: Rewrite the switch_to() code")
>
> Before that, x86_64 switch_to() read like (much simplified):
>
> asm volatile ( /* do RSP twiddle */
> : /* output */
> : /* input */
> : "memory", "cc", .... "flags");
>
> (see __EXTRA_CLOBBER)
>
> Which I suppose means that GCC generates the PUSHF/POPF to preserve the
> EFLAGS, since we mark those explicitly clobbered.
>
> Before that:
>
> f05e798ad4c0 ("Disintegrate asm/system.h for X86")
>
> We had explicit PUSHF / POPF in SAVE_CONTEXT / RESTORE_CONTEXT resp.
>
> Now I cannot see how the current code preserves EFLAGS (if indeed it
> does), and the changelog doesn't mention this change _AT_ALL_.
>
>
> For a little bit of context; it turns out that user_access_begin() /
> user_access_end() sets EFLAGS.AC and scheduling in between there wrecks
> that because we're apparently not saving that anymore.
>
> Now, I'm tempted to add the PUSHF / POPF right back because of this, but
> first I suppose we need to figure out if that change was on purpose and
> why that went missing from the Changelog.
Just for giggles; the below patch seems to boot.
---
arch/x86/entry/entry_32.S | 2 ++
arch/x86/entry/entry_64.S | 2 ++
arch/x86/include/asm/switch_to.h | 1 +
3 files changed, 5 insertions(+)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d309f30cf7af..5fc76b755510 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -650,6 +650,7 @@ ENTRY(__switch_to_asm)
pushl %ebx
pushl %edi
pushl %esi
+ pushfl
/* switch stack */
movl %esp, TASK_threadsp(%eax)
@@ -672,6 +673,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfl
popl %esi
popl %edi
popl %ebx
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..4fe27b67d7e2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -291,6 +291,7 @@ ENTRY(__switch_to_asm)
pushq %r13
pushq %r14
pushq %r15
+ pushfq
/* switch stack */
movq %rsp, TASK_threadsp(%rdi)
@@ -313,6 +314,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfq
popq %r15
popq %r14
popq %r13
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 7cf1a270d891..157149d4129c 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
* order of the fields must match the code in __switch_to_asm().
*/
struct inactive_task_frame {
+ unsigned long flags;
#ifdef CONFIG_X86_64
unsigned long r15;
unsigned long r14;
On Wed, Feb 13, 2019 at 10:51:24AM -0800, Andy Lutomirski wrote:
> > On Feb 13, 2019, at 7:45 AM, Peter Zijlstra <[email protected]> wrote:
> > Which I suppose means that GCC generates the PUSHF/POPF to preserve the
> > EFLAGS, since we mark those explicitly clobbered.
> >
>
> Not quite. A flags clobber doesn’t save the control bits like AC
> except on certain rather buggy llvm compilers. The change you’re
> looking for is:
>
> http://git.kernel.org/tip/2c7577a7583747c9b71f26dced7f696b739da745
Indeed, failed to find that.
> > For a little bit of context; it turns out that user_access_begin() /
> > user_access_end() sets EFLAGS.AC and scheduling in between there wrecks
> > that because we're apparently not saving that anymore.
>
> But only explicit scheduling — preemption and sleepy page faults are
> fine because the interrupt frame saves flags.
No, like pointed out elsewhere in this thread, anything that does
preempt_disable() is utterly broken with this.
Because at that point the IRQ return path doesn't reschedule but
preempt_enable() will, and that doesn't preserve EFLAGS again.
> > Now, I'm tempted to add the PUSHF / POPF right back because of this, but
> > first I suppose we need to figure out if that change was on purpose and
> > why that went missing from the Changelog.
>
> That’s certainly the easy solution. Or we could teach the might_sleep
> checks about this, but that could be a mess.
That's not enough, we'd have to teach preempt_disable(), but worse,
preempt_disable_notrace().
Anything that lands in ftrace, which _will_ use
preempt_disable_notrace(), will screw this thing up.
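For illustration, a minimal sketch of the pattern being discussed (the
variable names are illustrative, not from any posted patch). Nothing in
it is exotic, yet preempt_enable() can end up scheduling while EFLAGS.AC
is still set:

static int example_read(unsigned int __user *uptr, unsigned int *val)
{
	int ret = -EFAULT;

	if (!user_access_begin(uptr, sizeof(*uptr)))
		return -EFAULT;		/* STAC: EFLAGS.AC is now set */

	unsafe_get_user(*val, uptr, out);
	ret = 0;

	preempt_disable();
	/* ... some innocent per-cpu bookkeeping ... */
	preempt_enable();	/* may call preempt_schedule(); without
				 * PUSHF/POPF in __switch_to_asm() the AC
				 * bit is neither saved for this task nor
				 * cleared for the next one */
out:
	user_access_end();	/* CLAC */
	return ret;
}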
> On Feb 13, 2019, at 2:21 PM, Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Feb 13, 2019 at 10:51:24AM -0800, Andy Lutomirski wrote:
>>> On Feb 13, 2019, at 7:45 AM, Peter Zijlstra <[email protected]> wrote:
>
>>> Which I suppose means that GCC generates the PUSHF/POPF to preserve the
>>> EFLAGS, since we mark those explicitly clobbered.
>>>
>>
>> Not quite. A flags clobber doesn’t save the control bits like AC
>> except on certain rather buggy llvm compilers. The change you’re
>> looking for is:
>>
>> http://git.kernel.org/tip/2c7577a7583747c9b71f26dced7f696b739da745
>
> Indeed, failed to find that.
>
>>> For a little bit of context; it turns out that user_access_begin() /
>>> user_access_end() sets EFLAGS.AC and scheduling in between there wrecks
>>> that because we're apparently not saving that anymore.
>>
>> But only explicit scheduling — preemption and sleepy page faults are
>> fine because the interrupt frame saves flags.
>
> No, like pointed out elsewhere in this thread, anything that does
> preempt_disable() is utterly broken with this.
>
> Because at that point the IRQ return path doesn't reschedule but
> preempt_enable() will, and that doesn't preserve EFLAGS again.
>
>>> Now, I'm tempted to add the PUSHF / POPF right back because of this, but
>>> first I suppose we need to figure out if that change was on purpose and
>>> why that went missing from the Changelog.
>>
>> That’s certainly the easy solution. Or we could teach the might_sleep
>> checks about this, but that could be a mess.
>
> That's not enough, we'd have to teach preempt_disable(), but worse,
> preempt_disable_notrace().
>
> Anything that lands in ftrace, which _will_ use
> preempt_disable_notrace(), will screw this thing up.
Ugh. Consider your patch acked. Do we need to backport this thing? The problem can’t be too widespread or we would have heard of it before.
On Wed, Feb 13, 2019 at 7:45 AM Peter Zijlstra <[email protected]> wrote:
>
> Before that, x86_64 switch_to() read like (much simplified):
>
> asm volatile ( /* do RSP twiddle */
> : /* output */
> : /* input */
> : "memory", "cc", .... "flags");
>
> (see __EXTRA_CLOBBER)
>
> Which I suppose means that GCC generates the PUSHF/POPF to preserve the
> EFLAGS, since we mark those explicitly clobbered.
No, it only means that gcc won't keep conditionals in the flags over
the asm. It doesn't make gcc save anything.
The push/pop got removed elsewhere as Andy says.
That said, I do agree that it's probably a good idea to save/restore
flags anyway when scheduling. It's not just AC, actually, now that I
look at it again I worry a bit about DF too.
We have the rule that we run with DF clear in the kernel, and all the
kernel entry points do clear it properly (so that memcpy etc don't
need to). But there are a few places that set DF temporarily because
they do something odd (backwards memmove), and those actually have the
*exact* same issue as stac/clac has: it's ok to take a trap or
interrupt, and schedule due to that (because the trap/irq will clear
DF), but it would be a horrible bug to have a synchronous scheduling
point there.
Arguably the DF issue really isn't even remotely likely to actually be
a real issue (the code that sets DF really is very special and should
never do any kind of preemption), but it's conceptually quite
similar..
Of course, if DF is ever set, and we end up calling any C code at all,
I guess it would already be a huge problem. If the C code then does
memcpy or something, it would corrupt things quite badly.
So I guess save/restore isn't going to save us wrt DF. If we get
anywhere close to the scheduler with the DF bit set, we've already
lost.
But I still do kind of prefer saving flags. We have other sticky state
in there too, although none of it matters in the kernel currently (eg
iopl etc - only matters in user space, and user space will always
reload eflags on return).
Linus
On Wed, Feb 13, 2019 at 02:49:47PM -0800, Andy Lutomirski wrote:
> Do we need to backport this thing?
Possibly, just to be safe.
> The problem can’t be too widespread or we would have heard of it before.
Yes, so far we've been lucky.
---
Subject: sched/x86: Save [ER]FLAGS on context switch
From: Peter Zijlstra <[email protected]>
Date: Thu Feb 14 10:30:52 CET 2019
Effectively revert commit:
2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
Specifically because SMAP uses FLAGS.AC which invalidates the claim
that the kernel has clean flags.
In particular; while preemption from interrupt return is fine (the
IRET frame on the exception stack contains FLAGS) it breaks any code
that does synchronous scheduling, including preempt_enable().
This has become a significant issue ever since commit:
5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
provided a means of having 'normal' C code between STAC / CLAC,
exposing the FLAGS.AC state. So far this hasn't led to trouble,
however fix it before it comes apart.
Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
Acked-by: Andy Lutomirski <[email protected]>
Reported-by: Julien Thierry <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/entry/entry_32.S | 2 ++
arch/x86/entry/entry_64.S | 2 ++
arch/x86/include/asm/switch_to.h | 1 +
3 files changed, 5 insertions(+)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -650,6 +650,7 @@ ENTRY(__switch_to_asm)
pushl %ebx
pushl %edi
pushl %esi
+ pushfl
/* switch stack */
movl %esp, TASK_threadsp(%eax)
@@ -672,6 +673,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfl
popl %esi
popl %edi
popl %ebx
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -291,6 +291,7 @@ ENTRY(__switch_to_asm)
pushq %r13
pushq %r14
pushq %r15
+ pushfq
/* switch stack */
movq %rsp, TASK_threadsp(%rdi)
@@ -313,6 +314,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfq
popq %r15
popq %r14
popq %r13
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
* order of the fields must match the code in __switch_to_asm().
*/
struct inactive_task_frame {
+ unsigned long flags;
#ifdef CONFIG_X86_64
unsigned long r15;
unsigned long r14;
On Thu, Feb 14, 2019 at 5:14 AM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Feb 13, 2019 at 02:49:47PM -0800, Andy Lutomirski wrote:
>
> > Do we need to backport this thing?
>
> Possibly, just to be safe.
>
> > The problem can’t be too widespread or we would have heard of it before.
>
> Yes, so far we've been lucky.
>
> ---
> Subject: sched/x86: Save [ER]FLAGS on context switch
> From: Peter Zijlstra <[email protected]>
> Date: Thu Feb 14 10:30:52 CET 2019
>
> Effectively revert commit:
>
> 2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
>
> Specifically because SMAP uses FLAGS.AC which invalidates the claim
> that the kernel has clean flags.
>
> In particular; while preemption from interrupt return is fine (the
> IRET frame on the exception stack contains FLAGS) it breaks any code
> that does synchronous scheduling, including preempt_enable().
>
> This has become a significant issue ever since commit:
>
> 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
>
> provided a means of having 'normal' C code between STAC / CLAC,
> exposing the FLAGS.AC state. So far this hasn't led to trouble,
> however fix it before it comes apart.
>
> Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
> Acked-by: Andy Lutomirski <[email protected]>
> Reported-by: Julien Thierry <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/x86/entry/entry_32.S | 2 ++
> arch/x86/entry/entry_64.S | 2 ++
> arch/x86/include/asm/switch_to.h | 1 +
> 3 files changed, 5 insertions(+)
>
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -650,6 +650,7 @@ ENTRY(__switch_to_asm)
> pushl %ebx
> pushl %edi
> pushl %esi
> + pushfl
>
> /* switch stack */
> movl %esp, TASK_threadsp(%eax)
> @@ -672,6 +673,7 @@ ENTRY(__switch_to_asm)
> #endif
>
> /* restore callee-saved registers */
> + popfl
> popl %esi
> popl %edi
> popl %ebx
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -291,6 +291,7 @@ ENTRY(__switch_to_asm)
> pushq %r13
> pushq %r14
> pushq %r15
> + pushfq
>
> /* switch stack */
> movq %rsp, TASK_threadsp(%rdi)
> @@ -313,6 +314,7 @@ ENTRY(__switch_to_asm)
> #endif
>
> /* restore callee-saved registers */
> + popfq
> popq %r15
> popq %r14
> popq %r13
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
> * order of the fields must match the code in __switch_to_asm().
> */
> struct inactive_task_frame {
> + unsigned long flags;
> #ifdef CONFIG_X86_64
> unsigned long r15;
> unsigned long r14;
flags should be initialized in copy_thread_tls(). I think the new
stack is zeroed already, but it would be better to explicitly set it.
--
Brian Gerst
On Thu, Feb 14, 2019 at 11:18:01AM -0500, Brian Gerst wrote:
> > --- a/arch/x86/include/asm/switch_to.h
> > +++ b/arch/x86/include/asm/switch_to.h
> > @@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
> > * order of the fields must match the code in __switch_to_asm().
> > */
> > struct inactive_task_frame {
> > + unsigned long flags;
> > #ifdef CONFIG_X86_64
> > unsigned long r15;
> > unsigned long r14;
>
> flags should be initialized in copy_thread_tls(). I think the new
> stack is zeroed already, but it would be better to explicitly set it.
Ah indeed. I somehow misread that code and thought we got initialized
with a copy of current.
Something like the below, right?
---
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -127,6 +127,7 @@ int copy_thread_tls(unsigned long clone_
struct task_struct *tsk;
int err;
+ frame->flags = 0;
frame->bp = 0;
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
+ frame->flags = 0;
frame->bp = 0;
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
On Thu, Feb 14, 2019 at 2:34 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Feb 14, 2019 at 11:18:01AM -0500, Brian Gerst wrote:
>
> > > --- a/arch/x86/include/asm/switch_to.h
> > > +++ b/arch/x86/include/asm/switch_to.h
> > > @@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
> > > * order of the fields must match the code in __switch_to_asm().
> > > */
> > > struct inactive_task_frame {
> > > + unsigned long flags;
> > > #ifdef CONFIG_X86_64
> > > unsigned long r15;
> > > unsigned long r14;
> >
> > flags should be initialized in copy_thread_tls(). I think the new
> > stack is zeroed already, but it would be better to explicitly set it.
>
> Ah indeed. I somehow misread that code and thought we got initialized
> with a copy of current.
>
> Something like the below, right?
>
> ---
>
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -127,6 +127,7 @@ int copy_thread_tls(unsigned long clone_
> struct task_struct *tsk;
> int err;
>
> + frame->flags = 0;
> frame->bp = 0;
> frame->ret_addr = (unsigned long) ret_from_fork;
> p->thread.sp = (unsigned long) fork_frame;
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_
> childregs = task_pt_regs(p);
> fork_frame = container_of(childregs, struct fork_frame, regs);
> frame = &fork_frame->frame;
> + frame->flags = 0;
> frame->bp = 0;
> frame->ret_addr = (unsigned long) ret_from_fork;
> p->thread.sp = (unsigned long) fork_frame;
Yes, this looks good.
--
Brian Gerst
On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <[email protected]> wrote:
>
> Something like the below, right?
>
> + frame->flags = 0;
> + frame->flags = 0;
Those are not valid flag values.
Can you popf them? Yes.
Do they make sense? No.
It has the IF flag clear, for example. Is that intentional? If it is,
it should likely be documented. Because IF clear means "interrupts
disabled". Are the places that load flags in irq disabled regions?
Maybe they are, maybe they aren't, but shouldn't this be something
that is made explicit, rather than "oh, let's initialize to zero like
all the other registers we don't care about".
Because a zero initializer for eflags really is odd.
Linus
On Fri, Feb 15, 2019 at 09:18:00AM -0800, Linus Torvalds wrote:
> On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <[email protected]> wrote:
> >
> > Something like the below, right?
> >
> > + frame->flags = 0;
> > + frame->flags = 0;
>
> Those are not valid flag values.
>
> Can you popf them? Yes.
>
> Do they make sense? No.
>
> It has the IF flag clear, for example. Is that intentional? If it is,
Uhmm. yeah, that's bonkers. We should have interrupts disabled here.
I'll go read up on the eflags and figure out what they _should_ be right
about there.
> On Feb 15, 2019, at 9:40 AM, Peter Zijlstra <[email protected]> wrote:
>
>> On Fri, Feb 15, 2019 at 09:18:00AM -0800, Linus Torvalds wrote:
>>> On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <[email protected]> wrote:
>>>
>>> Something like the below, right?
>>>
>>> + frame->flags = 0;
>>> + frame->flags = 0;
>>
>> Those are not valid flag values.
>>
>> Can you popf them? Yes.
>>
>> Do they make sense? No.
>>
>> It has the IF flag clear, for example. Is that intentional? If it is,
>
> Uhmm. yeah, that's bonkers. We should have interrupts disabled here.
> I'll go read up on the eflags and figure out what they _should_ be right
> about there.
And probably add a comment near the POPF explaining that it will keep IRQs off :)
On Fri, Feb 15, 2019 at 06:40:34PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 15, 2019 at 09:18:00AM -0800, Linus Torvalds wrote:
> > On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <[email protected]> wrote:
> > >
> > > Something like the below, right?
> > >
> > > + frame->flags = 0;
> > > + frame->flags = 0;
> >
> > Those are not valid flag values.
> >
> > Can you popf them? Yes.
> >
> > Do they make sense? No.
> >
> > It has the IF flag clear, for example. Is that intentional? If it is,
>
> Uhmm. yeah, that's bonkers. We should have interrupts disabled here.
> I'll go read up on the eflags and figure out what they _should_ be right
> about there.
I misread (I'm forever confused about what way around IF goes), but you
said it right; IF=0 is interrupts disabled and we very much have that in
the middle of context switch.
(just for giggles I set IF for the initial flags value; and it comes
unstuck _real_ quick)
Now, EFLAGS bit 1 is supposedly always 1, but it really doesn't seem to
matter for POPF.
I went through the other flags, and aside from VIP/VIF (I've no clue),
they look like 0 should be just fine.
On Fri, Feb 15, 2019 at 3:34 PM Peter Zijlstra <[email protected]> wrote:
>
> Now, EFLAGS bit 1 is supposedly always 1, but it really doesn't seem to
> matter for POPF.
Correct, it's "read as 1", you can try to write it and it doesn't matter.
> I went through the other flags, and aside from VIP/VIF (I've no clue),
> they look like 0 should be just fine.
So 0 is a perfectly valid initializer in the sense that it _works_, I
just want it to be something that was thought about, not just a random
"initialize to zero" without thinking.
Even just a comment about it would be fine. But it might also be good
to show that it's an explicit eflags value and just use
X86_EFLAGS_FIXED as the initializer.
Linus
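For reference, X86_EFLAGS_FIXED is the always-read-as-one bit 1 of
EFLAGS, defined in arch/x86/include/uapi/asm/processor-flags.h along
these lines:

#define X86_EFLAGS_FIXED_BIT	1	/* Bit 1 - always on */
#define X86_EFLAGS_FIXED	_BITUL(X86_EFLAGS_FIXED_BIT)

Using it as the initializer gives the new task a flags value of 0x2:
all status flags clear, DF clear, and IF clear, which matches the fact
that the context switch to the new task happens with interrupts
disabled.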
On February 14, 2019 2:14:29 AM PST, Peter Zijlstra <[email protected]> wrote:
>On Wed, Feb 13, 2019 at 02:49:47PM -0800, Andy Lutomirski wrote:
>
>> Do we need to backport this thing?
>
>Possibly, just to be safe.
>
>> The problem can’t be too widespread or we would have heard of it
>before.
>
>Yes, so far we've been lucky.
>
>---
>Subject: sched/x86: Save [ER]FLAGS on context switch
>From: Peter Zijlstra <[email protected]>
>Date: Thu Feb 14 10:30:52 CET 2019
>
>Effectively revert commit:
>
> 2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
>
>Specifically because SMAP uses FLAGS.AC which invalidates the claim
>that the kernel has clean flags.
>
>In particular; while preemption from interrupt return is fine (the
>IRET frame on the exception stack contains FLAGS) it breaks any code
>that does synchronous scheduling, including preempt_enable().
>
>This has become a significant issue ever since commit:
>
>5b24a7a2aa20 ("Add 'unsafe' user access functions for batched
>accesses")
>
>provided a means of having 'normal' C code between STAC / CLAC,
>exposing the FLAGS.AC state. So far this hasn't led to trouble,
>however fix it before it comes apart.
>
>Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched
>accesses")
>Acked-by: Andy Lutomirski <[email protected]>
>Reported-by: Julien Thierry <[email protected]>
>Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>---
> arch/x86/entry/entry_32.S | 2 ++
> arch/x86/entry/entry_64.S | 2 ++
> arch/x86/include/asm/switch_to.h | 1 +
> 3 files changed, 5 insertions(+)
>
>--- a/arch/x86/entry/entry_32.S
>+++ b/arch/x86/entry/entry_32.S
>@@ -650,6 +650,7 @@ ENTRY(__switch_to_asm)
> pushl %ebx
> pushl %edi
> pushl %esi
>+ pushfl
>
> /* switch stack */
> movl %esp, TASK_threadsp(%eax)
>@@ -672,6 +673,7 @@ ENTRY(__switch_to_asm)
> #endif
>
> /* restore callee-saved registers */
>+ popfl
> popl %esi
> popl %edi
> popl %ebx
>--- a/arch/x86/entry/entry_64.S
>+++ b/arch/x86/entry/entry_64.S
>@@ -291,6 +291,7 @@ ENTRY(__switch_to_asm)
> pushq %r13
> pushq %r14
> pushq %r15
>+ pushfq
>
> /* switch stack */
> movq %rsp, TASK_threadsp(%rdi)
>@@ -313,6 +314,7 @@ ENTRY(__switch_to_asm)
> #endif
>
> /* restore callee-saved registers */
>+ popfq
> popq %r15
> popq %r14
> popq %r13
>--- a/arch/x86/include/asm/switch_to.h
>+++ b/arch/x86/include/asm/switch_to.h
>@@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
> * order of the fields must match the code in __switch_to_asm().
> */
> struct inactive_task_frame {
>+ unsigned long flags;
> #ifdef CONFIG_X86_64
> unsigned long r15;
> unsigned long r14;
This implies we invoke schedule -- a restricted operation (consider may_sleep) during execution of STAC-enabled code, but *not* as an exception or interrupt, since those preserve the flags.
I have serious concerns about this. This is more or less saying that we have left an unlimited gap and have had AC escape.
Does *anyone* see a need to allow this? I got a question at LPC from someone about this, and what they were trying to do once all the layers had been unwound was so far down the wrong track for a root problem that actually has a very simple solution.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
On Fri, Feb 15, 2019 at 08:06:56PM -0800, [email protected] wrote:
> This implies we invoke schedule -- a restricted operation (consider
> may_sleep) during execution of STAC-enabled code, but *not* as an
> exception or interrupt, since those preserve the flags.
Meet preempt_enable().
> I have serious concerns about this. This is more or less saying that
> we have left an unlimited gap and have had AC escape.
Yes; by allowing normal C in between STAC and CLAC this is so.
> Does *anyone* see a need to allow this? I got a question at LPC from
> someone about this, and what they were trying to do once all the
> layers had been unwound was so far down the wrong track for a root
> problem that actually has a very simple solution.
Have you read the rest of the thread?
All it takes for this to explode is a call to a C function that has
tracing on it in between the user_access_begin() and user_access_end()
things. That is a _very_ easy thing to do.
Heck, GCC is allowed to generate that broken code compiling
lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
with CONFIG_OPTIMIZE_INLINING), and making that a function call would
get us fentry hooks and ftrace and *BOOM*.
(Now, of course, its a static function with a single caller, and GCC
isn't _THAT_ stupid, but it could, if it wanted to)
Since the bar is _that_ low for mistakes, I figure we'd better fix it.
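To make the failure mode concrete, a rough sketch (function names are
illustrative; 'noinline' stands in for "the compiler chose not to
inline this"). The out-of-lined helper picks up an __fentry__ call, so
with function tracing armed the tracer, and its
preempt_disable_notrace()/preempt_enable_notrace() pair, runs while AC
is still set:

static noinline long strnlen_helper(const char __user *src, unsigned long n)
{
	long res = 0;
	char c;

	/* __fentry__ fires here first: tracing code runs with AC set */
	while (n--) {
		unsafe_get_user(c, src + res, efault);
		if (!c)
			return res;
		res++;
	}
	return res;
efault:
	return -EFAULT;
}

long example_strnlen(const char __user *src, unsigned long n)
{
	long ret;

	if (!user_access_begin(src, n))
		return -EFAULT;
	ret = strnlen_helper(src, n);	/* CALL while AC is set */
	user_access_end();
	return ret;
}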
On Fri, Feb 15, 2019 at 04:21:46PM -0800, Linus Torvalds wrote:
> Even just a comment about it would be fine. But it might also be good
> to show that it's an explicit eflags value and just use
> X86_EFLAGS_FIXED as the initializer.
That is in fact what I have now; I'll repost on Monday or so.
Subject: sched/x86: Save [ER]FLAGS on context switch
From: Peter Zijlstra <[email protected]>
Date: Thu Feb 14 10:30:52 CET 2019
Effectively reverts commit:
2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
Specifically because SMAP uses FLAGS.AC which invalidates the claim
that the kernel has clean flags.
In particular; while preemption from interrupt return is fine (the
IRET frame on the exception stack contains FLAGS) it breaks any code
that does synchronous scheduling, including preempt_enable().
This has become a significant issue ever since commit:
5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
provided a means of having 'normal' C code between STAC / CLAC,
exposing the FLAGS.AC state. So far this hasn't led to trouble,
however fix it before it comes apart.
Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
Acked-by: Andy Lutomirski <[email protected]>
Reported-by: Julien Thierry <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/entry/entry_32.S | 2 ++
arch/x86/entry/entry_64.S | 2 ++
arch/x86/include/asm/switch_to.h | 1 +
arch/x86/kernel/process_32.c | 7 +++++++
arch/x86/kernel/process_64.c | 8 ++++++++
5 files changed, 20 insertions(+)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -650,6 +650,7 @@ ENTRY(__switch_to_asm)
pushl %ebx
pushl %edi
pushl %esi
+ pushfl
/* switch stack */
movl %esp, TASK_threadsp(%eax)
@@ -672,6 +673,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfl
popl %esi
popl %edi
popl %ebx
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -291,6 +291,7 @@ ENTRY(__switch_to_asm)
pushq %r13
pushq %r14
pushq %r15
+ pushfq
/* switch stack */
movq %rsp, TASK_threadsp(%rdi)
@@ -313,6 +314,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfq
popq %r15
popq %r14
popq %r13
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
* order of the fields must match the code in __switch_to_asm().
*/
struct inactive_task_frame {
+ unsigned long flags;
#ifdef CONFIG_X86_64
unsigned long r15;
unsigned long r14;
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -127,6 +127,13 @@ int copy_thread_tls(unsigned long clone_
struct task_struct *tsk;
int err;
+ /*
+ * For a new task use the RESET flags value since there is no before.
+ * All the status flags are zero; DF and all the system flags must also
+ * be 0, specifically IF must be 0 because we context switch to the new
+ * task with interrupts disabled.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
frame->bp = 0;
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -392,6 +392,14 @@ int copy_thread_tls(unsigned long clone_
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
+
+ /*
+ * For a new task use the RESET flags value since there is no before.
+ * All the status flags are zero; DF and all the system flags must also
+ * be 0, specifically IF must be 0 because we context switch to the new
+ * task with interrupts disabled.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
frame->bp = 0;
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
On 2/16/19 2:30 AM, Peter Zijlstra wrote:
> On Fri, Feb 15, 2019 at 08:06:56PM -0800, [email protected] wrote:
>> This implies we invoke schedule -- a restricted operation (consider
>> may_sleep) during execution of STAC-enabled code, but *not* as an
>> exception or interrupt, since those preserve the flags.
>
> Meet preempt_enable().
I believe this falls under "doctor, it hurts when I do that." And it hurts for
very good reasons. See below.
>> I have serious concerns about this. This is more or less saying that
>> we have left an unlimited gap and have had AC escape.
>
> Yes; by allowing normal C in between STAC and CLAC this is so.
>
>> Does *anyone* see a need to allow this? I got a question at LPC from
>> someone about this, and what they were trying to do once all the
>> layers had been unwound was so far down the wrong track for a root
>> problem that actually has a very simple solution.
>
> Have you read the rest of the thread?
>
> All it takes for this to explode is a call to a C function that has
> tracing on it in between the user_access_begin() and user_access_end()
> things. That is a _very_ easy thing to do.
>
> Heck, GCC is allowed to generate that broken code compiling
> lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
> with CONFIG_OPTIMIZE_INLINING), and making that a function call would
> get us fentry hooks and ftrace and *BOOM*.
>
> (Now, of course, its a static function with a single caller, and GCC
> isn't _THAT_ stupid, but it could, if it wanted to)
>
> Since the bar is _that_ low for mistakes, I figure we'd better fix it.
>
The question is what "fix it" means. I'm really concerned about AC escapes,
and everyone else should be, too.
For an issue specific to tracing, it would be more appropriate to do the
appropriate things in the tracing entry/exit than in schedule. Otherwise, we
don't want to silently paper over mistakes which could mean that we run a
large portion of the kernel without protection we thought we had.
In that sense, calling schedule() with AC set is in fact a great place to have
a WARN() or even BUG(), because such an event really could imply that the
kernel has been security compromised.
Does that make more sense?
-hpa
On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <[email protected]> wrote:
>
> The question is what "fix it" means. I'm really concerned about AC escapes,
> and everyone else should be, too.
I do think that it might be the right thing to do to add some kind of
WARN_ON_ONCE() for AC being set in various can-reschedule situations.
We'd just have to abstract it sanely. I'm sure arm64 has the exact
same issue with PAN - maybe it saves properly, but the same "we
wouldn't want to go through the scheduler with PAN clear".
On x86, we might as well check DF at the same time as AC.
Linus
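A rough sketch, not taken from any posted patch, of the kind of
x86-side assertion being suggested here; native_save_fl() reads EFLAGS
and X86_EFLAGS_AC / X86_EFLAGS_DF are the existing flag masks:

/* Sketch only: warn if we reach a reschedule point with AC or DF set */
static inline void debug_check_flags_on_resched(void)
{
	unsigned long flags = native_save_fl();

	WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF));
}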
> On Feb 18, 2019, at 4:24 PM, Linus Torvalds <[email protected]> wrote:
>
>> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <[email protected]> wrote:
>>
>> The question is what "fix it" means. I'm really concerned about AC escapes,
>> and everyone else should be, too.
>
> I do think that it might be the right thing to do to add some kind of
> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
>
> We'd just have to abstract it sanely. I'm sure arm64 has the exact
> same issue with PAN - maybe it saves properly, but the same "we
> wouldn't want to go through the scheduler with PAN clear".
>
> On x86, we might as well check DF at the same time as AC.
>
>
hpa is right, though — calling into tracing code with AC set is not really so good. And calling schedule() (via preempt_enable() or whatever) is also bad because it runs all the scheduler code with AC on. Admittedly, the scheduler is not *that* interesting of an attack surface.
On 2/18/19 6:20 PM, Andy Lutomirski wrote:
>
>
>> On Feb 18, 2019, at 4:24 PM, Linus Torvalds <[email protected]> wrote:
>>
>>> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <[email protected]> wrote:
>>>
>>> The question is what "fix it" means. I'm really concerned about AC escapes,
>>> and everyone else should be, too.
>>
>> I do think that it might be the right thing to do to add some kind of
>> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
>>
>> We'd just have to abstract it sanely. I'm sure arm64 has the exact
>> same issue with PAN - maybe it saves properly, but the same "we
>> wouldn't want to go through the scheduler with PAN clear".
>>
>> On x86, we might as well check DF at the same time as AC.
>>
>
> hpa is right, though — calling into tracing code with AC set is not really so good. And calling schedule() (via preempt_enable() or whatever) is also bad because it runs all the scheduler code with AC on. Admittedly, the scheduler is not *that* interesting of an attack surface.
>
Not just that, but the other question is just how much code we are running
with AC open. It really should only be done in some very small regions.
-hpa
On 19/02/2019 00:24, Linus Torvalds wrote:
> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <[email protected]> wrote:
>>
>> The question is what "fix it" means. I'm really concerned about AC escapes,
>> and everyone else should be, too.
>
> I do think that it might be the right thing to do to add some kind of
> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
>
> We'd just have to abstract it sanely. I'm sure arm64 has the exact
> same issue with PAN - maybe it saves properly, but the same "we
> wouldn't want to go through the scheduler with PAN clear".
>
As of right now, we have the same issue on arm64 as on x86. We don't
currently save the PAN bit on task switch, but I have a patch to do that.
Unless we decide to go down the route of warning against uses of
schedule() inside user_access regions.
As for the abstraction, I had this patch[1] that added another primitive
for the user_access API (although this might not be suited for x86 if
you also want to check DF). However, an issue that appears is where to
perform the check to cover enough ground.
Checking inside the schedule() code you only cover cases where things
have already gone wrong, and not the use of functions that are unsafe to
call inside a user_access region.
[1]
http://lists.infradead.org/pipermail/linux-arm-kernel/2019-January/625385.html
Cheers,
--
Julien Thierry
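For context, the primitive in [1] boils down to a per-arch predicate
plus a check hooked into the rescheduling paths. A rough sketch with
illustrative names (the config symbol and helper here are not the exact
API from that patch):

#ifdef CONFIG_DEBUG_UACCESS_SLEEP	/* illustrative config name */
/* arch reports whether a user_access region is currently open */
extern bool user_access_region_active(void);

static inline void check_user_access_region(void)
{
	WARN_ONCE(user_access_region_active(),
		  "rescheduling while a user access region is active\n");
}
#else
static inline void check_user_access_region(void) { }
#endif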
On Mon, Feb 18, 2019 at 02:30:21PM -0800, H. Peter Anvin wrote:
> On 2/16/19 2:30 AM, Peter Zijlstra wrote:
> > On Fri, Feb 15, 2019 at 08:06:56PM -0800, [email protected] wrote:
> >> This implies we invoke schedule -- a restricted operation (consider
> >> may_sleep) during execution of STAC-enabled code, but *not* as an
> >> exception or interrupt, since those preserve the flags.
> >
> > Meet preempt_enable().
>
> I believe this falls under "doctor, it hurts when I do that." And it hurts for
> very good reasons. See below.
I disagree; the basic rule is that if you're preemptible you must also
be able to schedule and vice-versa. These AC regions violate this.
And, like illustrated, it is _very_ easy to get all sorts inside that AC
crap.
> >> I have serious concerns about this. This is more or less saying that
> >> we have left an unlimited gap and have had AC escape.
> >
> > Yes; by allowing normal C in between STAC and CLAC this is so.
> >
> >> Does *anyone* see a need to allow this? I got a question at LPC from
> >> someone about this, and what they were trying to do once all the
> >> layers had been unwound was so far down the wrong track for a root
> >> problem that actually has a very simple solution.
> >
> > Have you read the rest of the thread?
> >
> > All it takes for this to explode is a call to a C function that has
> > tracing on it in between the user_access_begin() and user_access_end()
> > things. That is a _very_ easy thing to do.
> >
> > Heck, GCC is allowed to generate that broken code compiling
> > lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
> > with CONFIG_OPTIMIZE_INLINING), and making that a function call would
> > get us fentry hooks and ftrace and *BOOM*.
> >
> > (Now, of course, its a static function with a single caller, and GCC
> > isn't _THAT_ stupid, but it could, if it wanted to)
> >
> > Since the bar is _that_ low for mistakes, I figure we'd better fix it.
> >
>
> The question is what "fix it" means.
It means that when we do schedule, the next task doesn't have AC set,
and when we schedule back, we'll restore our AC when we had it. Unlike
the current situation, where the next task will run with AC set.
IOW, it confines AC to the task context where it was set.
> I'm really concerned about AC escapes,
> and everyone else should be, too.
Then _that_ should be asserted.
> For an issue specific to tracing, it would be more appropriate to do the
> appropriate things in the tracing entry/exit than in schedule. Otherwise, we
> don't want to silently paper over mistakes which could mean that we run a
> large portion of the kernel without protection we thought we had.
>
> In that sense, calling schedule() with AC set is in fact a great place to have
> a WARN() or even BUG(), because such an event really could imply that the
> kernel has been security compromised.
It is not specific to tracing; tracing is just one of the most trivial
and non-obvious ways to make it go splat.
There's lot of fairly innocent stuff that does preempt_disable() /
preempt_enable(). And just a warning in schedule() isn't sufficient,
you'd have to actually trigger a reschedule before you know your code is
bad.
And like I said; the invariant is: if you're preemptible you can
schedule and v.v.
Now, clearly you don't want to mark these whole regions as !preemptible,
because then you can also not take faults, but somehow you're not
worried about the whole fault handler, but you are worried about the
scheduler ?!? How does that work? The fault handler can touch a _ton_
more code. Heck, the fault handler can schedule.
So either pre-fault, and run the whole AC crap with preemption disabled
and retry, or accept that we have to schedule.
> Does that make more sense?
It appears to me you're going about it backwards.
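For reference, the "pre-fault and retry" option mentioned above already
has a familiar shape elsewhere in the kernel (futex does something
similar). A rough sketch for a single word, not taken from any patch in
this thread:

static int read_user_word(u32 *dest, u32 __user *uaddr)
{
	for (;;) {
		int ret;

		pagefault_disable();
		ret = __get_user(*dest, uaddr);	/* faults just fail here */
		pagefault_enable();

		if (!ret)
			return 0;

		/* fault the page in with a normal, schedulable access */
		if (get_user(*dest, uaddr))
			return -EFAULT;
	}
}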
On 19/02/2019 02:46, H. Peter Anvin wrote:
> On 2/18/19 6:20 PM, Andy Lutomirski wrote:
>>
>>
>>> On Feb 18, 2019, at 4:24 PM, Linus Torvalds <[email protected]> wrote:
>>>
>>>> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <[email protected]> wrote:
>>>>
>>>> The question is what "fix it" means. I'm really concerned about AC escapes,
>>>> and everyone else should be, too.
>>>
>>> I do think that it might be the right thing to do to add some kind of
>>> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
>>>
>>> We'd just have to abstract it sanely. I'm sure arm64 has the exact
>>> same issue with PAN - maybe it saves properly, but the same "we
>>> wouldn't want to go through the scheduler with PAN clear".
>>>
>>> On x86, we might as well check DF at the same time as AC.
>>>
>>
>> hpa is right, though — calling into tracing code with AC set is not really so good. And calling schedule() (via preempt_enable() or whatever) is also bad because it runs all the scheduler code with AC on. Admittedly, the scheduler is not *that* interesting of an attack surface.
>>
>
> Not just that, but the other question is just how much code we are running
> with AC open. It really should only be done in some very small regions.
Yes, but we don't really have a way to enforce that, as far as I'm aware.
The user_access_begin/end() is generic API, meaning any arch is free to
implement it. If they don't have the same hardware behaviour as
x86/arm64, it might be that their interrupt/exception entry code will
run with user_access open until they reach the entry code that closes it
(and entry code could potentially be a more interesting attack surface
than the scheduler). This could be the case of software emulated PAN on
arm/arm64 (although 32-bit arm currently doesn't have
user_access_begin/end()).
So the whole "very small region" restriction sounds a bit
loose/arbitrary to me...
Thanks,
--
Julien Thierry
On Mon, Feb 18, 2019 at 04:24:30PM -0800, Linus Torvalds wrote:
> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <[email protected]> wrote:
> >
> > The question is what "fix it" means. I'm really concerned about AC escapes,
> > and everyone else should be, too.
>
> I do think that it might be the right thing to do to add some kind of
> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
So I disagree.
Either we set AC with preempt disabled, and then we don't need an extra
warning, because we already have a warning about scheduling with
preemption disabled, or we accept that the fault handler can run.
On Tue, Feb 19, 2019 at 10:15:25AM +0100, Peter Zijlstra wrote:
> On Mon, Feb 18, 2019 at 04:24:30PM -0800, Linus Torvalds wrote:
> > On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <[email protected]> wrote:
> > >
> > > The question is what "fix it" means. I'm really concerned about AC escapes,
> > > and everyone else should be, too.
> >
> > I do think that it might be the right thing to do to add some kind of
> > WARN_ON_ONCE() for AC being set in various can-reschedule situations.
>
> So I disagree.
>
> Either we set AC with preempt disabled, and then we don't need an extra
> warning, because we already have a warning about scheduling with
> preemption disabled, or we accept that the fault handler can run.
n/m about the faults, forgot the obvious :/
I still really dislike wrecking the preemption model over this.
On February 19, 2019 1:04:09 AM PST, Peter Zijlstra <[email protected]> wrote:
>On Mon, Feb 18, 2019 at 02:30:21PM -0800, H. Peter Anvin wrote:
>> On 2/16/19 2:30 AM, Peter Zijlstra wrote:
>> > On Fri, Feb 15, 2019 at 08:06:56PM -0800, [email protected] wrote:
>> >> This implies we invoke schedule -- a restricted operation
>(consider
>> >> may_sleep) during execution of STAC-enabled code, but *not* as an
>> >> exception or interrupt, since those preserve the flags.
>> >
>> > Meet preempt_enable().
>>
>> I believe this falls under "doctor, it hurts when I do that." And it
>hurts for
>> very good reasons. See below.
>
>I disagree; the basic rule is that if you're preemptible you must also
>be able to schedule and vice-versa. These AC regions violate this.
>
>And, like illustrated, it is _very_ easy to get all sorts inside that
>AC
>crap.
>
>> >> I have serious concerns about this. This is more or less saying
>that
>> >> we have left an unlimited gap and have had AC escape.
>> >
>> > Yes; by allowing normal C in between STAC and CLAC this is so.
>> >
>> >> Does *anyone* see a need to allow this? I got a question at LPC
>from
>> >> someone about this, and what they were trying to do once all the
>> >> layers had been unwound was so far down the wrong track for a root
>> >> problem that actually has a very simple solution.
>> >
>> > Have you read the rest of the thread?
>> >
>> > All it takes for this to explode is a call to a C function that has
>> > tracing on it in between the user_access_begin() and
>user_access_end()
>> > things. That is a _very_ easy thing to do.
>> >
>> > Heck, GCC is allowed to generate that broken code compiling
>> > lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user
>(esp.
>> > with CONFIG_OPTIMIZE_INLINING), and making that a function call
>would
>> > get us fentry hooks and ftrace and *BOOM*.
>> >
>> > (Now, of course, its a static function with a single caller, and
>GCC
>> > isn't _THAT_ stupid, but it could, if it wanted to)
>> >
>> > Since the bar is _that_ low for mistakes, I figure we'd better fix
>it.
>> >
>>
>> The question is what "fix it" means.
>
>It means that when we do schedule, the next task doesn't have AC set,
>and when we schedule back, we'll restore our AC when we had it. Unlike
>the current situation, where the next task will run with AC set.
>
>IOW, it confines AC to the task context where it was set.
>
>> I'm really concerned about AC escapes,
>> and everyone else should be, too.
>
>Then _that_ should be asserted.
>
>> For an issue specific to tracing, it would be more appropriate to do
>the
>> appropriate things in the tracing entry/exit than in schedule.
>Otherwise, we
>> don't want to silently paper over mistakes which could mean that we
>run a
>> large portion of the kernel without protection we thought we had.
>>
>> In that sense, calling schedule() with AC set is in fact a great
>place to have
>> a WARN() or even BUG(), because such an event really could imply that
>the
>> kernel has been security compromised.
>
>It is not specific to tracing, tracing is just one of the most trivial
>and non-obvious ways to make it go splat.
>
>There's lot of fairly innocent stuff that does preempt_disable() /
>preempt_enable(). And just a warning in schedule() isn't sufficient,
>you'd have to actually trigger a reschedule before you know your code
>is
>bad.
>
>And like I said; the invariant is: if you're preemptible you can
>schedule and v.v.
>
>Now, clearly you don't want to mark these whole regions as
>!preemptible,
>because then you can also not take faults, but somehow you're not
>worried about the whole fault handler, but you are worried about the
>scheduler ?!? How does that work? The fault handler can touch a _ton_
>more code. Heck, the fault handler can schedule.
>
>So either pre-fault, and run the whole AC crap with preemption disabled
>and retry, or accept that we have to schedule.
>
>> Does that make more sense?
>
>It appears to me you're going about it backwards.
I'm not worried about the fault handler, because AC is always preserved/disabled on exception entry; otherwise I would of course be very concerned.
To be clear: I'm not worried about the scheduler itself. I'm worried about how much code we have gone through to get there. Perhaps the scheduler itself is not the right point to probe for it, but we do need to catch things that have gone wrong, rather than just leaving the door wide open.
I would personally be far more comfortable saying you have to disable preemption in user access regions. It may be an unnecessary constraint for x86 and ARM64, but it is safe.
And Julien, yes, it is "somewhat arbitrary", but so are many engineering tradeoffs. Not everything has a very sharply definable line.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> > Does that make more sense?
>
> It appears to me you're going about it backwards.
So how about you do a GCC plugin that verifies limits on code-gen
between user_access_begin/user_access_end() ?
- No CALL/RET
- implies user_access_end() happens
- implies no fentry hooks
- No __preempt_count frobbing
- No tracepoints
- ...
That way you put the burden on the special code, not on the rest of the
kernel.
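For comparison, the shape of region such a checker would admit is
roughly the following, with no CALL/RET, no preempt-count changes and
no tracepoints between the STAC and the CLAC (a sketch, not from any
posted patch):

static int put_two_words(u32 __user *uptr, u32 lo, u32 hi)
{
	if (!user_access_begin(uptr, 2 * sizeof(u32)))
		return -EFAULT;
	unsafe_put_user(lo, &uptr[0], efault);	/* plain inline stores, */
	unsafe_put_user(hi, &uptr[1], efault);	/* no calls in between  */
	user_access_end();
	return 0;
efault:
	user_access_end();
	return -EFAULT;
}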
On Tue, 19 Feb 2019, Peter Zijlstra wrote:
> On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> > > Does that make more sense?
> >
> > It appears to me you're going about it backwards.
>
> So how about you do a GCC plugin that verifies limits on code-gen
> between user_access_begin/user_access_end() ?
>
> - No CALL/RET
> - implies user_access_end() happens
> - implies no fentry hooks
> - No __preempt_count frobbing
> - No tracepoints
> - ...
>
> That way you put the burden on the special code, not on the rest of the
> kernel.
And then you have kprobes ....
On Tue, Feb 19, 2019 at 12:38:42PM +0100, Thomas Gleixner wrote:
> On Tue, 19 Feb 2019, Peter Zijlstra wrote:
>
> > On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> > > > Does that make more sense?
> > >
> > > It appears to me you're going about it backwards.
> >
> > So how about you do a GCC plugin that verifies limits on code-gen
> > between user_access_begin/user_access_end() ?
> >
> > - No CALL/RET
> > - implies user_access_end() happens
> > - implies no fentry hooks
> > - No __preempt_count frobbing
> > - No tracepoints
> > - ...
> >
> > That way you put the burden on the special code, not on the rest of the
> > kernel.
>
> And then you have kprobes ....
They prod the INT3 byte and then take an exception, and exceptions are
'fine'.
On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> On Mon, Feb 18, 2019 at 02:30:21PM -0800, H. Peter Anvin wrote:
> > On 2/16/19 2:30 AM, Peter Zijlstra wrote:
> > > On Fri, Feb 15, 2019 at 08:06:56PM -0800, [email protected] wrote:
> > >> This implies we invoke schedule -- a restricted operation (consider
> > >> may_sleep) during execution of STAC-enabled code, but *not* as an
> > >> exception or interrupt, since those preserve the flags.
> > >
> > > Meet preempt_enable().
> >
> > I believe this falls under "doctor, it hurts when I do that." And it hurts for
> > very good reasons. See below.
>
> I disagree; the basic rule is that if you're preemptible you must also
> be able to schedule and vice-versa. These AC regions violate this.
>
> And, like illustrated, it is _very_ easy to get all sorts inside that AC
> crap.
>
> > >> I have serious concerns about this. This is more or less saying that
> > >> we have left an unlimited gap and have had AC escape.
> > >
> > > Yes; by allowing normal C in between STAC and CLAC this is so.
> > >
> > >> Does *anyone* see a need to allow this? I got a question at LPC from
> > >> someone about this, and what they were trying to do once all the
> > >> layers had been unwound was so far down the wrong track for a root
> > >> problem that actually has a very simple solution.
> > >
> > > Have you read the rest of the thread?
> > >
> > > All it takes for this to explode is a call to a C function that has
> > > tracing on it in between the user_access_begin() and user_access_end()
> > > things. That is a _very_ easy thing to do.
> > >
> > > Heck, GCC is allowed to generate that broken code compiling
> > > lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
> > > with CONFIG_OPTIMIZE_INLINING), and making that a function call would
> > > get us fentry hooks and ftrace and *BOOM*.
> > >
> > > (Now, of course, its a static function with a single caller, and GCC
> > > isn't _THAT_ stupid, but it could, if it wanted to)
> > >
> > > Since the bar is _that_ low for mistakes, I figure we'd better fix it.
> > >
> >
> > The question is what "fix it" means.
>
> It means that when we do schedule, the next task doesn't have AC set,
> and when we schedule back, we'll restore our AC when we had it. Unlike
> the current situation, where the next task will run with AC set.
>
> IOW, it confines AC to the task context where it was set.
>
> > I'm really concerned about AC escapes,
> > and everyone else should be, too.
>
> Then _that_ should be asserted.
>
> > For an issue specific to tracing, it would be more appropriate to do the
> > appropriate things in the tracing entry/exit than in schedule. Otherwise, we
> > don't want to silently paper over mistakes which could mean that we run a
> > large portion of the kernel without protection we thought we had.
> >
> > In that sense, calling schedule() with AC set is in fact a great place to have
> > a WARN() or even BUG(), because such an event really could imply that the
> > kernel has been security compromised.
>
> It is not specific to tracing, tracing is just one of the most trivial
> and non-obvious ways to make it go splat.
>
> There's lot of fairly innocent stuff that does preempt_disable() /
> preempt_enable(). And just a warning in schedule() isn't sufficient,
> you'd have to actually trigger a reschedule before you know your code is
> bad.
>
> And like I said; the invariant is: if you're preemptible you can
> schedule and v.v.
>
> Now, clearly you don't want to mark these whole regions as !preemptible,
> because then you can also not take faults, but somehow you're not
> worried about the whole fault handler, but you are worried about the
> scheduler ?!? How does that work? The fault handler can touch a _ton_
> more code. Heck, the fault handler can schedule.
>
> So either pre-fault, and run the whole AC crap with preemption disabled
> and retry, or accept that we have to schedule.
I think you'll still hate this, but could we not disable preemption during
the uaccess-enabled region, re-enabling it on the fault path after we've
toggled uaccess off and disable it again when we return back to the
uaccess-enabled region? Doesn't help with tracing, but it should at least
handle the common case.
Will
On 2/19/19 4:48 AM, Will Deacon wrote:
>
> I think you'll still hate this, but could we not disable preemption during
> the uaccess-enabled region, re-enabling it on the fault path after we've
> toggled uaccess off and disable it again when we return back to the
> uaccess-enabled region? Doesn't help with tracing, but it should at least
> handle the common case.
>
There is a worse problem with this, I now realize: this would mean blocking
preemption across what could possibly be a *very* large copy_from_user(), for
example.
Exceptions *have* to handle this; there is no way around it. Perhaps the
scheduler isn't the right place to put these kinds of asserts, either.
Now, __fentry__ is kind of a special beast; in some ways it is an "exception
implemented as a function call"; on x86 one could even consider using an INT
instruction in order to reduce the NOP footprint in the unarmed case. Nor is
__fentry__ a C function; it has far more of an exception-like ABI.
*Regardless* of what else we do, I believe __fentry__ ought to
save/disable/restore AC, just like an exception does.
The idea of using s/a gcc plugin/objtool/ for this isn't really a bad idea.
Obviously the general problem is undecidable :) but the enforcement of some
simple, fairly draconian rules ("as tight as possible, but no tighter")
shouldn't be a huge problem.
An actual gcc plugin -- which would probably be quite complex -- could make
gcc itself aware of user space accesses and be able to rearrange them to
minimize STAC/CLAC and avoid kernel-space accesses inside those brackets.
Finally, of course, there is the option of simply outlawing this practice as a
matter of policy and requiring that all structures be accessed through a limited
set of APIs. As I recall, the number of places where there were
performance-critical regions which could not use the normal accessors are
fairly small (signal frames being the main one.) Doing bulk copy to/from
kernel memory and then accessing them from there would have some performance
cost, but would eliminate the need for this complexity entirely.
-hpa
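A minimal sketch of that last "bulk copy" policy, with stand-in names
(struct example_frame and fill_example_frame() are illustrative): build
the result in kernel memory with ordinary C code, then do a single
copy_to_user(), so nothing non-trivial ever runs between STAC and CLAC:

static int push_frame(struct example_frame __user *uframe)
{
	struct example_frame kframe;

	fill_example_frame(&kframe);	/* plain kernel code, may sleep */

	return copy_to_user(uframe, &kframe, sizeof(kframe)) ? -EFAULT : 0;
}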
On 20/02/2019 22:55, H. Peter Anvin wrote:
> On 2/19/19 4:48 AM, Will Deacon wrote:
>>
>> I think you'll still hate this, but could we not disable preemption during
>> the uaccess-enabled region, re-enabling it on the fault path after we've
>> toggled uaccess off and disable it again when we return back to the
>> uaccess-enabled region? Doesn't help with tracing, but it should at least
>> handle the common case.
>>
>
> There is a worse problem with this, I still realize: this would mean blocking
> preemption across what could possibly be a *very* large copy_from_user(), for
> example.
>
Yes, that's a good point. And I guess a userspace program could forcibly
trigger the kernel into a large copy_from/to_user(), knowingly disabling
preemption.
I don't know how bad this could get.
> Exceptions *have* to handle this; there is no way around it. Perhaps the
> scheduler isn't the right place to put these kinds of asserts, either.
>
Maybe I'm misunderstanding what you mean by "Exceptions *have* to
handle this". Isn't the fact that the AC/PAN flags gets saved on
exception entry and set back to "user accesses disabled" already
handling it? Or are you referring to something else?
So far my understanding is that the exception/interrupt case is fine.
The worrying case is what gets called in the user access regions
(schedule(), preempt_enable(), tracing...).
Did I get lost somewhere?
> Now, __fentry__ is kind of a special beast; in some ways it is an "exception
> implemented as a function call"; on x86 one could even consider using an INT
> instruction in order to reduce the NOP footprint in the unarmed case. Nor is
> __fentry__ a C function; it has far more of an exception-like ABI.
> *Regardless* of what else we do, I believe __fentry__ ought to
> save/disable/restore AC, just like an exception does.
>
That does make sense to me. However it doesn't solve the issue of
calling (or preventing calls to) some function that reschedules.
> The idea of using s/a gcc plugin/objtool/ for this isn't really a bad idea.
> Obviously the general problem is undecidable :) but the enforcement of some
> simple, fairly draconian rules ("as tight as possible, but no tighter")
> shouldn't be a huge problem.
>
> An actual gcc plugin -- which would probably be quite complex -- could make
> gcc itself aware of user space accesses and be able to rearrange them to
> minimize STAC/CLAC and avoid kernel-space accesses inside those brackets.
>
Not sure I get this. The data that is retrieved from/stored to user space
is generally obtained from/saved into a kernel-space address. And when you
start the user_access_begin() it means you plan on doing a bunch of user
access, so I wouldn't expect to be able to batch everything into registers.
Cheers,
--
Julien Thierry
On Wed, Feb 20, 2019 at 02:55:59PM -0800, H. Peter Anvin wrote:
> On 2/19/19 4:48 AM, Will Deacon wrote:
> >
> > I think you'll still hate this, but could we not disable preemption during
> > the uaccess-enabled region, re-enabling it on the fault path after we've
> > toggled uaccess off and disable it again when we return back to the
> > uaccess-enabled region? Doesn't help with tracing, but it should at least
> > handle the common case.
> >
>
> There is a worse problem with this, I still realize: this would mean blocking
> preemption across what could possibly be a *very* large copy_from_user(), for
> example.
I don't think it's legitimate to call copy_{to,from}_user() inside a
user_access_{begin,end} region. You'd need to add some unsafe variants,
which could periodically disable uaccess and call cond_resched() inside
the loop to avoid the problem you're alluding to.
For existing callers of copy_{to,from}_user(), there's no issue as they
don't call into the scheduler during the copy operation. Exceptions are
handled fine by the code in mainline today.
GCC plugins are a cool idea, but I'm just a bit nervous about relying on
them for things like this.
Will
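One possible shape for that, sketched here with plain copy_to_user()
for simplicity rather than a new unsafe primitive (the chunk size and
function name are arbitrary): keep each uaccess window to a bounded
chunk and only reschedule while it is closed:

static unsigned long chunked_copy_to_user(void __user *to,
					  const void *from,
					  unsigned long n)
{
	while (n) {
		unsigned long chunk = min(n, 4096UL);
		unsigned long left = copy_to_user(to, from, chunk);

		if (left)
			return n - chunk + left;	/* bytes not copied */

		to += chunk;
		from += chunk;
		n -= chunk;
		cond_resched();		/* AC/PAN is closed here */
	}
	return 0;
}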
On Thu, 21 Feb 2019, Julien Thierry wrote:
> On 20/02/2019 22:55, H. Peter Anvin wrote:
> > Now, __fentry__ is kind of a special beast; in some ways it is an "exception
> > implemented as a function call"; on x86 one could even consider using an INT
> > instruction in order to reduce the NOP footprint in the unarmed case. Nor is
> > __fentry__ a C function; it has far more of an exception-like ABI.
> > *Regardless* of what else we do, I believe __fentry__ ought to
> > save/disable/restore AC, just like an exception does.
> >
>
> That does make sense to me. However it doesn't solve the issue of
> calling (or preventing to call) some function that rescheds.
IMNSHO any call inside an AC region is a bug lurking round the corner. The
only thing which is tolerable is an exception of some sort.
Enforce that with objtool. End of story.
Thanks,
tglx
> On Feb 21, 2019, at 4:46 AM, Will Deacon <[email protected]> wrote:
>
>> On Wed, Feb 20, 2019 at 02:55:59PM -0800, H. Peter Anvin wrote:
>>> On 2/19/19 4:48 AM, Will Deacon wrote:
>>>
>>> I think you'll still hate this, but could we not disable preemption during
>>> the uaccess-enabled region, re-enabling it on the fault path after we've
>>> toggled uaccess off and disable it again when we return back to the
>>> uaccess-enabled region? Doesn't help with tracing, but it should at least
>>> handle the common case.
>>>
>>
>> There is a worse problem with this, I still realize: this would mean blocking
>> preemption across what could possibly be a *very* large copy_from_user(), for
>> example.
>
> I don't think it's legitimate to call copy_{to,from}_user() inside a
> user_access_{begin,end} region. You'd need to add some unsafe variants,
> which could periodically disable uaccess and call cond_resched() inside
> the loop to avoid the problem you're eluding to.
>
Definitely not safe. On x86 they do CLAC and everything breaks.
On Thu, Feb 21, 2019 at 1:35 PM Thomas Gleixner <[email protected]> wrote:
>
> IMNSHO any call inside a AC region is a bug lurking round the corner. The
> only thing which is tolerable is an exception of some sort.
>
> Enforce that with objtool. End of story.
Not quite that simple.
There are some use cases where you really do want a call - albeit to
special functions - inside the AC region.
The prime example of this is likely "filldir()" in fs/readdir.c, which
is actually somewhat important for some loads (eg samba).
And the whole AC thing made it horribly horribly slow because readdir
is one of those things that doesn't just copy one value (or one
structure) to user space, but writes several different things.
Kind of like signal frame setup does.
You may not realize that right now signal frame setup *does* actually
do a call with AC set. See ia32_setup_rt_frame() that does
put_user_try {
...
compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
...
} put_user_catch(err);
compat_save_altstack_ex() is a macro, but inside that macro is a call to sas_ss_reset().
Now, that is an inline function, so it hopefully doesn't actually
cause a call. But it's "only" an inline function, not "always_inline".
We did already have (and I rejected, for now) patches that made those
inlines not very strong...
[ Side note: currently signal frame setup uses the absolutely
*disgusting* put_user_ex() thing, but that's actually what
unsafe_put_user() was designed for. I just never got around to getting
rid of the nasty hot mess of put_user_ex() ]
Anyway, while signal frame setup just writes individual values,
"filldir()" does *both* several individual values _and_ does a
copy_to_user().
And it's that "copy_to_user()" that I want to eventually change to a
"unsafe_copy_to_user()", along with making the individual values be
written with unsafe_put_user().
Right now "filldir()" messes with AC a total of *six* times. It sets
four field values, and then does a "copy_to_user()", all of which
set/clear AC right now. It's wasting hundreds of cycles on this,
because AC is so slow to set/clear.
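Roughly the shape that conversion could take -- one uaccess window around all
of the stores. unsafe_copy_to_user() is hypothetical at this point and the
helper below is only a sketch:

	static int fill_one(struct linux_dirent64 __user *dirent, u64 ino, s64 off,
			    unsigned short reclen, unsigned char d_type,
			    const char *name, int namlen)
	{
		if (!user_access_begin(dirent, reclen))
			return -EFAULT;
		unsafe_put_user(ino, &dirent->d_ino, efault);
		unsafe_put_user(off, &dirent->d_off, efault);
		unsafe_put_user(reclen, &dirent->d_reclen, efault);
		unsafe_put_user(d_type, &dirent->d_type, efault);
		/* hypothetical; the real filldir() NUL-terminates separately */
		unsafe_copy_to_user(dirent->d_name, name, namlen + 1, efault);
		user_access_end();
		return 0;

	efault:
		user_access_end();
		return -EFAULT;
	}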
If AC was cheap, this wouldn't be much of an issue. But AC is really
really expensive. I think it's microcode and serializes the pipeline
or something.
Anyway. We already have a possible call site, and there are good
reasons for future ones too. But they are hopefully all very
controlled.
Linus
On Thu, Feb 21, 2019 at 02:08:11PM -0800, Linus Torvalds wrote:
> On Thu, Feb 21, 2019 at 1:35 PM Thomas Gleixner <[email protected]> wrote:
> >
> > IMNSHO any call inside an AC region is a bug lurking round the corner. The
> > only thing which is tolerable is an exception of some sort.
> >
> > Enforce that with objtool. End of story.
>
> Not quite that simple.
>
> There are some use cases where you really do want a call - albeit to
> special functions - inside the AC region.
>
> The prime example of this is likely "filldir()" in fs/readdir.c, which
> is actually somewhat important for some loads (eg samba).
>
> And the whole AC thing made it horribly horribly slow because readdir
> is one of those things that doesn't just copy one value (or one
> structure) to user space, but writes several different things.
>
> Kind of like signal frame setup does.
>
> You may not realize that right now signal frame setup *does* actually
> do a call with AC set. See ia32_setup_rt_frame() that does
>
> put_user_try {
> ...
> compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
> ...
> } put_user_catch(err);
>
> compat_save_altstack_ex() is a macro, but inside that macro is a call to sas_ss_reset().
>
> Now, that is an inline function, so it hopefully doesn't actually
> cause a call. But it's "only" an inline function, not "always_inline".
> We did already have (and I rejected, for now) patches that made those
> inlines not very strong...
That one does indeed not generate a call, but:
arch/x86/ia32/ia32_signal.o: warning: objtool: ia32_restore_sigcontext()+0x38: call to native_load_gs_index() with AC set
I've yet to look at detecting __preempt_count mucking.
---
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 43c029cdc3fe..cd31e4433f4c 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -5,6 +5,7 @@
#ifdef __KERNEL__
+#include <linux/frame.h>
#include <asm/nops.h>
/*
@@ -135,6 +136,7 @@ static inline void native_wbinvd(void)
}
extern asmlinkage void native_load_gs_index(unsigned);
+AC_SAFE(native_load_gs_index);
static inline unsigned long __read_cr4(void)
{
diff --git a/include/linux/frame.h b/include/linux/frame.h
index 02d3ca2d9598..644662d8b8e8 100644
--- a/include/linux/frame.h
+++ b/include/linux/frame.h
@@ -15,9 +15,14 @@
static void __used __section(.discard.func_stack_frame_non_standard) \
*__func_stack_frame_non_standard_##func = func
+#define AC_SAFE(func) \
+ static void __used __section(.discard.ac_safe) \
+ *__func_ac_safe_##func = func
+
#else /* !CONFIG_STACK_VALIDATION */
#define STACK_FRAME_NON_STANDARD(func)
+#define AC_SAFE(func)
#endif /* CONFIG_STACK_VALIDATION */
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index c9d038f91af6..5a64e5fb9d7a 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -31,7 +31,7 @@ INCLUDES := -I$(srctree)/tools/include \
-I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \
-I$(srctree)/tools/objtool/arch/$(ARCH)/include
WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed
-CFLAGS += -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES)
+CFLAGS += -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) -ggdb3
LDFLAGS += -lelf $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS)
# Allow old libelf to be used:
diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index b0d7dc3d71b5..48327099466d 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -33,7 +33,9 @@
#define INSN_STACK 8
#define INSN_BUG 9
#define INSN_NOP 10
-#define INSN_OTHER 11
+#define INSN_STAC 11
+#define INSN_CLAC 12
+#define INSN_OTHER 13
#define INSN_LAST INSN_OTHER
enum op_dest_type {
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 540a209b78ab..d1e99d1460a5 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -369,7 +369,19 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
case 0x0f:
- if (op2 >= 0x80 && op2 <= 0x8f) {
+ if (op2 == 0x01) {
+
+ if (modrm == 0xca) {
+
+ *type = INSN_CLAC;
+
+ } else if (modrm == 0xcb) {
+
+ *type = INSN_STAC;
+
+ }
+
+ } else if (op2 >= 0x80 && op2 <= 0x8f) {
*type = INSN_JUMP_CONDITIONAL;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0414a0d52262..6fef157e244b 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -127,6 +127,24 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
return false;
}
+static bool ac_safe_func(struct objtool_file *file, struct symbol *func)
+{
+ struct rela *rela;
+
+ /* check for AC_SAFE */
+ if (file->ac_safe && file->ac_safe->rela)
+ list_for_each_entry(rela, &file->ac_safe->rela->rela_list, list) {
+ if (rela->sym->type == STT_SECTION &&
+ rela->sym->sec == func->sec &&
+ rela->addend == func->offset)
+ return true;
+ if (/* rela->sym->type == STT_FUNC && */ rela->sym == func)
+ return true;
+ }
+
+ return false;
+}
+
/*
* This checks to see if the given function is a "noreturn" function.
*
@@ -439,6 +457,8 @@ static void add_ignores(struct objtool_file *file)
for_each_sec(file, sec) {
list_for_each_entry(func, &sec->symbol_list, list) {
+ func->ac_safe = ac_safe_func(file, func);
+
if (func->type != STT_FUNC)
continue;
@@ -1897,11 +1917,16 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
if (ret)
return 1;
}
- }
+ } else printf("ponies\n");
switch (insn->type) {
case INSN_RETURN:
+ if (state.ac) {
+ WARN_FUNC("return with AC set", sec, insn->offset);
+ return 1;
+ }
+
if (func && has_modified_stack_frame(&state)) {
WARN_FUNC("return with modified stack frame",
sec, insn->offset);
@@ -1917,6 +1942,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
return 0;
case INSN_CALL:
+ if (state.ac && !insn->call_dest->ac_safe) {
+ WARN_FUNC("call to %s() with AC set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
+
if (is_fentry_call(insn))
break;
@@ -1928,6 +1959,11 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
/* fallthrough */
case INSN_CALL_DYNAMIC:
+ if (state.ac && !insn->call_dest->ac_safe) {
+ WARN_FUNC("call to %s() with AC set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
if (!no_fp && func && !has_valid_stack_frame(&state)) {
WARN_FUNC("call without frame pointer save/setup",
sec, insn->offset);
@@ -1980,6 +2016,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
break;
+ case INSN_STAC:
+ state.ac = true;
+ break;
+
+ case INSN_CLAC:
+ state.ac = false;
+ break;
+
default:
break;
}
@@ -2198,6 +2242,7 @@ int check(const char *_objname, bool orc)
INIT_LIST_HEAD(&file.insn_list);
hash_init(file.insn_hash);
file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
+ file.ac_safe = find_section_by_name(file.elf, ".discard.ac_safe");
file.c_file = find_section_by_name(file.elf, ".comment");
file.ignore_unreachables = no_unreachable;
file.hints = false;
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index e6e8a655b556..d4ae59454fff 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -31,7 +31,7 @@ struct insn_state {
int stack_size;
unsigned char type;
bool bp_scratch;
- bool drap, end;
+ bool drap, end, ac;
int drap_reg, drap_offset;
struct cfi_reg vals[CFI_NUM_REGS];
};
@@ -61,6 +61,7 @@ struct objtool_file {
struct list_head insn_list;
DECLARE_HASHTABLE(insn_hash, 16);
struct section *whitelist;
+ struct section *ac_safe;
bool ignore_unreachables, c_file, hints, rodata;
};
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index bc97ed86b9cd..064c3df31e40 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -62,6 +62,7 @@ struct symbol {
unsigned long offset;
unsigned int len;
struct symbol *pfunc, *cfunc;
+ bool ac_safe;
};
struct rela {
diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index 50af4e1274b3..72084cae8f35 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -166,8 +166,8 @@ int special_get_alts(struct elf *elf, struct list_head *alts)
continue;
if (sec->len % entry->size != 0) {
- WARN("%s size not a multiple of %d",
- sec->name, entry->size);
+ WARN("%s size (%d) not a multiple of %d",
+ sec->name, sec->len, entry->size);
return -1;
}
On Thu, 21 Feb 2019, Linus Torvalds wrote:
> On Thu, Feb 21, 2019 at 1:35 PM Thomas Gleixner <[email protected]> wrote:
> >
> > IMNSHO any call inside an AC region is a bug lurking round the corner. The
> > only thing which is tolerable is an exception of some sort.
> >
> > Enforce that with objtool. End of story.
>
> Not quite that simple.
But correct :)
> Right now "filldir()" messes with AC a total of *six* times. It sets
> four field values, and then does a "copy_to_user()", all of which
> set/clear AC right now. It's wasting hundreds of cycles on this,
> because AC is so slow to set/clear.
>
> If AC was cheap, this wouldn't be much of an issue. But AC is really
> really expensive. I think it's microcode and serializes the pipeline
> or something.
>
> Anyway. We already have a possible call site, and there are good
> reasons for future ones too. But they are hopefully all very
> controlled.
I agree, that a function which is doing the actual copy should be callable,
but random other functions? NO!
The problem is that once you open that can of worms the people who think
their problem is special will come around and do
begin()
copy_to_user_unsafe(uptr, collect_data())
end()
just because they can. That's the stuff, I'm worried about, not the well
defined copy_to/from_user() invocation. We can deal with that and make sure
that it's safe even with tracing and annotate with some special magic. It's
simpler to find and check a new '__safe_inside_ac' annotation than chasing
randomly added code within a gazillion of begin/end sections.
Thanks,
tglx
On Fri, Feb 22, 2019 at 07:10:34PM +0100, Thomas Gleixner wrote:
> But correct :)
> I agree, that a function which is doing the actual copy should be callable,
> but random other functions? NO!
So find the below patch -- which spotted fail in ptrace.c
It has an AC_SAFE(func) annotation which allows marking specific
functions as safe to call. The patch includes 2 instances which were
required to make arch/x86 'build':
arch/x86/ia32/ia32_signal.o: warning: objtool: ia32_restore_sigcontext()+0x3d: call to native_load_gs_index() with AC set
arch/x86/kernel/ptrace.o: warning: objtool: genregs_get()+0x8e: call to getreg() with AC set
It also screams (provided one has CONFIG_FUNCTION_TRACER=y) about the
lack of notrace annotations on functions marked AC_SAFE():
arch/x86/kernel/ptrace.o: warning: objtool: getreg()+0x0: call to __fentry__() with AC set
It builds arch/x86 relatively clean; it only complains about some
redundant CLACs in entry_64.S because it doesn't understand interrupts
and I've not bothered creating an annotation for them yet.
arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x4d: redundant CLAC
arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x5a: redundant CLAC
...
arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
Also, I realized we don't need special annotations for preempt_count;
preempt_disable() emits a CALL instruction which should readily trigger
the warnings added here.
The VDSO thing is a bit of a hack, but I couldn't quickly find anything
better.
Comments?
---
arch/x86/include/asm/special_insns.h | 2 ++
arch/x86/kernel/ptrace.c | 3 +-
include/linux/frame.h | 23 ++++++++++++++
tools/objtool/arch.h | 4 ++-
tools/objtool/arch/x86/decode.c | 14 ++++++++-
tools/objtool/check.c | 59 ++++++++++++++++++++++++++++++++++++
tools/objtool/check.h | 3 +-
tools/objtool/elf.h | 1 +
8 files changed, 105 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 43c029cdc3fe..cd31e4433f4c 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -5,6 +5,7 @@
#ifdef __KERNEL__
+#include <linux/frame.h>
#include <asm/nops.h>
/*
@@ -135,6 +136,7 @@ static inline void native_wbinvd(void)
}
extern asmlinkage void native_load_gs_index(unsigned);
+AC_SAFE(native_load_gs_index);
static inline unsigned long __read_cr4(void)
{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 4b8ee05dd6ad..e278b4055a8b 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -420,7 +420,7 @@ static int putreg(struct task_struct *child,
return 0;
}
-static unsigned long getreg(struct task_struct *task, unsigned long offset)
+static notrace unsigned long getreg(struct task_struct *task, unsigned long offset)
{
switch (offset) {
case offsetof(struct user_regs_struct, cs):
@@ -444,6 +444,7 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
return *pt_regs_access(task_pt_regs(task), offset);
}
+AC_SAFE(getreg);
static int genregs_get(struct task_struct *target,
const struct user_regset *regset,
diff --git a/include/linux/frame.h b/include/linux/frame.h
index 02d3ca2d9598..5d354cf42a56 100644
--- a/include/linux/frame.h
+++ b/include/linux/frame.h
@@ -21,4 +21,27 @@
#endif /* CONFIG_STACK_VALIDATION */
+#if defined(CONFIG_STACK_VALIDATION) && !defined(BUILD_VDSO) && !defined(BUILD_VDSO32)
+/*
+ * This macro marks functions as AC-safe, that is, it is safe to call from an
+ * EFLAGS.AC enabled region (typically user_access_begin() /
+ * user_access_end()).
+ *
+ * These functions in turn will only call AC-safe functions themselves (which
+ * precludes tracing, including __fentry__ and scheduling, including
+ * preempt_enable).
+ *
+ * AC-safe functions will obviously also not change EFLAGS.AC themselves.
+ *
+ * Since STAC/CLAC are CPL-0 only, this is all irrelevant for VDSO builds
+ * (and the generated symbol reference will in fact cause link failures).
+ */
+#define AC_SAFE(func) \
+ static void __used __section(.discard.ac_safe) \
+ *__func_ac_safe_##func = func
+
+#else
+#define AC_SAFE(func)
+#endif
+
#endif /* _LINUX_FRAME_H */
diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index b0d7dc3d71b5..48327099466d 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -33,7 +33,9 @@
#define INSN_STACK 8
#define INSN_BUG 9
#define INSN_NOP 10
-#define INSN_OTHER 11
+#define INSN_STAC 11
+#define INSN_CLAC 12
+#define INSN_OTHER 13
#define INSN_LAST INSN_OTHER
enum op_dest_type {
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 540a209b78ab..d1e99d1460a5 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -369,7 +369,19 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
case 0x0f:
- if (op2 >= 0x80 && op2 <= 0x8f) {
+ if (op2 == 0x01) {
+
+ if (modrm == 0xca) {
+
+ *type = INSN_CLAC;
+
+ } else if (modrm == 0xcb) {
+
+ *type = INSN_STAC;
+
+ }
+
+ } else if (op2 >= 0x80 && op2 <= 0x8f) {
*type = INSN_JUMP_CONDITIONAL;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0414a0d52262..01852602ca31 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -127,6 +127,24 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
return false;
}
+static bool ac_safe_func(struct objtool_file *file, struct symbol *func)
+{
+ struct rela *rela;
+
+ /* check for AC_SAFE */
+ if (file->ac_safe && file->ac_safe->rela)
+ list_for_each_entry(rela, &file->ac_safe->rela->rela_list, list) {
+ if (rela->sym->type == STT_SECTION &&
+ rela->sym->sec == func->sec &&
+ rela->addend == func->offset)
+ return true;
+ if (/* rela->sym->type == STT_FUNC && */ rela->sym == func)
+ return true;
+ }
+
+ return false;
+}
+
/*
* This checks to see if the given function is a "noreturn" function.
*
@@ -439,6 +457,8 @@ static void add_ignores(struct objtool_file *file)
for_each_sec(file, sec) {
list_for_each_entry(func, &sec->symbol_list, list) {
+ func->ac_safe = ac_safe_func(file, func);
+
if (func->type != STT_FUNC)
continue;
@@ -1902,6 +1922,11 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
switch (insn->type) {
case INSN_RETURN:
+ if (state.ac) {
+ WARN_FUNC("return with AC set", sec, insn->offset);
+ return 1;
+ }
+
if (func && has_modified_stack_frame(&state)) {
WARN_FUNC("return with modified stack frame",
sec, insn->offset);
@@ -1917,6 +1942,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
return 0;
case INSN_CALL:
+ if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
+ WARN_FUNC("call to %s() with AC set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
+
if (is_fentry_call(insn))
break;
@@ -1928,6 +1959,11 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
/* fallthrough */
case INSN_CALL_DYNAMIC:
+ if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
+ WARN_FUNC("call to %s() with AC set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
if (!no_fp && func && !has_valid_stack_frame(&state)) {
WARN_FUNC("call without frame pointer save/setup",
sec, insn->offset);
@@ -1980,6 +2016,26 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
break;
+ case INSN_STAC:
+ if (state.ac_safe || state.ac) {
+ WARN_FUNC("recursive STAC", sec, insn->offset);
+ return 1;
+ }
+ state.ac = true;
+ break;
+
+ case INSN_CLAC:
+ if (!state.ac) {
+ WARN_FUNC("redundant CLAC", sec, insn->offset);
+ return 1;
+ }
+ if (state.ac_safe) {
+ WARN_FUNC("AC-safe clears AC", sec, insn->offset);
+ return 1;
+ }
+ state.ac = false;
+ break;
+
default:
break;
}
@@ -2141,6 +2197,8 @@ static int validate_functions(struct objtool_file *file)
if (!insn || insn->ignore)
continue;
+ state.ac_safe = func->ac_safe;
+
ret = validate_branch(file, insn, state);
warnings += ret;
}
@@ -2198,6 +2256,7 @@ int check(const char *_objname, bool orc)
INIT_LIST_HEAD(&file.insn_list);
hash_init(file.insn_hash);
file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
+ file.ac_safe = find_section_by_name(file.elf, ".discard.ac_safe");
file.c_file = find_section_by_name(file.elf, ".comment");
file.ignore_unreachables = no_unreachable;
file.hints = false;
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index e6e8a655b556..c31ec3ca78f3 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -31,7 +31,7 @@ struct insn_state {
int stack_size;
unsigned char type;
bool bp_scratch;
- bool drap, end;
+ bool drap, end, ac, ac_safe;
int drap_reg, drap_offset;
struct cfi_reg vals[CFI_NUM_REGS];
};
@@ -61,6 +61,7 @@ struct objtool_file {
struct list_head insn_list;
DECLARE_HASHTABLE(insn_hash, 16);
struct section *whitelist;
+ struct section *ac_safe;
bool ignore_unreachables, c_file, hints, rodata;
};
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index bc97ed86b9cd..064c3df31e40 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -62,6 +62,7 @@ struct symbol {
unsigned long offset;
unsigned int len;
struct symbol *pfunc, *cfunc;
+ bool ac_safe;
};
struct rela {
On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <[email protected]> wrote:
>
> So find the below patch -- which spotted fail in ptrace.c
>
> It has an AC_SAFE(func) annotation which allows marking specific
> functions as safe to call. The patch includes 2 instances which were
> required to make arch/x86 'build':
Looks sane enough to me.
Can you make it do DF too while at it? I really think AC and DF are
the same in this context. If you call an arbitrary function with DF
set, things will very quickly go sideways (even if it might work in
practice as long as the function just doesn't do a memcpy or similar)
Linus
On February 22, 2019 2:26:35 PM PST, Peter Zijlstra <[email protected]> wrote:
>So find the below patch -- which spotted fail in ptrace.c
>
>It has an AC_SAFE(func) annotation which allows marking specific
>functions as safe to call. The patch includes 2 instances which were
>required to make arch/x86 'build':
I like it.
Objtool could also detect CLAC-STAC or STAC-CLAC sequences without memory operations and remove them; don't know how often that happens, but I know it *does* happen.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <[email protected]> wrote:
>
> On Fri, Feb 22, 2019 at 07:10:34PM +0100, Thomas Gleixner wrote:
>
> > But correct :)
>
> > I agree, that a function which is doing the actual copy should be callable,
> > but random other functions? NO!
>
> So find the below patch -- which spotted fail in ptrace.c
>
> It has an AC_SAFE(func) annotation which allows marking specific
> functions as safe to call. The patch includes 2 instances which were
> required to make arch/x86 'build':
>
> arch/x86/ia32/ia32_signal.o: warning: objtool: ia32_restore_sigcontext()+0x3d: call to native_load_gs_index() with AC set
> arch/x86/kernel/ptrace.o: warning: objtool: genregs_get()+0x8e: call to getreg() with AC set
>
>It also screams (provided one has CONFIG_FUNCTION_TRACER=y) about the
> lack of notrace annotations on functions marked AC_SAFE():
>
> arch/x86/kernel/ptrace.o: warning: objtool: getreg()+0x0: call to __fentry__() with AC set
>
> It builds arch/x86 relatively clean; it only complains about some
> redundant CLACs in entry_64.S because it doesn't understand interrupts
> and I've not bothered creating an annotation for them yet.
>
> arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x4d: redundant CLAC
> arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x5a: redundant CLAC
> ...
> arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
Does objtool understand altinstr? If it understands that this is an
altinstr associated with a not-actually-a-fuction (i.e. END not
ENDPROC) piece of code, it could suppress this warning.
>
> -static unsigned long getreg(struct task_struct *task, unsigned long offset)
> +static notrace unsigned long getreg(struct task_struct *task, unsigned long offset)
> {
> switch (offset) {
> case offsetof(struct user_regs_struct, cs):
> @@ -444,6 +444,7 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
>
> return *pt_regs_access(task_pt_regs(task), offset);
> }
> +AC_SAFE(getreg);
This takes the address and could plausibly suppress some optimizations.
>
> +#if defined(CONFIG_STACK_VALIDATION) && !defined(BUILD_VDSO) && !defined(BUILD_VDSO32)
> +/*
> + * This macro marks functions as AC-safe, that is, it is safe to call from an
> + * EFLAGS.AC enabled region (typically user_access_begin() /
> + * user_access_end()).
> + *
> + * These functions in turn will only call AC-safe functions themselves (which
> + * precludes tracing, including __fentry__ and scheduling, including
> + * preempt_enable).
> + *
> + * AC-safe functions will obviously also not change EFLAGS.AC themselves.
> + *
> > + * Since STAC/CLAC are CPL-0 only, this is all irrelevant for VDSO
> + * (and the generated symbol reference will in fact cause link failures).
> + */
> +#define AC_SAFE(func) \
> + static void __used __section(.discard.ac_safe) \
> + *__func_ac_safe_##func = func
Are we okay with annotating function *declarations* in a way that
their addresses get taken whenever the declaration is processed? It
would be nice if we could avoid this.
I'm wondering if we can just change the code that does getreg() and
load_gs_index() so it doesn't do it with AC set. Also, what about
paravirt kernels? They'll call into PV code for load_gs_index() with
AC set.
--Andy
On Fri, Feb 22, 2019 at 03:39:48PM -0800, [email protected] wrote:
> Objtool could also detect CLAC-STAC or STAC-CLAC sequences without
> memory operations and remove them; don't know how often that happens,
> but I know it *does* happen.
Objtool doesn't know about memops; that'd be a lot of work. Also,
objtool doesn't actually rewrite the text, at best it could warn about
such occurrences.
On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
> On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <[email protected]> wrote:
> > arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
>
> Does objtool understand altinstr?
Yep, otherwise it would've never found any of the STAC/CLAC instructions
to begin with, they're all alternatives. The emitted code is all NOPs.
> If it understands that this is an
> altinstr associated with a not-actually-a-fuction (i.e. END not
> ENDPROC) piece of code, it could suppress this warning.
Using readelf on entry_64.o the symbol type appears to be STT_NOTYPE for
these 'functions', so yes, I can try and ignore this warning for those.
> > +#define AC_SAFE(func) \
> > + static void __used __section(.discard.ac_safe) \
> > + *__func_ac_safe_##func = func
>
> Are we okay with annotating function *declarations* in a way that
> their addresses get taken whenever the declaration is processed? It
> would be nice if we could avoid this.
>
> I'm wondering if we can just change the code that does getreg() and
> load_gs_index() so it doesn't do it with AC set. Also, what about
> paravirt kernels? They'll call into PV code for load_gs_index() with
> AC set.
Fixing that code would be fine; but we need that annotation regardless,
read Linus' email a little further back, he wants things like
copy_{to,from}_user_unsafe().
Those really would need to be marked AC-safe, there's no inlining that.
That said, yes these annotations _suck_. But it's what we had and works,
I'm just copying it (from STACK_FRAME_NON_STANDARD).
That said; maybe we can do something like:
#define AC_SAFE(func) \
asm (".pushsection .discard.ac_safe_sym\n\t" \
"999: .ascii \"" #func "\"\n\t" \
".popsection\n\t" \
".pushsection .discard.ac_safe\n\t" \
_ASM_PTR " 999b\n\t" \
".popsection")
I just don't have a clue on how to read that in objtool; weak elf foo.
I'll see if I can make it work.
On Fri, Feb 22, 2019 at 03:34:00PM -0800, Linus Torvalds wrote:
> Can you make it do DF too while at it?
Sure.
On Sat, Feb 23, 2019 at 09:37:06AM +0100, Peter Zijlstra wrote:
> On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
> > On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <[email protected]> wrote:
>
> > > arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
> >
> > Does objtool understand altinstr?
>
> Yep, otherwise it would've never found any of the STAC/CLAC instructions
> to begin with, they're all alternatives. The emitted code is all NOPs.
>
> > If it understands that this is an
> > altinstr associated with a not-actually-a-fuction (i.e. END not
> > ENDPROC) piece of code, it could suppress this warning.
>
> Using readelf on entry_64.o the symbol type appears to be STT_NOTYPE for
> these 'functions', so yes, I can try and ignore this warning for those.
>
> > > +#define AC_SAFE(func) \
> > > + static void __used __section(.discard.ac_safe) \
> > > + *__func_ac_safe_##func = func
> >
> > Are we okay with annotating function *declarations* in a way that
> > their addresses get taken whenever the declaration is processed? It
> > would be nice if we could avoid this.
> >
> > I'm wondering if we can just change the code that does getreg() and
> > load_gs_index() so it doesn't do it with AC set. Also, what about
> > paravirt kernels? They'll call into PV code for load_gs_index() with
> > AC set.
>
> Fixing that code would be fine; but we need that annotation regardless,
> read Linus' email a little further back, he wants things like
> copy_{to,from}_user_unsafe().
>
> Those really would need to be marked AC-safe, there's no inlining that.
>
> That said, yes these annotations _suck_. But it's what we had and works,
> I'm just copying it (from STACK_FRAME_NON_STANDARD).
>
> That said; maybe we can do something like:
>
> #define AC_SAFE(func) \
> asm (".pushsection .discard.ac_safe_sym\n\t" \
> "999: .ascii \"" #func "\"\n\t" \
> ".popsection\n\t" \
> ".pushsection .discard.ac_safe\n\t" \
> _ASM_PTR " 999b\n\t" \
> ".popsection")
>
> I just don't have a clue on how to read that in objtool; weak elf foo.
> I'll see if I can make it work.
Fixed all that. Should probably also convert that NON_STANDARD stuff to
this new shiny thing.
Retained the ptrace muck because it's a nice test case, your patch is
obviously a better fix there.
Haven't bothered looking at alternatives yet, this is a
defconfig+tracing build.
Weekend now.
---
arch/x86/include/asm/special_insns.h | 2 +
arch/x86/kernel/ptrace.c | 3 +-
include/linux/frame.h | 38 ++++++++++++
tools/objtool/Makefile | 2 +-
tools/objtool/arch.h | 6 +-
tools/objtool/arch/x86/decode.c | 22 ++++++-
tools/objtool/check.c | 117 ++++++++++++++++++++++++++++++++++-
tools/objtool/check.h | 2 +-
tools/objtool/elf.h | 1 +
9 files changed, 187 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 43c029cdc3fe..cd31e4433f4c 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -5,6 +5,7 @@
#ifdef __KERNEL__
+#include <linux/frame.h>
#include <asm/nops.h>
/*
@@ -135,6 +136,7 @@ static inline void native_wbinvd(void)
}
extern asmlinkage void native_load_gs_index(unsigned);
+AC_SAFE(native_load_gs_index);
static inline unsigned long __read_cr4(void)
{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 4b8ee05dd6ad..e278b4055a8b 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -420,7 +420,7 @@ static int putreg(struct task_struct *child,
return 0;
}
-static unsigned long getreg(struct task_struct *task, unsigned long offset)
+static notrace unsigned long getreg(struct task_struct *task, unsigned long offset)
{
switch (offset) {
case offsetof(struct user_regs_struct, cs):
@@ -444,6 +444,7 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
return *pt_regs_access(task_pt_regs(task), offset);
}
+AC_SAFE(getreg);
static int genregs_get(struct task_struct *target,
const struct user_regset *regset,
diff --git a/include/linux/frame.h b/include/linux/frame.h
index 02d3ca2d9598..870a894bc586 100644
--- a/include/linux/frame.h
+++ b/include/linux/frame.h
@@ -21,4 +21,42 @@
#endif /* CONFIG_STACK_VALIDATION */
+#if defined(CONFIG_STACK_VALIDATION)
+/*
+ * This macro marks functions as AC-safe, that is, it is safe to call from an
+ * EFLAGS.AC enabled region (typically user_access_begin() /
+ * user_access_end()).
+ *
+ * These functions in turn will only call AC-safe functions themselves (which
+ * precludes tracing, including __fentry__ and scheduling, including
+ * preempt_enable).
+ *
+ * AC-safe functions will obviously also not change EFLAGS.AC themselves.
+ */
+#define AC_SAFE(func) \
+ asm (".pushsection .discard.ac_safe_sym, \"S\", @3\n\t" \
+ "999: .ascii \"" #func "\"\n\t" \
+ " .byte 0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection .discard.ac_safe\n\t" \
+ ".long 999b - .\n\t" \
+ ".popsection")
+
+/*
+ * SHT_STRTAB sayeth:
+ *
+ * - The initial byte of a string table is \0. This allows a string offset
+ * value of zero to denote the NULL string.
+ *
+ * Works just fine without this, but since we set the proper section type, lets
+ * not confuse anybody reading this.
+ */
+asm(".pushsection .discard.ac_safe_sym, \"S\", @3\n\t"
+ ".byte 0\n\t"
+ ".popsection\n\t");
+
+#else
+#define AC_SAFE(func)
+#endif
+
#endif /* _LINUX_FRAME_H */
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index c9d038f91af6..5a64e5fb9d7a 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -31,7 +31,7 @@ INCLUDES := -I$(srctree)/tools/include \
-I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \
-I$(srctree)/tools/objtool/arch/$(ARCH)/include
WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed
-CFLAGS += -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES)
+CFLAGS += -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) -ggdb3
LDFLAGS += -lelf $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS)
# Allow old libelf to be used:
diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index b0d7dc3d71b5..9795d83cbc01 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -33,7 +33,11 @@
#define INSN_STACK 8
#define INSN_BUG 9
#define INSN_NOP 10
-#define INSN_OTHER 11
+#define INSN_STAC 11
+#define INSN_CLAC 12
+#define INSN_STD 13
+#define INSN_CLD 14
+#define INSN_OTHER 15
#define INSN_LAST INSN_OTHER
enum op_dest_type {
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 540a209b78ab..bee32ad53ecf 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -369,7 +369,19 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
case 0x0f:
- if (op2 >= 0x80 && op2 <= 0x8f) {
+ if (op2 == 0x01) {
+
+ if (modrm == 0xca) {
+
+ *type = INSN_CLAC;
+
+ } else if (modrm == 0xcb) {
+
+ *type = INSN_STAC;
+
+ }
+
+ } else if (op2 >= 0x80 && op2 <= 0x8f) {
*type = INSN_JUMP_CONDITIONAL;
@@ -444,6 +456,14 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
*type = INSN_CALL;
break;
+ case 0xfc:
+ *type = INSN_CLD;
+ break;
+
+ case 0xfd:
+ *type = INSN_STD;
+ break;
+
case 0xff:
if (modrm_reg == 2 || modrm_reg == 3)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0414a0d52262..3dedfa19cb09 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -451,6 +451,41 @@ static void add_ignores(struct objtool_file *file)
}
}
+static int add_ac_safe(struct objtool_file *file)
+{
+ struct section *sec_sym, *sec;
+ struct symbol *func;
+ struct rela *rela;
+ const char *name;
+
+ sec = find_section_by_name(file->elf, ".rela.discard.ac_safe");
+ if (!sec)
+ return 0;
+
+ sec_sym = find_section_by_name(file->elf, ".discard.ac_safe_sym");
+ if (!sec_sym) {
+ WARN("missing AC-safe symbols");
+ return -1;
+ }
+
+ list_for_each_entry(rela, &sec->rela_list, list) {
+ if (rela->sym->type != STT_SECTION) {
+ WARN("unexpected relocation symbol type in %s", sec->name);
+ return -1;
+ }
+
+ name = elf_strptr(file->elf->elf, sec_sym->idx, rela->addend);
+
+ func = find_symbol_by_name(file->elf, name);
+ if (!func)
+ continue;
+
+ func->ac_safe = true;
+ }
+
+ return 0;
+}
+
/*
* FIXME: For now, just ignore any alternatives which add retpolines. This is
* a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline.
@@ -695,6 +730,7 @@ static int handle_group_alt(struct objtool_file *file,
last_new_insn = insn;
insn->ignore = orig_insn->ignore_alts;
+ insn->func = orig_insn->func;
if (insn->type != INSN_JUMP_CONDITIONAL &&
insn->type != INSN_JUMP_UNCONDITIONAL)
@@ -1239,6 +1275,10 @@ static int decode_sections(struct objtool_file *file)
add_ignores(file);
+ ret = add_ac_safe(file);
+ if (ret)
+ return ret;
+
ret = add_nospec_ignores(file);
if (ret)
return ret;
@@ -1902,6 +1942,15 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
switch (insn->type) {
case INSN_RETURN:
+ if (state.ac) {
+ WARN_FUNC("return with AC set", sec, insn->offset);
+ return 1;
+ }
+ if (state.df) {
+ WARN_FUNC("return with DF set", sec, insn->offset);
+ return 1;
+ }
+
if (func && has_modified_stack_frame(&state)) {
WARN_FUNC("return with modified stack frame",
sec, insn->offset);
@@ -1917,6 +1966,17 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
return 0;
case INSN_CALL:
+ if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
+ WARN_FUNC("call to %s() with AC set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
+ if (state.df) {
+ WARN_FUNC("call to %s() with DF set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
+
if (is_fentry_call(insn))
break;
@@ -1926,8 +1986,25 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
if (ret == -1)
return 1;
- /* fallthrough */
+ if (!no_fp && func && !has_valid_stack_frame(&state)) {
+ WARN_FUNC("call without frame pointer save/setup",
+ sec, insn->offset);
+ return 1;
+ }
+ break;
+
case INSN_CALL_DYNAMIC:
+ if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
+ WARN_FUNC("call to %s() with AC set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
+ if (state.df) {
+ WARN_FUNC("call to %s() with DF set", sec, insn->offset,
+ insn->call_dest->name);
+ return 1;
+ }
+
if (!no_fp && func && !has_valid_stack_frame(&state)) {
WARN_FUNC("call without frame pointer save/setup",
sec, insn->offset);
@@ -1980,6 +2057,42 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
break;
+ case INSN_STAC:
+ if (state.ac_safe || state.ac) {
+ WARN_FUNC("recursive STAC", sec, insn->offset);
+ return 1;
+ }
+ state.ac = true;
+ break;
+
+ case INSN_CLAC:
+ if (!state.ac && insn->func) {
+ WARN_FUNC("redundant CLAC", sec, insn->offset);
+ return 1;
+ }
+ if (state.ac_safe) {
+ WARN_FUNC("AC-safe clears AC", sec, insn->offset);
+ return 1;
+ }
+ state.ac = false;
+ break;
+
+ case INSN_STD:
+ if (state.df) {
+ WARN_FUNC("recursive STD", sec, insn->offset);
+ return 1;
+ }
+ state.df = true;
+ break;
+
+ case INSN_CLD:
+ if (!state.df && insn->func) {
+ WARN_FUNC("redundant CLD", sec, insn->offset);
+ return 1;
+ }
+ state.df = false;
+ break;
+
default:
break;
}
@@ -2141,6 +2254,8 @@ static int validate_functions(struct objtool_file *file)
if (!insn || insn->ignore)
continue;
+ state.ac_safe = func->ac_safe;
+
ret = validate_branch(file, insn, state);
warnings += ret;
}
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index e6e8a655b556..602634792151 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -31,7 +31,7 @@ struct insn_state {
int stack_size;
unsigned char type;
bool bp_scratch;
- bool drap, end;
+ bool drap, end, ac, ac_safe, df;
int drap_reg, drap_offset;
struct cfi_reg vals[CFI_NUM_REGS];
};
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index bc97ed86b9cd..064c3df31e40 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -62,6 +62,7 @@ struct symbol {
unsigned long offset;
unsigned int len;
struct symbol *pfunc, *cfunc;
+ bool ac_safe;
};
struct rela {
On 22/02/2019 22:26, Peter Zijlstra wrote:
> On Fri, Feb 22, 2019 at 07:10:34PM +0100, Thomas Gleixner wrote:
>
>> But correct :)
>
>> I agree, that a function which is doing the actual copy should be callable,
>> but random other functions? NO!
>
> So find the below patch -- which spotted fail in ptrace.c
>
> It has an AC_SAFE(func) annotation which allows marking specific
> functions as safe to call. The patch includes 2 instances which were
> required to make arch/x86 'build':
>
> arch/x86/ia32/ia32_signal.o: warning: objtool: ia32_restore_sigcontext()+0x3d: call to native_load_gs_index() with AC set
> arch/x86/kernel/ptrace.o: warning: objtool: genregs_get()+0x8e: call to getreg() with AC set
>
> It also screams (provided one has CONFIG_FUNCTION_TRACER=y) about the
> lack of notrace annotations on functions marked AC_SAFE():
>
> arch/x86/kernel/ptrace.o: warning: objtool: getreg()+0x0: call to __fentry__() with AC set
>
> It builds arch/x86 relatively clean; it only complains about some
> redundant CLACs in entry_64.S because it doesn't understand interrupts
> and I've not bothered creating an annotation for them yet.
>
> arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x4d: redundant CLAC
> arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x5a: redundant CLAC
> ...
> arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
>
> Also, I realized we don't need special annotations for preempt_count;
> preempt_disable() emits a CALL instruction which should readily trigger
> the warnings added here.
>
> The VDSO thing is a bit of a hack, but I couldn't quickly find anything
> better.
>
> Comments?
I haven't looked at all the details. But could the annotation be called
UACCESS_SAFE() (and corresponding naming in the objtool checks)? Since
this is not an x86-only issue and the AC flag only exists on x86.
Cheers,
Julien
--
Julien Thierry
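For illustration, here is a rough, non-authoritative sketch of the kind of code these checks act on; the helper names and exact accessor calls are made up for this example (assuming the usual uaccess/sched headers), not taken from the patch:

	/* Hypothetical helper, marked as callable with EFLAGS.AC set. */
	static notrace void ac_safe_helper(void)
	{
		/* may only use unsafe accessors and other AC-safe functions */
	}
	AC_SAFE(ac_safe_helper);

	static int put_flags_to_user(unsigned int __user *uptr, unsigned int flags)
	{
		if (!user_access_begin(uptr, sizeof(*uptr)))	/* STAC */
			return -EFAULT;

		ac_safe_helper();	/* annotated: accepted by objtool */
		schedule();		/* not annotated: "call to schedule() with AC set" */

		unsafe_put_user(flags, uptr, efault);
		user_access_end();				/* CLAC */
		return 0;

	efault:
		user_access_end();
		return -EFAULT;
	}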
On February 23, 2019 12:39:42 AM PST, Peter Zijlstra <[email protected]> wrote:
>On Fri, Feb 22, 2019 at 03:39:48PM -0800, [email protected] wrote:
>> Objtool could also detect CLAC-STAC or STAC-CLAC sequences without
>> memory operations and remove them; don't know how often that happens,
>> but I know it *does* happen.
>
>Objtool doesn't know about memops; that'd be a lot of work. Also,
>objtool doesn't actually rewrite the text, at best it could warn about
>such occurrences.
It doesn't have to understand the contents of the memop, but it seems that the presence of a modrm with mode ≠ 3 should be plenty. It needs to know that much in order to know the length of instructions anyway. For extra credit, ignore LEA or hinting instructions.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
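For what it's worth, a minimal sketch of that heuristic in the style of the kernel's instruction decoder; the struct insn field usage and the LEA/prefetch exclusions below are assumptions for illustration, not a tested implementation:

	/*
	 * Treat any instruction carrying a ModRM byte with mod != 3 as a
	 * memory operation; skip LEA (address formation only) and the
	 * prefetch-hint group (no architectural access).
	 */
	static bool insn_is_memop(struct insn *insn)
	{
		if (!insn->modrm.nbytes || X86_MODRM_MOD(insn->modrm.value) == 3)
			return false;
		if (insn->opcode.bytes[0] == 0x8d)		/* LEA */
			return false;
		if (insn->opcode.bytes[0] == 0x0f &&
		    insn->opcode.bytes[1] == 0x18)		/* PREFETCHx hints */
			return false;
		return true;
	}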
On February 23, 2019 12:39:42 AM PST, Peter Zijlstra <[email protected]> wrote:
>On Fri, Feb 22, 2019 at 03:39:48PM -0800, [email protected] wrote:
>> Objtool could also detect CLAC-STAC or STAC-CLAC sequences without
>> memory operations and remove them; don't know how often that happens,
>> but I know it *does* happen.
>
>Objtool doesn't know about memops; that'd be a lot of work. Also,
>objtool doesn't actually rewrite the text, at best it could warn about
>such occurrences.
It doesn't even have to change the text per se, just nullify the altinstr.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
> I'm wondering if we can just change the code that does getreg() and
> load_gs_index() so it doesn't do it with AC set. Also, what about
> paravirt kernels? They'll call into PV code for load_gs_index() with
> AC set.
Paravirt can go bugger off. There's no sane way to fix that.
Luckily the load_gs_index() thing is part of the paravirt me harder crap
and so nobody sane should ever hit that.
I don't fully understand that code at all; I also have no clue why GS
has paravirt bits on but the other segments do not. But it looks like we
want to do that RELOAD_SEG() crap under SMAP because of the GET_SEG() ->
get_user_ex() thing.
Anyway, I only see 3 options here:
1) live with the paravirt me harder builds complaining
2) exclude the AC validation from the paravirt me harder builds
3) rewrite this code to not need that stupid call in the first place
2 seems like an exceptionally bad idea, 3 would need someone that
understands this, so for now I'll pick 1 :-)
*thought*... we could delay the actual set_user_seg() thing until after
the get_user_catch(), would that work?
On Mon, Feb 25, 2019 at 11:51:44AM +0100, Peter Zijlstra wrote:
> On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
> > I'm wondering if we can just change the code that does getreg() and
> > load_gs_index() so it doesn't do it with AC set. Also, what about
> > paravirt kernels? They'll call into PV code for load_gs_index() with
> > AC set.
>
> Paravirt can go bugger off. There's no sane way to fix that.
> I don't fully understand that code at all; I also have no clue why GS
> has paravirt bits on but the other segments do not.
*sigh* SWAPGS
> *thought*... we could delay the actual set_user_seg() thing until after
> the get_user_catch(), would that work?
How horrible / broken is this?
---
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 321fe5f5d0e9..67c866943102 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -60,17 +60,21 @@
regs->seg = GET_SEG(seg) | 3; \
} while (0)
-#define RELOAD_SEG(seg) { \
- unsigned int pre = GET_SEG(seg); \
- unsigned int cur = get_user_seg(seg); \
- pre |= 3; \
- if (pre != cur) \
- set_user_seg(seg, pre); \
+#define LOAD_SEG(seg) { \
+ pre_##seg = 3 | GET_SEG(seg); \
+ cur_##seg = get_user_seg(seg); \
+}
+
+#define RELOAD_SEG(seg) { \
+ if (pre_##seg != cur_##seg) \
+ set_user_seg(seg, pre_##seg); \
}
static int ia32_restore_sigcontext(struct pt_regs *regs,
struct sigcontext_32 __user *sc)
{
+ u16 pre_gs, pre_fs, pre_ds, pre_es;
+ u16 cur_gs, cur_fs, cur_ds, cur_es;
unsigned int tmpflags, err = 0;
void __user *buf;
u32 tmp;
@@ -85,10 +89,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
* the handler, but does not clobber them at least in the
* normal case.
*/
- RELOAD_SEG(gs);
- RELOAD_SEG(fs);
- RELOAD_SEG(ds);
- RELOAD_SEG(es);
+ LOAD_SEG(gs);
+ LOAD_SEG(fs);
+ LOAD_SEG(ds);
+ LOAD_SEG(es);
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip); COPY(ax);
@@ -106,6 +110,11 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
buf = compat_ptr(tmp);
} get_user_catch(err);
+ RELOAD_SEG(gs);
+ RELOAD_SEG(fs);
+ RELOAD_SEG(ds);
+ RELOAD_SEG(es);
+
err |= fpu__restore_sig(buf, 1);
force_iret();
On Mon, Feb 25, 2019 at 08:33:35AM +0000, Julien Thierry wrote:
> > It has an AC_SAFE(func) annotation which allows marking specific
> > functions as safe to call. The patch includes 2 instances which were
> > required to make arch/x86 'build':
> I haven't looked at all the details. But could the annotation be called
> UACCESS_SAFE() (and corresponding naming in the objtool checks)? Since
> this is not an x86-only issue and the AC flag only exists for x86.
Sure that works. Lemme sed the patches.
On Mon, Feb 25, 2019 at 12:47:00AM -0800, [email protected] wrote:
> It doesn't have to understand the contents of the memop, but it seems
> that the presence of a modrm with mode ≠ 3 should be plenty. It needs
> to know that much in order to know the length of instructions anyway.
> For extra credit, ignore LEA or hinting instructions.
A little something like so then?
arch/x86/kernel/fpu/signal.o: warning: objtool: .altinstr_replacement+0x9c: UACCESS disable without MEMOPs: copy_fpstate_to_sigframe()
00000000023c 000200000002 R_X86_64_PC32 0000000000000000 .text + 604
000000000240 000700000002 R_X86_64_PC32 0000000000000000 .altinstr_replacement + 99
000000000249 000200000002 R_X86_64_PC32 0000000000000000 .text + 610
00000000024d 000700000002 R_X86_64_PC32 0000000000000000 .altinstr_replacement + 9c
.text
604: 90 nop
605: 90 nop
606: 90 nop
607: 83 ce 03 or $0x3,%esi
60a: 89 b3 00 02 00 00 mov %esi,0x200(%rbx)
610: 90 nop
611: 90 nop
612: 90 nop
.altinstr_replacement
99: 0f 01 cb stac
9c: 0f 01 ca clac
Which looks like the tool failed to recognise that MOV as a memop.
---
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -37,7 +37,8 @@
#define INSN_CLAC 12
#define INSN_STD 13
#define INSN_CLD 14
-#define INSN_OTHER 15
+#define INSN_MEMOP 15
+#define INSN_OTHER 16
#define INSN_LAST INSN_OTHER
enum op_dest_type {
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -123,6 +123,9 @@ int arch_decode_instruction(struct elf *
modrm_mod = X86_MODRM_MOD(modrm);
modrm_reg = X86_MODRM_REG(modrm);
modrm_rm = X86_MODRM_RM(modrm);
+
+ if (modrm_mod != 3)
+ *type = INSN_MEMOP;
}
if (insn.sib.nbytes)
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2047,6 +2047,7 @@ static int validate_branch(struct objtoo
WARN_FUNC("recursive UACCESS enable", sec, insn->offset);
state.uaccess = true;
+ state.memop = false;
break;
case INSN_CLAC:
@@ -2058,6 +2059,9 @@ static int validate_branch(struct objtoo
break;
}
+ if (!state.memop && insn->func)
+ WARN_FUNC("UACCESS disable without MEMOPs: %s()", sec, insn->offset, insn->func->name);
+
state.uaccess = false;
break;
@@ -2075,6 +2079,10 @@ static int validate_branch(struct objtoo
state.df = false;
break;
+ case INSN_MEMOP:
+ state.memop = true;
+ break;
+
default:
break;
}
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -31,7 +31,7 @@ struct insn_state {
int stack_size;
unsigned char type;
bool bp_scratch;
- bool drap, end, uaccess, uaccess_safe, df;
+ bool drap, end, uaccess, uaccess_safe, df, memop;
int drap_reg, drap_offset;
struct cfi_reg vals[CFI_NUM_REGS];
};
> On Feb 25, 2019, at 3:53 AM, Peter Zijlstra <[email protected]> wrote:
>
>> On Mon, Feb 25, 2019 at 11:51:44AM +0100, Peter Zijlstra wrote:
>>> On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
>>> I'm wondering if we can just change the code that does getreg() and
>>> load_gs_index() so it doesn't do it with AC set. Also, what about
>>> paravirt kernels? They'll call into PV code for load_gs_index() with
>>> AC set.
>>
>> Paravirt can go bugger off. There's no sane way to fix that.
>
>> I don't fully understand that code at all; I also have no clue why GS
>> has paravirt bits on but the other segments do not.
>
> *sigh* SWAPGS
>
>> *thought*... we could delay the actual set_user_seg() thing until after
>> the get_user_catch(), would that work?
>
>
> How horrible / broken is this?
>
> ---
>
> diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
> index 321fe5f5d0e9..67c866943102 100644
> --- a/arch/x86/ia32/ia32_signal.c
> +++ b/arch/x86/ia32/ia32_signal.c
> @@ -60,17 +60,21 @@
> regs->seg = GET_SEG(seg) | 3; \
> } while (0)
>
> -#define RELOAD_SEG(seg) { \
> - unsigned int pre = GET_SEG(seg); \
> - unsigned int cur = get_user_seg(seg); \
> - pre |= 3; \
> - if (pre != cur) \
> - set_user_seg(seg, pre); \
> +#define LOAD_SEG(seg) { \
> + pre_##seg = 3 | GET_SEG(seg); \
> + cur_##seg = get_user_seg(seg); \
> +}
> +
> +#define RELOAD_SEG(seg) { \
> + if (pre_##seg != cur_##seg) \
> + set_user_seg(seg, pre_##seg); \
> }
>
> static int ia32_restore_sigcontext(struct pt_regs *regs,
> struct sigcontext_32 __user *sc)
> {
> + u16 pre_gs, pre_fs, pre_ds, pre_es;
> + u16 cur_gs, cur_fs, cur_ds, cur_es;
> unsigned int tmpflags, err = 0;
> void __user *buf;
> u32 tmp;
> @@ -85,10 +89,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
> * the handler, but does not clobber them at least in the
> * normal case.
> */
> - RELOAD_SEG(gs);
> - RELOAD_SEG(fs);
> - RELOAD_SEG(ds);
> - RELOAD_SEG(es);
> + LOAD_SEG(gs);
> + LOAD_SEG(fs);
> + LOAD_SEG(ds);
> + LOAD_SEG(es);
>
> COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
> COPY(dx); COPY(cx); COPY(ip); COPY(ax);
> @@ -106,6 +110,11 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
> buf = compat_ptr(tmp);
> } get_user_catch(err);
>
> + RELOAD_SEG(gs);
> + RELOAD_SEG(fs);
> + RELOAD_SEG(ds);
> + RELOAD_SEG(es);
> +
> err |= fpu__restore_sig(buf, 1);
>
> force_iret();
I would call this pretty horrible. How about we do it without macros? :)
But yes, deferring the segment load until after the read seems fine to me. Frankly, we could also just copy_from_user the whole thing up front; this code is not really a serious fast path.
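A rough sketch of that alternative, assuming one bulk copy_from_user() of the whole sigcontext before any register or segment state is touched; the structure below is simplified for illustration and is not a drop-in replacement:

	static int ia32_restore_sigcontext(struct pt_regs *regs,
					   struct sigcontext_32 __user *usc)
	{
		struct sigcontext_32 sc;

		/* Single bulk copy; no STAC/CLAC region spanning other calls. */
		if (copy_from_user(&sc, usc, sizeof(sc)))
			return -EFAULT;

		/* All further work uses the kernel-side copy. */
		regs->di = sc.di;
		regs->si = sc.si;
		/* ... remaining registers, segments and FP state as before ... */

		return 0;
	}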
On Mon, Feb 25, 2019 at 02:21:03PM +0100, Peter Zijlstra wrote:
> On Mon, Feb 25, 2019 at 12:47:00AM -0800, [email protected] wrote:
> > It doesn't have to understand the contents of the memop, but it seems
> > that the presence of a modrm with mode ≠ 3 should be plenty. It needs
> > to know that much in order to know the length of instructions anyway.
> > For extra credit, ignore LEA or hinting instructions.
>
> A little something like so then?
$ ./objtool check --no-fp --backtrace ../../defconfig-build/arch/x86/lib/usercopy_64.o
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: .altinstr_replacement+0x3: UACCESS disable without MEMOPs: __clear_user()
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x3a: (alt)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x2e: (branch)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x18: (branch)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: .altinstr_replacement+0xffffffffffffffff: (branch)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x5: (alt)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x0: <=== (func)
0000000000000000 <__clear_user>:
0: e8 00 00 00 00 callq 5 <__clear_user+0x5>
1: R_X86_64_PLT32 __fentry__-0x4
5: 90 nop
6: 90 nop
7: 90 nop
8: 48 89 f0 mov %rsi,%rax
b: 48 c1 ee 03 shr $0x3,%rsi
f: 83 e0 07 and $0x7,%eax
12: 48 89 f1 mov %rsi,%rcx
15: 48 85 c9 test %rcx,%rcx
18: 74 0f je 29 <__clear_user+0x29>
1a: 48 c7 07 00 00 00 00 movq $0x0,(%rdi)
21: 48 83 c7 08 add $0x8,%rdi
25: ff c9 dec %ecx
27: 75 f1 jne 1a <__clear_user+0x1a>
29: 48 89 c1 mov %rax,%rcx
2c: 85 c9 test %ecx,%ecx
2e: 74 0a je 3a <__clear_user+0x3a>
30: c6 07 00 movb $0x0,(%rdi)
33: 48 ff c7 inc %rdi
36: ff c9 dec %ecx
38: 75 f6 jne 30 <__clear_user+0x30>
3a: 90 nop
3b: 90 nop
3c: 90 nop
3d: 48 89 c8 mov %rcx,%rax
40: c3 retq
Seems correct. Not sure you want to go fix that though. Let me know if
you want more output.