LinuxLists.cc - Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

2008-08-06 18:02:24

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Hello Herbert,

I think I finally found the problem.

Here a short description again: all our routers with a via C3 using padlock for AES-encryption are
crashing with 2.6.26 while they work fine with 2.6.25. Not using padlock
(i.e. using the i386 assembler version of AES) they just work fine.

After a kernel bisection I found that
fa5c4639419668cbb18ca3d20c1253559a3b43ae works fine,
2adee9b30d1382fba97825b9c50e4f50a0117c36 crashes.

I did not try to bisect the last 4 commits as they seem to be related.

Instead I took a 2.6.26 and reverted the following commits:

e8a496ac8cd00cabbdaa373db4818a9ad19a1c5a
fd3c3ed5d1e3ceb37635cbe6d220ab94aae0781d
2adee9b30d1382fba97825b9c50e4f50a0117c36
1679f2710ac58df580d3716fab1f42ae50a226eb
aa283f49276e7d840a40fb01eee6de97eaa7e012
61c4628b538608c1a85211ed8438136adfeb9a95

and changed
arch/x86/kernel/process.c
by hand.

With such a modified 2.6.26 the problem disappears.

Am Mittwoch, 6. August 2008 12:33 schrieben Sie:
> On Wed, Jul 30, 2008 at 02:11:01PM +0200, Wolfgang Walter wrote:
> > BUG: unable to handle kernel NULL pointer dereference at 000001f0
> > IP: [<c010280c>] __switch_to+0x23/0x103
> > *pde = 00000000
> > Oops: 0002 [#1] PREEMPT
> > Modules linked in:
> >
> > Pid: 1396, comm: dmesg Not tainted (2.6.26 #1)
> > EIP: 0060:[<c010280c>] EFLAGS: 00010002 CPU: 0
> > EIP is at __switch_to+0x23/0x103
> > EAX: 00000000 EBX: de87cdc0 ECX: 000021d9 EDX: de87cdc0
> > ESI: dc4e3810 EDI: de87cfe8 EBP: dc4e3a38 ESP: cfdb5b30
> > DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
> > Process dmesg (pid: 1396, ti=cfdb4000 task=dc4e3810 task.ti=cfda4000)
> > Stack: de87cdc0 00000000 cfdbd300 dc4e3810 c039bf79 de87cdc0 00000086
> > d13da410 c0119aca de87cf2c 00000046 d13da410 00000000 7fffffff 00000080
> > cfdb5f9c c039c369 c02381d0 c0234f09 cfdb5be4 00000104 00000000 cf3bb480
> > 00000000 Call Trace:
>

Here is the difference between 2.6.26 and the modified version:

=============================================
diff -ru kernels/linux-2.6.26/arch/x86/kernel/i387.c tests/linux-2.6ww/arch/x86/kernel/i387.c
--- kernels/linux-2.6.26/arch/x86/kernel/i387.c 2008-07-15 11:29:31.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/kernel/i387.c 2008-08-06 18:40:54.000000000 +0200
@@ -35,18 +35,17 @@
#endif

static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
-unsigned int xstate_size;
-static struct i387_fxsave_struct fx_scratch __cpuinitdata;

-void __cpuinit mxcsr_feature_mask_init(void)
+void mxcsr_feature_mask_init(void)
{
unsigned long mask = 0;

clts();
if (cpu_has_fxsr) {
- memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
- asm volatile("fxsave %0" : : "m" (fx_scratch));
- mask = fx_scratch.mxcsr_mask;
+ memset(&current->thread.i387.fxsave, 0,
+ sizeof(struct i387_fxsave_struct));
+ asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
+ mask = current->thread.i387.fxsave.mxcsr_mask;
if (mask == 0)
mask = 0x0000ffbf;
}
@@ -54,21 +53,6 @@
stts();
}

-void __init init_thread_xstate(void)
-{
- if (!HAVE_HWFP) {
- xstate_size = sizeof(struct i387_soft_struct);
- return;
- }
-
- if (cpu_has_fxsr)
- xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
- else
- xstate_size = sizeof(struct i387_fsave_struct);
-#endif
-}
-
#ifdef CONFIG_X86_64
/*
* Called at bootup to set up the initial FPU state that is later cloned
@@ -77,6 +61,10 @@
void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
+ extern void __bad_fxsave_alignment(void);
+
+ if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
+ __bad_fxsave_alignment();

set_in_cr4(X86_CR4_OSFXSR);
set_in_cr4(X86_CR4_OSXMMEXCPT);
@@ -96,53 +84,32 @@
* value at reset if we support XMM instructions and then
* remeber the current task has used the FPU.
*/
-int init_fpu(struct task_struct *tsk)
+void init_fpu(struct task_struct *tsk)
{
if (tsk_used_math(tsk)) {
- if (HAVE_HWFP && tsk == current)
+ if (tsk == current)
unlazy_fpu(tsk);
- return 0;
- }
-
- /*
- * Memory allocation at the first usage of the FPU and other state.
- */
- if (!tsk->thread.xstate) {
- tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
- GFP_KERNEL);
- if (!tsk->thread.xstate)
- return -ENOMEM;
- }
-
-#ifdef CONFIG_X86_32
- if (!HAVE_HWFP) {
- memset(tsk->thread.xstate, 0, xstate_size);
- finit();
- set_stopped_child_used_math(tsk);
- return 0;
+ return;
}
-#endif

if (cpu_has_fxsr) {
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
-
- memset(fx, 0, xstate_size);
- fx->cwd = 0x37f;
+ memset(&tsk->thread.i387.fxsave, 0,
+ sizeof(struct i387_fxsave_struct));
+ tsk->thread.i387.fxsave.cwd = 0x37f;
if (cpu_has_xmm)
- fx->mxcsr = MXCSR_DEFAULT;
+ tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT;
} else {
- struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
- memset(fp, 0, xstate_size);
- fp->cwd = 0xffff037fu;
- fp->swd = 0xffff0000u;
- fp->twd = 0xffffffffu;
- fp->fos = 0xffff0000u;
+ memset(&tsk->thread.i387.fsave, 0,
+ sizeof(struct i387_fsave_struct));
+ tsk->thread.i387.fsave.cwd = 0xffff037fu;
+ tsk->thread.i387.fsave.swd = 0xffff0000u;
+ tsk->thread.i387.fsave.twd = 0xffffffffu;
+ tsk->thread.i387.fsave.fos = 0xffff0000u;
}
/*
* Only the device not available exception or ptrace can call init_fpu.
*/
set_stopped_child_used_math(tsk);
- return 0;
}

int fpregs_active(struct task_struct *target, const struct user_regset *regset)
@@ -159,17 +126,13 @@
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- int ret;
-
if (!cpu_has_fxsr)
return -EIO;

- ret = init_fpu(target);
- if (ret)
- return ret;
+ init_fpu(target);

return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fxsave, 0, -1);
+ &target->thread.i387.fxsave, 0, -1);
}

int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -181,19 +144,16 @@
if (!cpu_has_fxsr)
return -EIO;

- ret = init_fpu(target);
- if (ret)
- return ret;
-
+ init_fpu(target);
set_stopped_child_used_math(target);

ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fxsave, 0, -1);
+ &target->thread.i387.fxsave, 0, -1);

/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
- target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+ target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;

return ret;
}
@@ -273,7 +233,7 @@
static void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
- struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+ struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -313,7 +273,7 @@
const struct user_i387_ia32_struct *env)

{
- struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+ struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -342,19 +302,15 @@
void *kbuf, void __user *ubuf)
{
struct user_i387_ia32_struct env;
- int ret;
-
- ret = init_fpu(target);
- if (ret)
- return ret;

if (!HAVE_HWFP)
return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);

+ init_fpu(target);
+
if (!cpu_has_fxsr) {
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fsave, 0,
- -1);
+ &target->thread.i387.fsave, 0, -1);
}

if (kbuf && pos == 0 && count == sizeof(env)) {
@@ -374,18 +330,15 @@
struct user_i387_ia32_struct env;
int ret;

- ret = init_fpu(target);
- if (ret)
- return ret;
-
- set_stopped_child_used_math(target);
-
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);

+ init_fpu(target);
+ set_stopped_child_used_math(target);
+
if (!cpu_has_fxsr) {
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fsave, 0, -1);
+ &target->thread.i387.fsave, 0, -1);
}

if (pos > 0 || count < sizeof(env))
@@ -405,11 +358,11 @@
static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
- struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;

unlazy_fpu(tsk);
- fp->status = fp->swd;
- if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
+ tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
+ if (__copy_to_user(buf, &tsk->thread.i387.fsave,
+ sizeof(struct i387_fsave_struct)))
return -1;
return 1;
}
@@ -417,7 +370,6 @@
static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
struct user_i387_ia32_struct env;
int err = 0;

@@ -427,12 +379,12 @@
if (__copy_to_user(buf, &env, sizeof(env)))
return -1;

- err |= __put_user(fx->swd, &buf->status);
+ err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
if (err)
return -1;

- if (__copy_to_user(&buf->_fxsr_env[0], fx,
+ if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
sizeof(struct i387_fxsave_struct)))
return -1;
return 1;
@@ -464,7 +416,8 @@
{
struct task_struct *tsk = current;

- return __copy_from_user(&tsk->thread.xstate->fsave, buf,
+ clear_fpu(tsk);
+ return __copy_from_user(&tsk->thread.i387.fsave, buf,
sizeof(struct i387_fsave_struct));
}

@@ -474,10 +427,11 @@
struct user_i387_ia32_struct env;
int err;

- err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
+ clear_fpu(tsk);
+ err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
sizeof(struct i387_fxsave_struct));
/* mxcsr reserved bits must be masked to zero for security reasons */
- tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+ tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
if (err || __copy_from_user(&env, buf, sizeof(env)))
return 1;
convert_to_fxsr(tsk, &env);
@@ -488,16 +442,6 @@
int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
{
int err;
- struct task_struct *tsk = current;
-
- if (HAVE_HWFP)
- clear_fpu(tsk);
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }

if (HAVE_HWFP) {
if (cpu_has_fxsr)
diff -ru kernels/linux-2.6.26/arch/x86/kernel/process.c tests/linux-2.6ww/arch/x86/kernel/process.c
--- kernels/linux-2.6.26/arch/x86/kernel/process.c 2008-07-15 11:29:31.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/kernel/process.c 2008-08-06 18:43:11.000000000 +0200
@@ -7,43 +7,7 @@
#include <linux/module.h>
#include <linux/pm.h>

-struct kmem_cache *task_xstate_cachep;
-
-int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
-{
- *dst = *src;
- if (src->thread.xstate) {
- dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
- GFP_KERNEL);
- if (!dst->thread.xstate)
- return -ENOMEM;
- WARN_ON((unsigned long)dst->thread.xstate & 15);
- memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
- }
- return 0;
-}
-
-void free_thread_xstate(struct task_struct *tsk)
-{
- if (tsk->thread.xstate) {
- kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
- tsk->thread.xstate = NULL;
- }
-}
-
-void free_thread_info(struct thread_info *ti)
-{
- free_thread_xstate(ti->task);
- free_pages((unsigned long)ti, get_order(THREAD_SIZE));
-}
-
-void arch_task_cache_init(void)
-{
- task_xstate_cachep =
- kmem_cache_create("task_xstate", xstate_size,
- __alignof__(union thread_xstate),
- SLAB_PANIC, NULL);
-}
+static struct kmem_cache *task_xstate_cachep;

static void do_nothing(void *unused)
{
diff -ru kernels/linux-2.6.26/arch/x86/kernel/process_32.c tests/linux-2.6ww/arch/x86/kernel/process_32.c
--- kernels/linux-2.6.26/arch/x86/kernel/process_32.c 2008-07-15 11:29:31.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/kernel/process_32.c 2008-08-06 18:40:54.000000000 +0200
@@ -412,10 +412,6 @@
regs->cs = __USER_CS;
regs->ip = new_ip;
regs->sp = new_sp;
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

@@ -598,7 +594,7 @@

/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter > 5)
- prefetch(next->xstate);
+ prefetch(&next->i387.fxsave);

/*
* Reload esp0.
diff -ru kernels/linux-2.6.26/arch/x86/kernel/process_64.c tests/linux-2.6ww/arch/x86/kernel/process_64.c
--- kernels/linux-2.6.26/arch/x86/kernel/process_64.c 2008-07-15 11:29:31.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/kernel/process_64.c 2008-08-06 18:40:54.000000000 +0200
@@ -417,10 +417,6 @@
regs->ss = __USER_DS;
regs->flags = 0x200;
set_fs(USER_DS);
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

@@ -570,7 +566,7 @@

/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter>5)
- prefetch(next->xstate);
+ prefetch(&next->i387.fxsave);

/*
* Reload esp0, LDT and the page table pointer:
diff -ru kernels/linux-2.6.26/arch/x86/kernel/traps_32.c tests/linux-2.6ww/arch/x86/kernel/traps_32.c
--- kernels/linux-2.6.26/arch/x86/kernel/traps_32.c 2008-07-15 11:29:32.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/kernel/traps_32.c 2008-08-06 18:40:54.000000000 +0200
@@ -1149,22 +1149,9 @@
struct thread_info *thread = current_thread_info();
struct task_struct *tsk = thread->task;

- if (!tsk_used_math(tsk)) {
- local_irq_enable();
- /*
- * does a slab alloc which can sleep
- */
- if (init_fpu(tsk)) {
- /*
- * ran out of memory!
- */
- do_group_exit(SIGKILL);
- return;
- }
- local_irq_disable();
- }
-
clts(); /* Allow maths ops (or we recurse) */
+ if (!tsk_used_math(tsk))
+ init_fpu(tsk);
restore_fpu(tsk);
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
@@ -1222,6 +1209,11 @@
#endif
set_trap_gate(19, &simd_coprocessor_error);

+ /*
+ * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+ * Generate a build-time error if the alignment is wrong.
+ */
+ BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
if (cpu_has_fxsr) {
printk(KERN_INFO "Enabling fast FPU save and restore... ");
set_in_cr4(X86_CR4_OSFXSR);
@@ -1242,7 +1234,6 @@

set_bit(SYSCALL_VECTOR, used_vectors);

- init_thread_xstate();
/*
* Should be a barrier for any external CPU state:
*/
diff -ru kernels/linux-2.6.26/arch/x86/kernel/traps_64.c tests/linux-2.6ww/arch/x86/kernel/traps_64.c
--- kernels/linux-2.6.26/arch/x86/kernel/traps_64.c 2008-07-15 11:29:32.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/kernel/traps_64.c 2008-08-06 18:40:54.000000000 +0200
@@ -1124,24 +1124,11 @@
asmlinkage void math_state_restore(void)
{
struct task_struct *me = current;
-
- if (!used_math()) {
- local_irq_enable();
- /*
- * does a slab alloc which can sleep
- */
- if (init_fpu(me)) {
- /*
- * ran out of memory!
- */
- do_group_exit(SIGKILL);
- return;
- }
- local_irq_disable();
- }
-
clts(); /* Allow maths ops (or we recurse) */
- restore_fpu_checking(&me->thread.xstate->fxsave);
+
+ if (!used_math())
+ init_fpu(me);
+ restore_fpu_checking(&me->thread.i387.fxsave);
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
}
@@ -1177,10 +1164,6 @@
#endif

/*
- * initialize the per thread extended state:
- */
- init_thread_xstate();
- /*
* Should be a barrier for any external CPU state.
*/
cpu_init();
diff -ru kernels/linux-2.6.26/arch/x86/math-emu/fpu_entry.c tests/linux-2.6ww/arch/x86/math-emu/fpu_entry.c
--- kernels/linux-2.6.26/arch/x86/math-emu/fpu_entry.c 2008-07-15 11:29:32.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/math-emu/fpu_entry.c 2008-08-06 18:40:54.000000000 +0200
@@ -30,7 +30,6 @@
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/user.h>
-#include <asm/i387.h>

#include "fpu_system.h"
#include "fpu_emu.h"
@@ -147,13 +146,6 @@
unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
struct desc_struct code_descriptor;

- if (!used_math()) {
- if (init_fpu(current)) {
- do_group_exit(SIGKILL);
- return;
- }
- }
-
#ifdef RE_ENTRANT_CHECKING
if (emulating) {
printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
@@ -161,6 +153,11 @@
RE_ENTRANT_CHECK_ON;
#endif /* RE_ENTRANT_CHECKING */

+ if (!used_math()) {
+ finit();
+ set_used_math();
+ }
+
SETUP_DATA_AREA(arg);

FPU_ORIG_EIP = FPU_EIP;
@@ -681,7 +678,7 @@
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct i387_soft_struct *s387 = &target->thread.xstate->soft;
+ struct i387_soft_struct *s387 = &target->thread.i387.soft;
void *space = s387->st_space;
int ret;
int offset, other, i, tags, regnr, tag, newtop;
@@ -733,7 +730,7 @@
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- struct i387_soft_struct *s387 = &target->thread.xstate->soft;
+ struct i387_soft_struct *s387 = &target->thread.i387.soft;
const void *space = s387->st_space;
int ret;
int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff -ru kernels/linux-2.6.26/arch/x86/math-emu/fpu_system.h tests/linux-2.6ww/arch/x86/math-emu/fpu_system.h
--- kernels/linux-2.6.26/arch/x86/math-emu/fpu_system.h 2008-07-15 11:29:32.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/math-emu/fpu_system.h 2008-08-06 18:40:54.000000000 +0200
@@ -35,8 +35,8 @@
#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
== (1 << 10))

-#define I387 (current->thread.xstate)
-#define FPU_info (I387->soft.info)
+#define I387 (current->thread.i387)
+#define FPU_info (I387.soft.info)

#define FPU_CS (*(unsigned short *) &(FPU_info->___cs))
#define FPU_SS (*(unsigned short *) &(FPU_info->___ss))
@@ -46,25 +46,25 @@
#define FPU_EIP (FPU_info->___eip)
#define FPU_ORIG_EIP (FPU_info->___orig_eip)

-#define FPU_lookahead (I387->soft.lookahead)
+#define FPU_lookahead (I387.soft.lookahead)

/* nz if ip_offset and cs_selector are not to be set for the current
instruction. */
-#define no_ip_update (*(u_char *)&(I387->soft.no_update))
-#define FPU_rm (*(u_char *)&(I387->soft.rm))
+#define no_ip_update (*(u_char *)&(I387.soft.no_update))
+#define FPU_rm (*(u_char *)&(I387.soft.rm))

/* Number of bytes of data which can be legally accessed by the current
instruction. This only needs to hold a number <= 108, so a byte will do. */
-#define access_limit (*(u_char *)&(I387->soft.alimit))
+#define access_limit (*(u_char *)&(I387.soft.alimit))

-#define partial_status (I387->soft.swd)
-#define control_word (I387->soft.cwd)
-#define fpu_tag_word (I387->soft.twd)
-#define registers (I387->soft.st_space)
-#define top (I387->soft.ftop)
+#define partial_status (I387.soft.swd)
+#define control_word (I387.soft.cwd)
+#define fpu_tag_word (I387.soft.twd)
+#define registers (I387.soft.st_space)
+#define top (I387.soft.ftop)

-#define instruction_address (*(struct address *)&I387->soft.fip)
-#define operand_address (*(struct address *)&I387->soft.foo)
+#define instruction_address (*(struct address *)&I387.soft.fip)
+#define operand_address (*(struct address *)&I387.soft.foo)

#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
math_abort(FPU_info,SIGSEGV)
diff -ru kernels/linux-2.6.26/arch/x86/math-emu/reg_ld_str.c tests/linux-2.6ww/arch/x86/math-emu/reg_ld_str.c
--- kernels/linux-2.6.26/arch/x86/math-emu/reg_ld_str.c 2008-07-15 11:29:32.000000000 +0200
+++ tests/linux-2.6ww/arch/x86/math-emu/reg_ld_str.c 2008-08-06 18:40:54.000000000 +0200
@@ -1180,8 +1180,8 @@
control_word |= 0xffff0040;
partial_status = status_word() | 0xffff0000;
fpu_tag_word |= 0xffff0000;
- I387->soft.fcs &= ~0xf8000000;
- I387->soft.fos |= 0xffff0000;
+ I387.soft.fcs &= ~0xf8000000;
+ I387.soft.fos |= 0xffff0000;
#endif /* PECULIAR_486 */
if (__copy_to_user(d, &control_word, 7 * 4))
FPU_abort;
Only in tests/linux-2.6ww/drivers/ide: cris
diff -ru kernels/linux-2.6.26/include/asm-x86/i387.h tests/linux-2.6ww/include/asm-x86/i387.h
--- kernels/linux-2.6.26/include/asm-x86/i387.h 2008-07-15 11:29:34.000000000 +0200
+++ tests/linux-2.6ww/include/asm-x86/i387.h 2008-08-06 18:40:54.000000000 +0200
@@ -21,9 +21,8 @@

extern void fpu_init(void);
extern void mxcsr_feature_mask_init(void);
-extern int init_fpu(struct task_struct *child);
+extern void init_fpu(struct task_struct *child);
extern asmlinkage void math_state_restore(void);
-extern void init_thread_xstate(void);

extern user_regset_active_fn fpregs_active, xfpregs_active;
extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
@@ -118,22 +117,24 @@
/* Using "fxsaveq %0" would be the ideal choice, but is only supported
starting with gas 2.16. */
__asm__ __volatile__("fxsaveq %0"
- : "=m" (tsk->thread.xstate->fxsave));
+ : "=m" (tsk->thread.i387.fxsave));
#elif 0
/* Using, as a workaround, the properly prefixed form below isn't
accepted by any binutils version so far released, complaining that
the same type of prefix is used twice if an extended register is
needed for addressing (fix submitted to mainline 2005-11-21). */
__asm__ __volatile__("rex64/fxsave %0"
- : "=m" (tsk->thread.xstate->fxsave));
+ : "=m" (tsk->thread.i387.fxsave));
#else
/* This, however, we can work around by forcing the compiler to select
an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__("rex64/fxsave (%1)"
- : "=m" (tsk->thread.xstate->fxsave)
- : "cdaSDb" (&tsk->thread.xstate->fxsave));
+ __asm__ __volatile__("rex64/fxsave %P2(%1)"
+ : "=m" (tsk->thread.i387.fxsave)
+ : "cdaSDb" (tsk),
+ "i" (offsetof(__typeof__(*tsk),
+ thread.i387.fxsave)));
#endif
- clear_fpu_state(&tsk->thread.xstate->fxsave);
+ clear_fpu_state(&tsk->thread.i387.fxsave);
task_thread_info(tsk)->status &= ~TS_USEDFPU;
}

@@ -147,7 +148,7 @@
int err = 0;

BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
- sizeof(tsk->thread.xstate->fxsave));
+ sizeof(tsk->thread.i387.fxsave));

if ((unsigned long)buf % 16)
printk("save_i387: bad fpstate %p\n", buf);
@@ -163,7 +164,7 @@
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
} else {
- if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+ if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
sizeof(struct i387_fxsave_struct)))
return -1;
}
@@ -175,15 +176,7 @@
*/
static inline int restore_i387(struct _fpstate __user *buf)
{
- struct task_struct *tsk = current;
- int err;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
+ set_used_math();
if (!(task_thread_info(current)->status & TS_USEDFPU)) {
clts();
task_thread_info(current)->status |= TS_USEDFPU;
@@ -193,8 +186,6 @@

#else /* CONFIG_X86_32 */

-extern void finit(void);
-
static inline void tolerant_fwait(void)
{
asm volatile("fnclex ; fwait");
@@ -210,7 +201,7 @@
"nop ; frstor %1",
"fxrstor %1",
X86_FEATURE_FXSR,
- "m" (tsk->thread.xstate->fxsave));
+ "m" ((tsk)->thread.i387.fxsave));
}

/* We need a safe address that is cheap to find and that is already
@@ -234,8 +225,8 @@
"fxsave %[fx]\n"
"bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
X86_FEATURE_FXSR,
- [fx] "m" (tsk->thread.xstate->fxsave),
- [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
+ [fx] "m" (tsk->thread.i387.fxsave),
+ [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. safe_address is a random variable that should be in L1 */
@@ -336,25 +327,25 @@
static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
{
if (cpu_has_fxsr) {
- return tsk->thread.xstate->fxsave.cwd;
+ return tsk->thread.i387.fxsave.cwd;
} else {
- return (unsigned short)tsk->thread.xstate->fsave.cwd;
+ return (unsigned short)tsk->thread.i387.fsave.cwd;
}
}

static inline unsigned short get_fpu_swd(struct task_struct *tsk)
{
if (cpu_has_fxsr) {
- return tsk->thread.xstate->fxsave.swd;
+ return tsk->thread.i387.fxsave.swd;
} else {
- return (unsigned short)tsk->thread.xstate->fsave.swd;
+ return (unsigned short)tsk->thread.i387.fsave.swd;
}
}

static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
{
if (cpu_has_xmm) {
- return tsk->thread.xstate->fxsave.mxcsr;
+ return tsk->thread.i387.fxsave.mxcsr;
} else {
return MXCSR_DEFAULT;
}
diff -ru kernels/linux-2.6.26/include/asm-x86/processor.h tests/linux-2.6ww/include/asm-x86/processor.h
--- kernels/linux-2.6.26/include/asm-x86/processor.h 2008-07-15 11:29:34.000000000 +0200
+++ tests/linux-2.6ww/include/asm-x86/processor.h 2008-08-06 18:40:54.000000000 +0200
@@ -350,7 +350,7 @@
u32 entry_eip;
};

-union thread_xstate {
+union i387_union {
struct i387_fsave_struct fsave;
struct i387_fxsave_struct fxsave;
struct i387_soft_struct soft;
@@ -361,9 +361,6 @@
#endif

extern void print_cpu_info(struct cpuinfo_x86 *);
-extern unsigned int xstate_size;
-extern void free_thread_xstate(struct task_struct *);
-extern struct kmem_cache *task_xstate_cachep;
extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern unsigned short num_cache_leaves;
@@ -396,8 +393,8 @@
unsigned long cr2;
unsigned long trap_no;
unsigned long error_code;
- /* floating point and extended processor state */
- union thread_xstate *xstate;
+ /* Floating point info: */
+ union i387_union i387 __attribute__((aligned(16)));;
#ifdef CONFIG_X86_32
/* Virtual 86 mode info */
struct vm86_struct __user *vm86_info;
diff -ru kernels/linux-2.6.26/include/asm-x86/thread_info.h tests/linux-2.6ww/include/asm-x86/thread_info.h
--- kernels/linux-2.6.26/include/asm-x86/thread_info.h 2008-07-15 11:29:34.000000000 +0200
+++ tests/linux-2.6ww/include/asm-x86/thread_info.h 2008-08-06 18:40:54.000000000 +0200
@@ -1,14 +1,5 @@
-#ifndef _ASM_X86_THREAD_INFO_H
#ifdef CONFIG_X86_32
# include "thread_info_32.h"
#else
# include "thread_info_64.h"
#endif
-
-#ifndef __ASSEMBLY__
-extern void arch_task_cache_init(void);
-extern void free_thread_info(struct thread_info *ti);
-extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
-#define arch_task_cache_init arch_task_cache_init
-#endif
-#endif /* _ASM_X86_THREAD_INFO_H */
diff -ru kernels/linux-2.6.26/include/asm-x86/thread_info_32.h tests/linux-2.6ww/include/asm-x86/thread_info_32.h
--- kernels/linux-2.6.26/include/asm-x86/thread_info_32.h 2008-07-15 11:29:34.000000000 +0200
+++ tests/linux-2.6ww/include/asm-x86/thread_info_32.h 2008-08-06 18:40:54.000000000 +0200
@@ -102,6 +102,8 @@
__get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE)))
#endif

+#define free_thread_info(info) free_pages((unsigned long)(info), get_order(THREAD_SIZE))
+
#else /* !__ASSEMBLY__ */

/* how to get the thread information struct from ASM */
diff -ru kernels/linux-2.6.26/include/asm-x86/thread_info_64.h tests/linux-2.6ww/include/asm-x86/thread_info_64.h
--- kernels/linux-2.6.26/include/asm-x86/thread_info_64.h 2008-07-15 11:29:34.000000000 +0200
+++ tests/linux-2.6ww/include/asm-x86/thread_info_64.h 2008-08-06 18:40:54.000000000 +0200
@@ -85,6 +85,8 @@
#define alloc_thread_info(tsk) \
((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))

+#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
+
#else /* !__ASSEMBLY__ */

/* how to get the thread information struct from ASM */
diff -ru kernels/linux-2.6.26/kernel/fork.c tests/linux-2.6ww/kernel/fork.c
--- kernels/linux-2.6.26/kernel/fork.c 2008-07-15 11:29:34.000000000 +0200
+++ tests/linux-2.6ww/kernel/fork.c 2008-08-06 18:40:54.000000000 +0200
@@ -133,14 +133,6 @@
free_task(tsk);
}

-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
-
void __init fork_init(unsigned long mempages)
{
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -153,9 +145,6 @@
ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
#endif

- /* do the arch specific task caches init */
- arch_task_cache_init();
-
/*
* The default maximum number of threads is set to a safe
* value: the thread structures can take up at most half
@@ -175,13 +164,6 @@
init_task.signal->rlim[RLIMIT_NPROC];
}

-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
- struct task_struct *src)
-{
- *dst = *src;
- return 0;
-}
-
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
@@ -200,15 +182,15 @@
return NULL;
}

- err = arch_dup_task_struct(tsk, orig);
- if (err)
- goto out;
-
+ *tsk = *orig;
tsk->stack = ti;

err = prop_local_init_single(&tsk->dirties);
- if (err)
- goto out;
+ if (err) {
+ free_thread_info(ti);
+ free_task_struct(tsk);
+ return NULL;
+ }

setup_thread_stack(tsk, orig);

@@ -224,11 +206,6 @@
#endif
tsk->splice_pipe = NULL;
return tsk;
-
-out:
- free_thread_info(ti);
- free_task_struct(tsk);
- return NULL;
}

#ifdef CONFIG_MMU
======================================================================

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts

2008-08-06 20:14:22

by Suresh Siddha

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Wed, Aug 06, 2008 at 10:33:25AM -0700, Wolfgang Walter wrote:
> Hello Herbert,
>
> I think I finally found the problem.
>
> Here a short description again: all our routers with a via C3 using padlock for AES-encryption are
> crashing with 2.6.26 while they work fine with 2.6.25. Not using padlock
> (i.e. using the i386 assembler version of AES) they just work fine.

Both the padlock version or asm version don't use FP/math registers, right?
It is interesting that you don't see the problem with asm version
but see the problem with padlock version.

Does disabling CONFIG_PREEMPT in 2.6.26 change anything? And also,
can you provide the complete kernel log till the point of failure(oops
that you sent doesn't have the call trace info)

thanks,
suresh

2008-08-06 21:30:59

by Suresh Siddha

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Wed, Aug 06, 2008 at 01:14:02PM -0700, Siddha, Suresh B wrote:
> On Wed, Aug 06, 2008 at 10:33:25AM -0700, Wolfgang Walter wrote:
> > Hello Herbert,
> >
> > I think I finally found the problem.
> >
> > Here a short description again: all our routers with a via C3 using padlock for AES-encryption are
> > crashing with 2.6.26 while they work fine with 2.6.25. Not using padlock
> > (i.e. using the i386 assembler version of AES) they just work fine.
>
> Both the padlock version or asm version don't use FP/math registers, right?
> It is interesting that you don't see the problem with asm version
> but see the problem with padlock version.
>
> Does disabling CONFIG_PREEMPT in 2.6.26 change anything? And also,
> can you provide the complete kernel log till the point of failure(oops
> that you sent doesn't have the call trace info)

BTW, in one of your oops, I see:

note: cron[1207] exited with preempt_count 268435459

I smell some kind of stack corruption here which is corrupting
thread_info (in the above case preempt_count in the thread_info).

Similarly, if the status field(in thread_info) gets corrupted(setting
TS_USEDFPU) without proper math state allocated(present in thread_struct),
we can end up oops in __switch_to.

But you seem to say, reverting recent fpu patches make the problem go away.
hmm, just wondering if your test kernel (with fpu patches reverted) is stable
enough and don't see other oops/issues?

Recently Vegard also noticed some stack corruptions (in network stack) leading
to similar problems. Not sure if Vegard has root caused his issue. copying him
for his comments.

thanks,
suresh

2008-08-07 00:39:21

by Wolfgang Walter

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Thanks for your answer.

On Wednesday 06 August 2008, Suresh Siddha wrote:
> On Wed, Aug 06, 2008 at 01:14:02PM -0700, Siddha, Suresh B wrote:
> > On Wed, Aug 06, 2008 at 10:33:25AM -0700, Wolfgang Walter wrote:
> > > Hello Herbert,
> > >
> > > I think I finally found the problem.
> > >
> > > Here a short description again: all our routers with a via C3 using
> > > padlock for AES-encryption are
> > > crashing with 2.6.26 while they work fine with 2.6.25. Not using padlock
> > > (i.e. using the i386 assembler version of AES) they just work fine.
> >
> > Both the padlock version or asm version don't use FP/math registers,
> > right?
> > It is interesting that you don't see the problem with asm version
> > but see the problem with padlock version.

I don't know how padlock exactly works and I don't know anything of i386's
architecture on hardware and assembler level. So I can only speculate:

Maybe padlock aes does influence FP/math.

http://linux.via.com.tw/support/beginDownload.action?eleid=181&fid=261

states:

3. SSE instructions must be enabled via the standard x86 method of enabling
the FXSAVE/FXRSTOR instructions using CR4[9] This enables the full set of SSE
instructions. If CR4[9] is not set, PadLock behaves as if it were disabled via
the MSR, regardless of the setting of the enable bits MSRs.

> >
> > Does disabling CONFIG_PREEMPT in 2.6.26 change anything? And also,
> > can you provide the complete kernel log till the point of failure(oops
> > that you sent doesn't have the call trace info)
>
> BTW, in one of your oops, I see:
>
> note: cron[1207] exited with preempt_count 268435459
>
> I smell some kind of stack corruption here which is corrupting
> thread_info (in the above case preempt_count in the thread_info).
>
> Similarly, if the status field(in thread_info) gets corrupted(setting
> TS_USEDFPU) without proper math state allocated(present in thread_struct),
> we can end up oops in __switch_to.
>
> But you seem to say, reverting recent fpu patches make the problem go away.
> hmm, just wondering if your test kernel (with fpu patches reverted) is
stable
> enough and don't see other oops/issues?

No oops yet.

2.6.26 crashes here within 1 or 2 minutes if (and only) if there is ipsec
traffic using padlock aes and there are actually processes running (i.e.
ssh).

The modified 2.6.26 did not crash yet (now running hours).

Unmodified 2.6.26 where we use i386 assembler aes instead of padlock runs
since 2 weeks.

We further use almost same kernels (only compiled for K7 or Intel Core Duo,
respectively) on K7 and an Intel Quad Core without problems.

>
> Recently Vegard also noticed some stack corruptions (in network stack)
leading
> to similar problems. Not sure if Vegard has root caused his issue. copying
him
> for his comments.
>
> thanks,
> suresh
>
>

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts

2008-08-07 16:23:22

by Wolfgang Walter

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Am Mittwoch, 6. August 2008 22:14 schrieb Suresh Siddha:
> On Wed, Aug 06, 2008 at 10:33:25AM -0700, Wolfgang Walter wrote:
> > Hello Herbert,
> >
> > I think I finally found the problem.
> >
> > Here a short description again: all our routers with a via C3 using
> > padlock for AES-encryption are crashing with 2.6.26 while they work fine
> > with 2.6.25. Not using padlock (i.e. using the i386 assembler version of
> > AES) they just work fine.
>
> Both the padlock version or asm version don't use FP/math registers, right?
> It is interesting that you don't see the problem with asm version
> but see the problem with padlock version.
>
> Does disabling CONFIG_PREEMPT in 2.6.26 change anything? And also,

Didn't check that yet as I'm still running my modified 2.6.26. It now runs
almost one day flawlessly.

But yesterday I tried the following patch on top of a vanilla 2.6.26:

=======================================================
diff -ur ../linux-2.6.26/drivers/crypto/padlock-aes.c ./drivers/crypto/padlock-aes.c
--- ../linux-2.6.26/drivers/crypto/padlock-aes.c 2008-07-15 11:29:32.000000000 +0200
+++ ./drivers/crypto/padlock-aes.c 2008-08-07 17:46:55.000000000 +0200
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <asm/byteorder.h>
+#include <asm/i387.h>
#include "padlock.h"

/* Control word. */
@@ -144,9 +145,11 @@
static inline void padlock_xcrypt(const u8 *input, u8 *output, void *key,
void *control_word)
{
+ kernel_fpu_begin();
asm volatile (".byte 0xf3,0x0f,0xa7,0xc8" /* rep xcryptecb */
: "+S"(input), "+D"(output)
: "d"(control_word), "b"(key), "c"(1));
+ kernel_fpu_end();
}

static void aes_crypt_copy(const u8 *in, u8 *out, u32 *key, struct cword *cword)
@@ -179,6 +182,7 @@
return;
}

+ kernel_fpu_begin();
asm volatile ("test $1, %%cl;"
"je 1f;"
"lea -1(%%ecx), %%eax;"
@@ -190,15 +194,18 @@
: "+S"(input), "+D"(output)
: "d"(control_word), "b"(key), "c"(count)
: "ax");
+ kernel_fpu_end();
}

static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
u8 *iv, void *control_word, u32 count)
{
/* rep xcryptcbc */
+ kernel_fpu_begin();
asm volatile (".byte 0xf3,0x0f,0xa7,0xd0"
: "+S" (input), "+D" (output), "+a" (iv)
: "d" (control_word), "b" (key), "c" (count));
+ kernel_fpu_end();
return iv;
}

=============================================================

I found that kernel_fpu_begin(); kernel_fpu_begin(); is used with
MMX and/or SSE:

include/asm/xor_32.h
drivers/md/raid6mmx.c
drivers/md/raid6sse1.c
drivers/md/raid6sse2.c

With this change I its a little bit more stable, I needed more then 5
minutes to crash the kernel (repeated it several times). If I read
the code correctly this disables preemption for the time the padlock cmd
is executing.

> can you provide the complete kernel log till the point of failure(oops
> that you sent doesn't have the call trace info)
>

Here are some of the oopses (all with unmodified 2.6.26):

##################################################################

BUG: unable to handle kernel NULL pointer dereference at 000001f0
IP: [<c010280c>] __switch_to+0x23/0x103
*pde = 00000000
Oops: 0002 [#1] PREEMPT
Modules linked in:

Pid: 2014, comm: date Not tainted (2.6.26 #3)
EIP: 0060:[<c010280c>] EFLAGS: 00010002 CPU: 0
EIP is at __switch_to+0x23/0x103
EAX: 00000000 EBX: dc4e2dc0 ECX: 0000015f EDX: dc4e2dc0
ESI: de9634a0 EDI: dc4e2fe8 EBP: de9636c8 ESP: cee9deec
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
Process date (pid: 2014, ti=cee9c000 task=de9634a0 task.ti=d13c6000)
Stack: dc4e2dc0 00000000 cee81300 de9634a0 c039bf41 dc4e2dc0 00000082 de9634a0
00000000 dc4e2f2c 00000046 00000000 ffffffea 00000001 dc4e2db8 00000000
c011e766 0000000e 00000000 00000003 dc4e2dc0 00000001 00000010 dc4e2dc0
Call Trace:
[<c039bf41>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c023698d>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
Code: 00 90 5d 5b 5e 5f 5d c3 55 57 56 89 c6 53 89 d3 8d a8 28 02 00 00 8b 40 04 8d ba 28 02 00 00 f6 40 0c 01 74 31 8b 86 7c 02 00 00 <0f> ae 00 0f ba 60 02 07 73 02 db e2 8d 76 00 90 8d b4 26 00 00
EIP: [<c010280c>] __switch_to+0x23/0x103 SS:ESP 0068:cee9deec
---[ end trace 55bd4c8258a0a1eb ]---
Fixing recursive fault but reboot is needed!
BUG: scheduling while atomic: bash/1363/0x00000003
Pid: 2014, comm: date Tainted: G D 2.6.26 #3
[<c039bd42>] schedule+0x58/0x2bf
[<c011cea8>] printk+0x14/0x18
[<c011ea08>] do_exit+0x9f/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d4da>] error_code+0x6a/0x70
[<c010280c>] __switch_to+0x23/0x103
[<c039bf41>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c023698d>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
note: bash[1363] exited with preempt_count 2
BUG: unable to handle kernel NULL pointer dereference at 00000028
IP: [<c011a900>] mm_release+0x39/0x64
*pde = 00000000
Oops: 0000 [#2] PREEMPT
Modules linked in:

Pid: 1363, comm: bash Tainted: G D (2.6.26 #3)
EIP: 0060:[<c011a900>] EFLAGS: 00010246 CPU: 0
EIP is at mm_release+0x39/0x64
EAX: 00000000 EBX: de9634a0 ECX: b7e306f8 EDX: 00000000
ESI: 00000000 EDI: de9634a0 EBP: c041e382 ESP: cee9de34
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
Process bash (pid: 1363, ti=cee9c000 task=dc4e2dc0 task.ti=cee9c000)
Stack: 00000000 de9634a0 c011d7d1 00000002 de9634a0 0000000b c011eb01 00000000
c041e382 c011cea8 c041ebdc cee9de70 cee9de70 cee9deb4 00000006 00000002
c041e382 c010500f 00000000 000001f0 de9634a0 00000038 c01123d8 00000002
Call Trace:
[<c011d7d1>] exit_mm+0x12/0xb4
[<c011eb01>] do_exit+0x198/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d4da>] error_code+0x6a/0x70
[<c010280c>] __switch_to+0x23/0x103
[<c039bf41>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c023698d>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
Code: 8e e8 85 d2 74 11 c7 83 40 01 00 00 00 00 00 00 89 d0 e8 42 f2 ff ff 8b 8b 48 01 00 00 85 c9 74 32 8b 43 0c 25 00 04 00 00 75 28 <83> 7e 28 01 7e 22 c7 83 48 01 00 00 00 00 00 00 e8 4b 41 0e 00
EIP: [<c011a900>] mm_release+0x39/0x64 SS:ESP 0068:cee9de34
---[ end trace 55bd4c8258a0a1eb ]---

##################################################################

BUG: unable to handle kernel NULL pointer dereference at 000001f0
IP: [<c010280c>] __switch_to+0x23/0x103
*pde = 00000000
Oops: 0002 [#1] PREEMPT
Modules linked in:

Pid: 1569, comm: date Not tainted (2.6.26 #7)
EIP: 0060:[<c010280c>] EFLAGS: 00010002 CPU: 0
EIP is at __switch_to+0x23/0x103
EAX: 00000000 EBX: de978000 ECX: 00000091 EDX: de978000
ESI: de979810 EDI: de978228 EBP: de979a38 ESP: d13ddeec
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
Process date (pid: 1569, ti=d13dc000 task=de979810 task.ti=d185e000)
Stack: de978000 00000000 cfdbe780 de979810 c039c229 de978000 00000082 de979810
00000000 de97816c 00000046 00000000 ffffffea 00000001 de977ff8 00000000
c011e766 0000000e 00000000 00000003 de978000 00000001 00000010 de978000
Call Trace:
[<c039c229>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c023698d>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
Code: 00 90 5d 5b 5e 5f 5d c3 55 57 56 89 c6 53 89 d3 8d a8 28 02 00 00 8b 40 04 8d ba 28 02 00 00 f6 40 0c 01 74 31 8b 86 7c 02 00 00 <0f> ae 00 0f ba 60 02 07 73 02 db e2 8d 76 00 90 8d b4 26 00 00
EIP: [<c010280c>] __switch_to+0x23/0x103 SS:ESP 0068:d13ddeec
---[ end trace 8061cb3e89ac8fe5 ]---
Fixing recursive fault but reboot is needed!
BUG: scheduling while atomic: bash/1374/0x00000003
Pid: 1569, comm: date Tainted: G D 2.6.26 #7
[<c039c02a>] schedule+0x58/0x2bf
[<c011cea8>] printk+0x14/0x18
[<c011ea08>] do_exit+0x9f/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d7ba>] error_code+0x6a/0x70
[<c010280c>] __switch_to+0x23/0x103
[<c039c229>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c023698d>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
note: bash[1374] exited with preempt_count 2
BUG: unable to handle kernel NULL pointer dereference at 00000028
IP: [<c011a900>] mm_release+0x39/0x64
*pde = 00000000
Oops: 0000 [#2] PREEMPT
Modules linked in:

Pid: 1374, comm: bash Tainted: G D (2.6.26 #7)
EIP: 0060:[<c011a900>] EFLAGS: 00010246 CPU: 0
EIP is at mm_release+0x39/0x64
EAX: 00000000 EBX: de979810 ECX: b7e316f8 EDX: 00000000
ESI: 00000000 EDI: de979810 EBP: c041e382 ESP: d13dde34
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
Process bash (pid: 1374, ti=d13dc000 task=de978000 task.ti=d13dc000)
Stack: 00000000 de979810 c011d7d1 00000002 de979810 0000000b c011eb01 00000000
c041e382 c011cea8 c041ebdc d13dde70 d13dde70 d13ddeb4 00000006 00000002
c041e382 c010500f 00000000 000001f0 de979810 00000038 c01123d8 00000002
Call Trace:
[<c011d7d1>] exit_mm+0x12/0xb4
[<c011eb01>] do_exit+0x198/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d7ba>] error_code+0x6a/0x70
[<c010280c>] __switch_to+0x23/0x103
[<c039c229>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c023698d>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
Code: 8e e8 85 d2 74 11 c7 83 40 01 00 00 00 00 00 00 89 d0 e8 42 f2 ff ff 8b 8b 48 01 00 00 85 c9 74 32 8b 43 0c 25 00 04 00 00 75 28 <83> 7e 28 01 7e 22 c7 83 48 01 00 00 00 00 00 00 e8 4b 41 0e 00
EIP: [<c011a900>] mm_release+0x39/0x64 SS:ESP 0068:d13dde34
---[ end trace 8061cb3e89ac8fe5 ]---
note: bash[1374] exited with preempt_count 2

##################################################################

BUG: unable to handle kernel NULL pointer dereference at 000001f0
IP: [<c010280c>] __switch_to+0x23/0x103
*pde = 00000000
Oops: 0002 [#1] PREEMPT
Modules linked in:

Pid: 1591, comm: date Not tainted (2.6.26 #8)
EIP: 0060:[<c010280c>] EFLAGS: 00010002 CPU: 0
EIP is at __switch_to+0x23/0x103
EAX: 00000000 EBX: dc4e3b80 ECX: 000000e6 EDX: dc4e3b80
ESI: dc42bb80 EDI: dc4e3da8 EBP: dc42bda8 ESP: cc8d5eec
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
Process date (pid: 1591, ti=cc8d4000 task=dc42bb80 task.ti=cf542000)
Stack: dc4e3b80 00000000 dc4b8000 dc42bb80 c039bbc9 dc4e3b80 00000082 dc42bb80
00000000 dc4e3cec 00000046 00000000 ffffffea 00000001 dc4e3b78 00000000
c011e766 0000000e 00000000 00000003 dc4e3b80 00000001 00000010 dc4e3b80
Call Trace:
[<c039bbc9>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c02365dd>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
Code: 00 90 5d 5b 5e 5f 5d c3 55 57 56 89 c6 53 89 d3 8d a8 28 02 00 00 8b 40 04 8d ba 28 02 00 00 f6 40 0c 01 74 31 8b 86 7c 02 00 00 <0f> ae 00 0f ba 60 02 07 73 02 db e2 8d 76 00 90 8d b4 26 00 00
EIP: [<c010280c>] __switch_to+0x23/0x103 SS:ESP 0068:cc8d5eec
---[ end trace fa9f688d2faab2aa ]---
Fixing recursive fault but reboot is needed!
BUG: scheduling while atomic: bash/1374/0x00000003
Pid: 1591, comm: date Tainted: G D 2.6.26 #8
[<c039b9ca>] schedule+0x58/0x2bf
[<c011cea8>] printk+0x14/0x18
[<c011ea08>] do_exit+0x9f/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d15a>] error_code+0x6a/0x70
[<c010280c>] __switch_to+0x23/0x103
[<c039bbc9>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c02365dd>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
note: bash[1374] exited with preempt_count 2
BUG: unable to handle kernel NULL pointer dereference at 00000028
IP: [<c011a900>] mm_release+0x39/0x64
*pde = 00000000
Oops: 0000 [#2] PREEMPT
Modules linked in:

Pid: 1374, comm: bash Tainted: G D (2.6.26 #8)
EIP: 0060:[<c011a900>] EFLAGS: 00010246 CPU: 0
EIP is at mm_release+0x39/0x64
EAX: 00000000 EBX: dc42bb80 ECX: b7e4a6f8 EDX: 00000000
ESI: 00000000 EDI: dc42bb80 EBP: c041e216 ESP: cc8d5e34
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
Process bash (pid: 1374, ti=cc8d4000 task=dc4e3b80 task.ti=cc8d4000)
Stack: 00000000 dc42bb80 c011d7d1 00000002 dc42bb80 0000000b c011eb01 00000000
c041e216 c011cea8 c041ea70 cc8d5e70 cc8d5e70 cc8d5eb4 00000006 00000002
c041e216 c010500f 00000000 000001f0 dc42bb80 00000038 c01123d8 00000002
Call Trace:
[<c011d7d1>] exit_mm+0x12/0xb4
[<c011eb01>] do_exit+0x198/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d15a>] error_code+0x6a/0x70
[<c010280c>] __switch_to+0x23/0x103
[<c039bbc9>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c02365dd>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================
Code: 8e e8 85 d2 74 11 c7 83 40 01 00 00 00 00 00 00 89 d0 e8 42 f2 ff ff 8b 8b 48 01 00 00 85 c9 74 32 8b 43 0c 25 00 04 00 00 75 28 <83> 7e 28 01 7e 22 c7 83 48 01 00 00 00 00 00 00 e8 9b 3d 0e 00
EIP: [<c011a900>] mm_release+0x39/0x64 SS:ESP 0068:cc8d5e34
---[ end trace fa9f688d2faab2aa ]---
note: bash[1374] exited with preempt_count 2
BUG: scheduling while atomic: bash/1374/0x10000003
Pid: 1374, comm: bash Tainted: G D 2.6.26 #8
[<c039b9ca>] schedule+0x58/0x2bf
[<c0165ec2>] dput+0x15/0xfc
[<c01f9ae8>] _atomic_dec_and_lock+0x30/0x38
[<c016a0e5>] mntput_no_expire+0x11/0xd2
[<c01184b4>] __cond_resched+0x13/0x2f
[<c039bf4c>] _cond_resched+0x21/0x2a
[<c011daf2>] put_files_struct+0x63/0xa6
[<c011eb0f>] do_exit+0x1a6/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d15a>] error_code+0x6a/0x70
[<c011a900>] mm_release+0x39/0x64
[<c011d7d1>] exit_mm+0x12/0xb4
[<c011eb01>] do_exit+0x198/0x54a
[<c011cea8>] printk+0x14/0x18
[<c010500f>] die+0xfd/0x102
[<c01123d8>] do_page_fault+0x488/0x53c
[<c0111f50>] do_page_fault+0x0/0x53c
[<c039d15a>] error_code+0x6a/0x70
[<c010280c>] __switch_to+0x23/0x103
[<c039bbc9>] schedule+0x257/0x2bf
[<c011e766>] do_wait+0x88e/0x963
[<c01480c5>] handle_mm_fault+0x441/0x4aa
[<c02365dd>] tty_ioctl+0x0/0x877
[<c0119205>] default_wake_function+0x0/0x8
[<c011e8ba>] sys_wait4+0x7f/0x92
[<c011e8e0>] sys_waitpid+0x13/0x17
[<c01037a2>] syscall_call+0x7/0xb
=======================

##################################################################

BUG: unable to handle kernel NULL pointer dereference at 000001f0
IP: [<c01028c5>] __switch_to+0x30/0x117
*pde = 00000000
Oops: 0002 [#1] PREEMPT
Modules linked in:

Pid: 2071, comm: sleep Not tainted (2.6.26 #11)
EIP: 0060:[<c01028c5>] EFLAGS: 00010002 CPU: 0
EIP is at __switch_to+0x30/0x117
EAX: 00000000 EBX: c0493300 ECX: dc48dd00 EDX: c0493300
ESI: dc48dd00 EDI: c0493530 EBP: c04cff8c ESP: c04cff7c
DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
Process sleep (pid: 2071, ti=c04ce000 task=dc48dd00 task.ti=d2fe6000)
Stack: dc48df30 c0493300 00000000 00000000 d2fe7f44 c03b5b43 c04cffc8 00000046
c0131856 0000005a dc472d3c c0493300 c0493470 d983ae00 00002696 00000000
c0239f54 00000000 c04c4000 c04cffd8 c01025fe c04f3740 00049800 c04cffe0
Call Trace:
[<c03b5b43>] ? schedule+0x285/0x2ff
[<c0131856>] ? pm_qos_requirement+0x3c/0x53
[<c0239f54>] ? acpi_processor_idle+0x0/0x434
[<c01025fe>] ? cpu_idle+0x73/0x7f
[<c03a4dcd>] ? rest_init+0x61/0x63
=======================
Code: 89 c6 53 89 d3 83 ec 04 8d 80 30 02 00 00 89 45 f0 8d ba 30 02 00 00 e8 a2 bd 10 00 8b 46 04 f6 40 0c 01 74 31 8b 86 84 02 00 00 <0f> ae 00 0f ba 60 02 07 73 02 db e2 8d 76 00 90 8d b4 26 00 00
EIP: [<c01028c5>] __switch_to+0x30/0x117 SS:ESP 0068:c04cff7c
---[ end trace 042daf7e67838617 ]---
note: sleep[2071] exited with preempt_count 2
BUG: scheduling while atomic: swapper/0/0x00000004
Pid: 2071, comm: sleep Tainted: G D 2.6.26 #11
[<c0118c95>] __schedule_bug+0x42/0x47
[<c03b5923>] schedule+0x65/0x2ff
[<c011f8d8>] ? put_fs_struct+0x39/0x3c
[<c0120dc2>] do_exit+0x54e/0x55d
[<c011de9e>] ? print_oops_end_marker+0x1e/0x23
[<c010522a>] die+0x104/0x10c
[<c0112553>] do_page_fault+0x477/0x52c
[<c01120dc>] ? do_page_fault+0x0/0x52c
[<c03b729a>] error_code+0x6a/0x70
[<c01028c5>] ? __switch_to+0x30/0x117
[<c03b5b43>] schedule+0x285/0x2ff
[<c0131856>] ? pm_qos_requirement+0x3c/0x53
[<c0239f54>] ? acpi_processor_idle+0x0/0x434
[<c01025fe>] ? cpu_idle+0x73/0x7f
[<c03a4dcd>] ? rest_init+0x61/0x63

##################################################################

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts

2008-08-08 08:44:19

by Wolfgang Walter

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Wednesday 06 August 2008, Suresh Siddha wrote:
> On Wed, Aug 06, 2008 at 01:14:02PM -0700, Siddha, Suresh B wrote:
> > On Wed, Aug 06, 2008 at 10:33:25AM -0700, Wolfgang Walter wrote:
> > > Hello Herbert,
> > >
> > > I think I finally found the problem.
> > >
> > > Here a short description again: all our routers with a via C3 using
padlock for AES-encryption are
> > > crashing with 2.6.26 while they work fine with 2.6.25. Not using padlock
> > > (i.e. using the i386 assembler version of AES) they just work fine.
> >
> > Both the padlock version or asm version don't use FP/math registers,
right?
> > It is interesting that you don't see the problem with asm version
> > but see the problem with padlock version.
> >
> > Does disabling CONFIG_PREEMPT in 2.6.26 change anything? And also,
> > can you provide the complete kernel log till the point of failure(oops
> > that you sent doesn't have the call trace info)
>
> BTW, in one of your oops, I see:
>
> note: cron[1207] exited with preempt_count 268435459
>
> I smell some kind of stack corruption here which is corrupting
> thread_info (in the above case preempt_count in the thread_info).
>

I don't think 268435459 is a strong indication of stack corruption:

268435459 == 0x10000003 == PREEMPT_ACTIVE|0x03

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts

2008-08-08 10:37:13

by Wolfgang Walter

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Am Donnerstag, 7. August 2008 18:23 schrieb Wolfgang Walter:
> Am Mittwoch, 6. August 2008 22:14 schrieb Suresh Siddha:
> > On Wed, Aug 06, 2008 at 10:33:25AM -0700, Wolfgang Walter wrote:
> > > Hello Herbert,
> > >
> > > I think I finally found the problem.
> > >
> > > Here a short description again: all our routers with a via C3 using
> > > padlock for AES-encryption are crashing with 2.6.26 while they work
> > > fine with 2.6.25. Not using padlock (i.e. using the i386 assembler
> > > version of AES) they just work fine.
> >
> > Both the padlock version or asm version don't use FP/math registers,
> > right? It is interesting that you don't see the problem with asm version
> > but see the problem with padlock version.
> >
> > Does disabling CONFIG_PREEMPT in 2.6.26 change anything? And also,
>
> Didn't check that yet as I'm still running my modified 2.6.26. It now runs
> almost one day flawlessly.
>
> But yesterday I tried the following patch on top of a vanilla 2.6.26:
>
> =======================================================
> diff -ur ../linux-2.6.26/drivers/crypto/padlock-aes.c
> ./drivers/crypto/padlock-aes.c ---
> ../linux-2.6.26/drivers/crypto/padlock-aes.c 2008-07-15 11:29:32.000000000
> +0200 +++ ./drivers/crypto/padlock-aes.c 2008-08-07 17:46:55.000000000
> +0200 @@ -16,6 +16,7 @@
> #include <linux/interrupt.h>
> #include <linux/kernel.h>
> #include <asm/byteorder.h>
> +#include <asm/i387.h>
> #include "padlock.h"
>
> /* Control word. */
> @@ -144,9 +145,11 @@
> static inline void padlock_xcrypt(const u8 *input, u8 *output, void *key,
> void *control_word)
> {
> + kernel_fpu_begin();
> asm volatile (".byte 0xf3,0x0f,0xa7,0xc8" /* rep xcryptecb */
>
> : "+S"(input), "+D"(output)
> : "d"(control_word), "b"(key), "c"(1));
>
> + kernel_fpu_end();
> }
>
> static void aes_crypt_copy(const u8 *in, u8 *out, u32 *key, struct cword
> *cword) @@ -179,6 +182,7 @@
> return;
> }
>
> + kernel_fpu_begin();
> asm volatile ("test $1, %%cl;"
> "je 1f;"
> "lea -1(%%ecx), %%eax;"
> @@ -190,15 +194,18 @@
>
> : "+S"(input), "+D"(output)
> : "d"(control_word), "b"(key), "c"(count)
> : "ax");
>
> + kernel_fpu_end();
> }
>
> static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void
> *key, u8 *iv, void *control_word, u32 count)
> {
> /* rep xcryptcbc */
> + kernel_fpu_begin();
> asm volatile (".byte 0xf3,0x0f,0xa7,0xd0"
>
> : "+S" (input), "+D" (output), "+a" (iv)
> : "d" (control_word), "b" (key), "c" (count));
>
> + kernel_fpu_end();
> return iv;
> }
>
> =============================================================
>
> I found that kernel_fpu_begin(); kernel_fpu_begin(); is used with
> MMX and/or SSE:
>
> include/asm/xor_32.h
> drivers/md/raid6mmx.c
> drivers/md/raid6sse1.c
> drivers/md/raid6sse2.c
>
>
> With this change I its a little bit more stable, I needed more then 5
> minutes to crash the kernel (repeated it several times). If I read
> the code correctly this disables preemption for the time the padlock cmd
> is executing.

Forget that - I booted the wrong kernel.

I now really test this modification and it seems to be stable. The router now
runs for 45 minutes without oops.

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts
Leiter EDV
Leopoldstra?e 15
80802 M?nchen
Tel: +49 89 38196 276
Fax: +49 89 38196 150
Email: [email protected]
http://www.studentenwerk-muenchen.de/

2008-08-08 18:31:37

by Vegard Nossum

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Wed, Aug 6, 2008 at 11:21 PM, Suresh Siddha
<[email protected]> wrote:
> BTW, in one of your oops, I see:
>
> note: cron[1207] exited with preempt_count 268435459
>
> I smell some kind of stack corruption here which is corrupting
> thread_info (in the above case preempt_count in the thread_info).
>
> Similarly, if the status field(in thread_info) gets corrupted(setting
> TS_USEDFPU) without proper math state allocated(present in thread_struct),
> we can end up oops in __switch_to.
>
> But you seem to say, reverting recent fpu patches make the problem go away.
> hmm, just wondering if your test kernel (with fpu patches reverted) is stable
> enough and don't see other oops/issues?
>
> Recently Vegard also noticed some stack corruptions (in network stack) leading
> to similar problems. Not sure if Vegard has root caused his issue. copying him
> for his comments.

I don't think this is the same problem. What I see is almost certainly
a problem with netpoll, netconsole, or the 8139too driver. I see a UDP
packet in a task_struct.

There is also the fact that reverting fpu patches makes it go away
(for Wolfgang), while for the issue I am seeing, oops in FP code is
just one out of several different corruptions (sometimes it happens in
other slabs).

(Sorry for the little late reply.)

Thanks.

Vegard

--
"The animistic metaphor of the bug that maliciously sneaked in while
the programmer was not looking is intellectually dishonest as it
disguises that the error is the programmer's own creation."
-- E. W. Dijkstra, EWD1036

2008-08-08 18:54:18

by Suresh Siddha

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Fri, Aug 08, 2008 at 03:36:54AM -0700, Wolfgang Walter wrote:
> Am Donnerstag, 7. August 2008 18:23 schrieb Wolfgang Walter:
> > But yesterday I tried the following patch on top of a vanilla 2.6.26:
> >
> > =======================================================
> > diff -ur ../linux-2.6.26/drivers/crypto/padlock-aes.c
> > ./drivers/crypto/padlock-aes.c ---
> > ../linux-2.6.26/drivers/crypto/padlock-aes.c 2008-07-15 11:29:32.000000000
> > +0200 +++ ./drivers/crypto/padlock-aes.c 2008-08-07 17:46:55.000000000
> > +0200 @@ -16,6 +16,7 @@
> > #include <linux/interrupt.h>
> > #include <linux/kernel.h>
> > #include <asm/byteorder.h>
> > +#include <asm/i387.h>
> > #include "padlock.h"
> >
> > /* Control word. */
> > @@ -144,9 +145,11 @@
> > static inline void padlock_xcrypt(const u8 *input, u8 *output, void *key,
> > void *control_word)
> > {
> > + kernel_fpu_begin();
> > asm volatile (".byte 0xf3,0x0f,0xa7,0xc8" /* rep xcryptecb */
> >
> > : "+S"(input), "+D"(output)
> > : "d"(control_word), "b"(key), "c"(1));
> >
> > + kernel_fpu_end();
> > }
> >
> > static void aes_crypt_copy(const u8 *in, u8 *out, u32 *key, struct cword
> > *cword) @@ -179,6 +182,7 @@
> > return;
> > }
> >
> > + kernel_fpu_begin();
> > asm volatile ("test $1, %%cl;"
> > "je 1f;"
> > "lea -1(%%ecx), %%eax;"
> > @@ -190,15 +194,18 @@
> >
> > : "+S"(input), "+D"(output)
> > : "d"(control_word), "b"(key), "c"(count)
> > : "ax");
> >
> > + kernel_fpu_end();
> > }
> >
> > static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void
> > *key, u8 *iv, void *control_word, u32 count)
> > {
> > /* rep xcryptcbc */
> > + kernel_fpu_begin();
> > asm volatile (".byte 0xf3,0x0f,0xa7,0xd0"
> >
> > : "+S" (input), "+D" (output), "+a" (iv)
> > : "d" (control_word), "b" (key), "c" (count));
> >
> > + kernel_fpu_end();
> > return iv;
> > }
> >
> > =============================================================
> >
> > I found that kernel_fpu_begin(); kernel_fpu_begin(); is used with
> > MMX and/or SSE:
> >
> > include/asm/xor_32.h
> > drivers/md/raid6mmx.c
> > drivers/md/raid6sse1.c
> > drivers/md/raid6sse2.c
> >
> >
> > With this change I its a little bit more stable, I needed more then 5
> > minutes to crash the kernel (repeated it several times). If I read
> > the code correctly this disables preemption for the time the padlock cmd
> > is executing.
>
> Forget that - I booted the wrong kernel.
>
> I now really test this modification and it seems to be stable. The router now
> runs for 45 minutes without oops.

Ok. Thanks. I think I can explain the failure:

These padlock instructions though don't use/touch SSE registers, but it behaves
similar to other SSE instructions. For example, it might cause DNA faults
when cr0.ts is set. While this is a spurious DNA trap, it might cause
problems with the recent fpu code changes.

This is the code sequence that is probably causing this problem:

a) new app is getting exec'd and it is somewhere in between
start_thread() and flush_old_exec() in the load_xyz_binary()

b) At pont "a", task's fpu state (like TS_USEDFPU, used_math() etc) is
cleared.

c) Now we get an interrupt/softirq which starts using these encrypt/decrypt
routines in the network stack. This generates a math fault (as
cr0.ts is '1') which sets TS_USEDFPU and restores the math that is
in the task's xstate.

d) Return to exec code path, which does start_thread() which does
free_thread_xstate() and sets xstate pointer to NULL while
the TS_USEDFPU is still set.

e) At the next context switch from the new exec'd task to another task,
we have a scenarios where TS_USEDFPU is set but xstate pointer is null.
This can cause an oops during unlazy_fpu() in __switch_to()

Now:

1) This should happen with or with out pre-emption. Viro also encountered
similar problem with out CONFIG_PREEMPT.

2) kernel_fpu_begin() and kernel_fpu_end() will fix this problem, because
kernel_fpu_begin() will manually do a clts() and won't run in to the
situation of setting TS_USEDFPU in step "c" above.

3) This was working before the fpu changes, because its a spurious
math fault which doesn't corrupt any fpu/sse registers and the task's
math state was always in an allocated state.

I propose to go with wolf's patch which add's kernel_fpu_begin() and
kernel_fpu_end() around these instructions. This is the correct fix
(unless there is something wrong in my above understanding).

thanks,
suresh

2008-08-08 19:08:27

by H. Peter Anvin

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Suresh Siddha wrote:
>
> I propose to go with wolf's patch which add's kernel_fpu_begin() and
> kernel_fpu_end() around these instructions. This is the correct fix
> (unless there is something wrong in my above understanding).
>

It's technically overkill, if (and only if!) these instructions don't
actually touch the SSE state (most likely they're using the SSE
pipeline, and need this stuff to deal with power management issues.)

However, overkill is a good way to make sure something is dead.
Applying the patch will make sure we fix the regression, and we can
worry about optimizing this further post-2.6.27.

-hpa

2008-08-08 19:10:05

by Wolfgang Walter

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Am Freitag, 8. August 2008 12:36 schrieb Wolfgang Walter:
> Am Donnerstag, 7. August 2008 18:23 schrieb Wolfgang Walter:
> > =======================================================
> > diff -ur ../linux-2.6.26/drivers/crypto/padlock-aes.c
> > ./drivers/crypto/padlock-aes.c ---
> > ../linux-2.6.26/drivers/crypto/padlock-aes.c 2008-07-15
> > 11:29:32.000000000 +0200 +++ ./drivers/crypto/padlock-aes.c 2008-08-07
> > 17:46:55.000000000 +0200 @@ -16,6 +16,7 @@
> > #include <linux/interrupt.h>
> > #include <linux/kernel.h>
> > #include <asm/byteorder.h>
> > +#include <asm/i387.h>
> > #include "padlock.h"
> >
> > /* Control word. */
> > @@ -144,9 +145,11 @@
> > static inline void padlock_xcrypt(const u8 *input, u8 *output, void
> > *key, void *control_word)
> > {
> > + kernel_fpu_begin();
> > asm volatile (".byte 0xf3,0x0f,0xa7,0xc8" /* rep xcryptecb */
> >
> > : "+S"(input), "+D"(output)
> > : "d"(control_word), "b"(key), "c"(1));
> >
> > + kernel_fpu_end();
> > }
> >
> > static void aes_crypt_copy(const u8 *in, u8 *out, u32 *key, struct cword
> > *cword) @@ -179,6 +182,7 @@
> > return;
> > }
> >
> > + kernel_fpu_begin();
> > asm volatile ("test $1, %%cl;"
> > "je 1f;"
> > "lea -1(%%ecx), %%eax;"
> > @@ -190,15 +194,18 @@
> >
> > : "+S"(input), "+D"(output)
> > : "d"(control_word), "b"(key), "c"(count)
> > : "ax");
> >
> > + kernel_fpu_end();
> > }
> >
> > static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void
> > *key, u8 *iv, void *control_word, u32 count)
> > {
> > /* rep xcryptcbc */
> > + kernel_fpu_begin();
> > asm volatile (".byte 0xf3,0x0f,0xa7,0xd0"
> >
> > : "+S" (input), "+D" (output), "+a" (iv)
> > : "d" (control_word), "b" (key), "c" (count));
> >
> > + kernel_fpu_end();
> > return iv;
> > }
> >
> > =============================================================
> >
> > I found that kernel_fpu_begin(); kernel_fpu_begin(); is used with
> > MMX and/or SSE:
> >
> > include/asm/xor_32.h
> > drivers/md/raid6mmx.c
> > drivers/md/raid6sse1.c
> > drivers/md/raid6sse2.c
> >
> >
> > With this change I its a little bit more stable, I needed more then 5
> > minutes to crash the kernel (repeated it several times). If I read
> > the code correctly this disables preemption for the time the padlock cmd
> > is executing.
>
> Forget that - I booted the wrong kernel.
>
> I now really test this modification and it seems to be stable. The router
> now runs for 45 minutes without oops.

It now runs for 8 hours.

So how to proceed?

Here a summary:

1) 2.6.25.x works

2) 2.6.26 without padlock works

3a) 2.6.26 using padlock and ipsec chrashes.

Always the same reason:

__switch_to() wants to fxsave but there is no memory allocated

which means that TS_USEDFPU has been set.

3b) 2.6.26 with padlock code from 2.6.25: using padlock and ipsec chrashes.

4) Reverting the fpu patches (from 2.6.25 to 2.6.26) fixes the problem.

5) Protecting the padlock cmds with kernel_fpu_begin(); kernel_fpu_begin();
fixes the problem.

Some thoughts:

4) probably fixes the problem because memory for fxsave is always allocated.

Maybe this is also the reason why 2.6.25 and earlier work.

5) is interesting:

kernel_fpu_begin() will save the fpu state if TS_USEDFPU is set - exactly as
__unlazy_fpu in __switch_to would do. So when the crypto code calls padlock
the world is ok: if TS_USEDFPU is set then memory has been allocated.

This makes me rather sure that there is no memory corruption by the network
code and/or crypto code itself overwriting the task_info stucture and setting
TS_USEDFPU.

I don't know though why kernel_fpu_begin exactly fixes the problem:

Maybe preemption must be disabled when padlocks RNG, ACE, Hash-engine etc. is
used. Maybe just some barrier is needed which preempt_disable provides.

The padlock programming guide states that padlock's RNG, ACE, ... internally
use SSE. Especially "it temporary enables SSE until no further SSE
instruction or XSTORE, ... is seen". Further it may overlap with execution of
non-SSE instructions.

Probably it is the best to treat the padlock opcodes as SSE instructions.

What should I do now?

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts

2008-08-08 19:19:51

by Wolfgang Walter

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Am Freitag, 8. August 2008 21:01 schrieb H. Peter Anvin:
> Suresh Siddha wrote:
> > I propose to go with wolf's patch which add's kernel_fpu_begin() and
> > kernel_fpu_end() around these instructions. This is the correct fix
> > (unless there is something wrong in my above understanding).
>
> It's technically overkill, if (and only if!) these instructions don't
> actually touch the SSE state (most likely they're using the SSE
> pipeline, and need this stuff to deal with power management issues.)
>
> However, overkill is a good way to make sure something is dead.
> Applying the patch will make sure we fix the regression, and we can
> worry about optimizing this further post-2.6.27.
>
> -hpa

Probably

drivers/crypto/padlock-sha.c
drivers/char/hw_random/via-rng.c

needs a fix, too.

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts
Leiter EDV
Leopoldstra?e 15
80802 M?nchen
Tel: +49 89 38196 276
Fax: +49 89 38196 150
Email: [email protected]
http://www.studentenwerk-muenchen.de/

2008-08-08 19:33:14

by Suresh Siddha

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Fri, Aug 08, 2008 at 12:09:46PM -0700, Wolfgang Walter wrote:
> I don't know though why kernel_fpu_begin exactly fixes the problem:

because it prevents taking a DNA fault inside the kernel (resulting
in TS_USEDFPU not set).

>
> Maybe preemption must be disabled when padlocks RNG, ACE, Hash-engine etc. is
> used. Maybe just some barrier is needed which preempt_disable provides.

problem can happen even when preemption is disabled.

> What should I do now?

I will post your fix(including the other usages) with the appropriate changelog
shortly.

thanks,
suresh

2008-08-08 23:10:55

by Wolfgang Walter

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Friday 08 August 2008, Suresh Siddha wrote:
> On Fri, Aug 08, 2008 at 12:09:46PM -0700, Wolfgang Walter wrote:
> > I don't know though why kernel_fpu_begin exactly fixes the problem:
>
> because it prevents taking a DNA fault inside the kernel (resulting
> in TS_USEDFPU not set).
>
> >
> > Maybe preemption must be disabled when padlocks RNG, ACE, Hash-engine etc.
is
> > used. Maybe just some barrier is needed which preempt_disable provides.
>
> problem can happen even when preemption is disabled.
>
> > What should I do now?
>
> I will post your fix(including the other usages) with the appropriate
changelog
> shortly.
>

Fine.

By the way, if padlock is used from userspace, could this be aproblem, too? I
mean not via the kernel crypto layer but directly. I think these extensions
are not restricted to a privileged user.

Regards,
--
Wolfgang Walter
Studentenwerk M?nchen
Anstalt des ?ffentlichen Rechts
Leiter EDV
Leopoldstra?e 15
80802 M?nchen
Tel: +49 89 38196-276
Fax: +49 89 38196-144
[email protected]
http://www.studentenwerk-muenchen.de/

2008-08-08 23:11:43

by Suresh Siddha

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

Walter, Viro,

As I can't test, can you please test this and Ack.

thanks,
suresh
---
[patch] fix via padlock instruction usage with kernel_fpu_begin/end()

Wolfgang Walter reported this oops on his via C3 using padlock for
AES-encryption:

##################################################################

BUG: unable to handle kernel NULL pointer dereference at 000001f0
IP: [<c01028c5>] __switch_to+0x30/0x117
*pde = 00000000
Oops: 0002 [#1] PREEMPT
Modules linked in:

Pid: 2071, comm: sleep Not tainted (2.6.26 #11)
EIP: 0060:[<c01028c5>] EFLAGS: 00010002 CPU: 0
EIP is at __switch_to+0x30/0x117
EAX: 00000000 EBX: c0493300 ECX: dc48dd00 EDX: c0493300
ESI: dc48dd00 EDI: c0493530 EBP: c04cff8c ESP: c04cff7c
DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
Process sleep (pid: 2071, ti=c04ce000 task=dc48dd00 task.ti=d2fe6000)
Stack: dc48df30 c0493300 00000000 00000000 d2fe7f44 c03b5b43 c04cffc8 00000046
c0131856 0000005a dc472d3c c0493300 c0493470 d983ae00 00002696 00000000
c0239f54 00000000 c04c4000 c04cffd8 c01025fe c04f3740 00049800 c04cffe0
Call Trace:
[<c03b5b43>] ? schedule+0x285/0x2ff
[<c0131856>] ? pm_qos_requirement+0x3c/0x53
[<c0239f54>] ? acpi_processor_idle+0x0/0x434
[<c01025fe>] ? cpu_idle+0x73/0x7f
[<c03a4dcd>] ? rest_init+0x61/0x63
=======================

Wolfgang also found out that adding kernel_fpu_begin() and kernel_fpu_end()
around the padlock instructions fix the oops.

Suresh wrote:

These padlock instructions though don't use/touch SSE registers, but it behaves
similar to other SSE instructions. For example, it might cause DNA faults
when cr0.ts is set. While this is a spurious DNA trap, it might cause
oops with the recent fpu code changes.

This is the code sequence that is probably causing this problem:

a) new app is getting exec'd and it is somewhere in between
start_thread() and flush_old_exec() in the load_xyz_binary()

b) At pont "a", task's fpu state (like TS_USEDFPU, used_math() etc) is
cleared.

c) Now we get an interrupt/softirq which starts using these encrypt/decrypt
routines in the network stack. This generates a math fault (as
cr0.ts is '1') which sets TS_USEDFPU and restores the math that is
in the task's xstate.

d) Return to exec code path, which does start_thread() which does
free_thread_xstate() and sets xstate pointer to NULL while
the TS_USEDFPU is still set.

e) At the next context switch from the new exec'd task to another task,
we have a scenarios where TS_USEDFPU is set but xstate pointer is null.
This can cause an oops during unlazy_fpu() in __switch_to()

Now:

1) This should happen with or with out pre-emption. Viro also encountered
similar problem with out CONFIG_PREEMPT.

2) kernel_fpu_begin() and kernel_fpu_end() will fix this problem, because
kernel_fpu_begin() will manually do a clts() and won't run in to the
situation of setting TS_USEDFPU in step "c" above.

3) This was working before the fpu changes, because its a spurious
math fault which doesn't corrupt any fpu/sse registers and the task's
math state was always in an allocated state.

With out the recent dynamic fpu allocation changes, while we don't see oops,
there is a possible race still present in older kernels(for example,
while kernel is using kernel_fpu_begin() in some optimized clear/copy
page and an interrupt/softirq happens which uses these padlock
instructions generating DNA fault).

For now, fix the padlock instruction usage by calling them inside the
context of kernel_fpu_begin() and kernel_fpu_end()

Next steps:

a) Based on the need, possible introduction of light weight kernel_fpu_*
routines which will optimize the padlock usage case, where they don't
touch SSE/FPU registers, but generate DNA.

b) Looking deeper, do we need to disable interrupts in the kernel_fpu_begin()?
Is there a recursive case, where interrupt context also touches FPU/SSE
registers?

Reported-and-bisected-by: Wolfgang Walter <[email protected]>
Signed-off-by: Wolfgang Walter <[email protected]>
Signed-off-by: Suresh Siddha <[email protected]>
---

diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c
index f7feae4..3dee9e5 100644
--- a/drivers/char/hw_random/via-rng.c
+++ b/drivers/char/hw_random/via-rng.c
@@ -31,6 +31,7 @@
#include <asm/io.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
+#include <asm/i387.h>

#define PFX KBUILD_MODNAME ": "
@@ -67,16 +68,22 @@ enum {
* Another possible performance boost may come from simply buffering
* until we have 4 bytes, thus returning a u32 at a time,
* instead of the current u8-at-a-time.
+ *
+ * Padlock instructions can generate a spurious DNA fault, so
+ * we will call them in the context of kernel_fpu_[begin,end].
*/

static inline u32 xstore(u32 *addr, u32 edx_in)
{
u32 eax_out;

+ kernel_fpu_begin();
+
asm(".byte 0x0F,0xA7,0xC0 /* xstore %%edi (addr=%0) */"
:"=m"(*addr), "=a"(eax_out)
:"D"(addr), "d"(edx_in));

+ kernel_fpu_end();
return eax_out;
}

diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 54a2a16..2c96d85 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <asm/byteorder.h>
+#include <asm/i387.h>
#include "padlock.h"

/* Control word. */
@@ -141,6 +142,12 @@ static inline void padlock_reset_key(void)
asm volatile ("pushfl; popfl");
}

+/*
+ * While the padlock instructions don't use FP/SSE registers, they
+ * generate a spurious DNA fault when cr0.ts is '1'. These instructions
+ * should be used only inside the kernel_fpu_[begin, end] context.
+ */
+
static inline void padlock_xcrypt(const u8 *input, u8 *output, void *key,
void *control_word)
{
@@ -206,14 +213,20 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
padlock_reset_key();
+
+ kernel_fpu_begin();
aes_crypt(in, out, ctx->E, &ctx->cword.encrypt);
+ kernel_fpu_end();
}

static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
padlock_reset_key();
+
+ kernel_fpu_begin();
aes_crypt(in, out, ctx->D, &ctx->cword.decrypt);
+ kernel_fpu_end();
}

static struct crypto_alg aes_alg = {
@@ -250,6 +263,7 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);

+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
padlock_xcrypt_ecb(walk.src.virt.addr, walk.dst.virt.addr,
ctx->E, &ctx->cword.encrypt,
@@ -257,6 +271,7 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();

return err;
}
@@ -274,6 +289,7 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);

+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
padlock_xcrypt_ecb(walk.src.virt.addr, walk.dst.virt.addr,
ctx->D, &ctx->cword.decrypt,
@@ -281,6 +297,7 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();

return err;
}
@@ -320,6 +337,7 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);

+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
u8 *iv = padlock_xcrypt_cbc(walk.src.virt.addr,
walk.dst.virt.addr, ctx->E,
@@ -329,6 +347,7 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();

return err;
}
@@ -346,6 +365,7 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);

+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
padlock_xcrypt_cbc(walk.src.virt.addr, walk.dst.virt.addr,
ctx->D, walk.iv, &ctx->cword.decrypt,
@@ -353,6 +373,7 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();

return err;
}
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index 40d5680..cea8830 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -22,6 +22,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/scatterlist.h>
+#include <asm/i387.h>
#include "padlock.h"

#define SHA1_DEFAULT_FALLBACK "sha1-generic"
@@ -109,9 +110,12 @@ static void padlock_do_sha1(const char *in, char *out, int count)
((uint32_t *)result)[3] = SHA1_H3;
((uint32_t *)result)[4] = SHA1_H4;

+ /* prevent taking the spurious DNA fault with padlock. */
+ kernel_fpu_begin();
asm volatile (".byte 0xf3,0x0f,0xa6,0xc8" /* rep xsha1 */
: "+S"(in), "+D"(result)
: "c"(count), "a"(0));
+ kernel_fpu_end();

padlock_output_block((uint32_t *)result, (uint32_t *)out, 5);
}
@@ -133,9 +137,12 @@ static void padlock_do_sha256(const char *in, char *out, int count)
((uint32_t *)result)[6] = SHA256_H6;
((uint32_t *)result)[7] = SHA256_H7;

+ /* prevent taking the spurious DNA fault with padlock. */
+ kernel_fpu_begin();
asm volatile (".byte 0xf3,0x0f,0xa6,0xd0" /* rep xsha256 */
: "+S"(in), "+D"(result)
: "c"(count), "a"(0));
+ kernel_fpu_end();

padlock_output_block((uint32_t *)result, (uint32_t *)out, 8);
}

2008-08-08 23:15:26

by Suresh Siddha

[permalink] [raw]

Subject: Re: Kernel oops with 2.6.26, padlock and ipsec: probably problem with fpu state changes

On Fri, Aug 08, 2008 at 04:10:42PM -0700, Wolfgang Walter wrote:
> By the way, if padlock is used from userspace, could this be aproblem, too? I
> mean not via the kernel crypto layer but directly. I think these extensions
> are not restricted to a privileged user.

No. Userspace will not have any such issue. Kernel is special, as it
has to preserve the user-space context and also use these instructions
for its own kernel purpose.

It is ok to get a spurious DNA from userland.

thanks,
suresh

2008-08-09 00:39:04