From: David Howells
To: torvalds@transmeta.com
Cc: linux-kernel@vger.kernel.org, dhowells@redhat.com
Subject: [PATCH] syscall latency improvement #1
Date: Fri, 25 Jan 2002 18:54:02 +0000
Message-ID: <18993.1011984842@warthog.cambridge.redhat.com>

Hi Linus,

The attached patch does the following to 2.5.3-pre5:

 * consolidates various status items found in the lower reaches of
   task_struct into one 32-bit word, allowing them to be tested atomically
   without the need to disable interrupts in entry.S (a rough sketch of the
   idea follows this list)

 * optimises the instructions in the system_call path in entry.S

 * frees up a hole in the bottom part of the task_struct (on the 1st cache
   line)

 * improves base syscall latency by approximately 5.4% (dual PIII) or 3.6%
   (dual Athlon), as measured by lmbench's "lat_syscall null" command
   against the vanilla kernel
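Purely for illustration (this is not part of the patch, and the little test
program below is hypothetical), here is a userspace sketch of the first
point: the four status bytes share one aligned 32-bit word, so a single
load can inspect all of them at once, and masking that word gives the
"ignore the syscall trace counter" test that the patched entry.S performs
with "andl $0xffff00ff":

/*
 * Hypothetical userspace sketch, not kernel code: it mimics the layout of
 * the struct task_work added to sched.h so that all four flags can be
 * checked with one 32-bit load.
 */
#include <stdio.h>
#include <string.h>

struct task_work {
        signed char   need_resched;
        unsigned char syscall_trace;    /* count of syscall interceptors */
        unsigned char sigpending;
        unsigned char notify_resume;    /* notify on return to userspace */
} __attribute__((packed));

/* one aligned load, the C analogue of "movl work(%ebx),%ecx" */
static unsigned int work_word(const struct task_work *w)
{
        unsigned int word;
        memcpy(&word, w, sizeof(word));
        return word;
}

int main(void)
{
        struct task_work work;

        memset(&work, 0, sizeof(work));
        work.sigpending = 1;

        if (work_word(&work) != 0)
                printf("some work pending: %#x\n", work_word(&work));

        /* on a little-endian layout this is the "andl $0xffff00ff,%ecx"
         * test: everything except the syscall_trace byte */
        if (work_word(&work) & 0xffff00ffu)
                printf("resched/signal/notify-resume work pending\n");

        return 0;
}

The real flags live in task_struct and are read by the assembly in entry.S,
of course; the point of packing them is that the old cli / cmpl need_resched
/ cmpl sigpending sequence on the return path collapses into a single load
and test with no need to disable interrupts.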
Most notable are the changes to the following files:

        arch/i386/kernel/entry.S
        include/linux/sched.h

David

diff -uNr linux-2.5.3-pre5/arch/i386/kernel/entry.S linux-work-253p5/arch/i386/kernel/entry.S
--- linux-2.5.3-pre5/arch/i386/kernel/entry.S   Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/entry.S   Fri Jan 25 15:01:07 2002
@@ -72,10 +72,13 @@
  */
 state = 0
 flags = 4
-sigpending = 8
+work = 8
+need_resched = work+0
+syscall_trace = work+1
+sigpending = work+2
+notify_resume = work+3
 addr_limit = 12
 exec_domain = 16
-need_resched = 20
 tsk_ptrace = 24
 processor = 52
 
@@ -151,7 +154,7 @@
        call *%edx
        addl $4, %esp
        popl %eax
-       jmp ret_from_sys_call
+       jmp resume_userspace
 
 ENTRY(lcall27)
        pushfl                  # We get a different stack layout with call gates,
@@ -172,7 +175,7 @@
        call *%edx
        addl $4, %esp
        popl %eax
-       jmp ret_from_sys_call
+       jmp resume_userspace
 
 
 ENTRY(ret_from_fork)
@@ -180,9 +183,7 @@
        call SYMBOL_NAME(schedule_tail)
        addl $4, %esp
        GET_CURRENT(%ebx)
-       testb $0x02,tsk_ptrace(%ebx)    # PT_TRACESYS
-       jne tracesys_exit
-       jmp ret_from_sys_call
+       jmp syscall_exit
 
 /*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
  * go as quickly as possible which is why some of this is
@@ -191,73 +192,99 @@
  * less clear than it otherwise should be.
  */
+
+       # userspace resumption stub bypassing syscall exit tracing
+       ALIGN
+ENTRY(ret_from_intr)
+       GET_CURRENT(%ebx)
+ret_from_exception:
+       movl EFLAGS(%esp),%eax          # mix EFLAGS and CS
+       movb CS(%esp),%al
+       testl $(VM_MASK | 3),%eax
+       jz restore_all                  # returning to kernel-space or vm86-space
+       sti                             # we may have come from an interrupt handler
+ENTRY(resume_userspace)
+       movl work(%ebx),%ecx
+       andl $0xffff00ff,%ecx           # current->work (ignoring syscall_trace)
+       jne work_pending
+       jmp restore_all
+
+       # system call handler stub
+       ALIGN
 ENTRY(system_call)
        pushl %eax                      # save orig_eax
        SAVE_ALL
        GET_CURRENT(%ebx)
-       testb $0x02,tsk_ptrace(%ebx)    # PT_TRACESYS
-       jne tracesys
        cmpl $(NR_syscalls),%eax
-       jae badsys
+       jae syscall_badsys
+       testb $0xff,syscall_trace(%ebx) # system call tracing in operation
+       jnz syscall_trace_entry
+syscall_traced:
        call *SYMBOL_NAME(sys_call_table)(,%eax,4)
-       movl %eax,EAX(%esp)             # save the return value
-ENTRY(ret_from_sys_call)
-       cli                             # need_resched and signals atomic test
-       cmpl $0,need_resched(%ebx)
-       jne reschedule
-       cmpl $0,sigpending(%ebx)
-       jne signal_return
+       movl %eax,EAX(%esp)             # store the return value
+syscall_exit:
+       movl work(%ebx),%ecx
+       testl %ecx,%ecx                 # current->work
+       jne syscall_exit_work
 restore_all:
        RESTORE_ALL
 
+       # perform work that needs to be done immediately before resumption
        ALIGN
-signal_return:
-       sti                             # we can get here from an interrupt handler
+work_pending:
+       testb %cl,%cl                   # current->work.need_resched
+       jz work_notifysig
+work_resched:
+       call SYMBOL_NAME(schedule)
+       movl work(%ebx),%ecx
+       andl $0xffff00ff,%ecx           # ignore the syscall trace counter
+       jz restore_all
+       testb %cl,%cl                   # current->work.need_resched
+       jnz work_resched
+
+work_notifysig:                        # deal with pending signals and notify-resume requests
        testl $(VM_MASK),EFLAGS(%esp)
        movl %esp,%eax
-       jne v86_signal_return
+       jne work_notifysig_v86          # returning to kernel-space or vm86-space
        xorl %edx,%edx
-       call SYMBOL_NAME(do_signal)
+       call SYMBOL_NAME(do_notify_resume)
        jmp restore_all
 
        ALIGN
-v86_signal_return:
+work_notifysig_v86:
+       pushl %ecx
        call SYMBOL_NAME(save_v86_state)
+       popl %ecx
        movl %eax,%esp
        xorl %edx,%edx
-       call SYMBOL_NAME(do_signal)
+       call SYMBOL_NAME(do_notify_resume)
        jmp restore_all
 
+       # perform syscall entry tracing
        ALIGN
-tracesys:
+syscall_trace_entry:
        movl $-ENOSYS,EAX(%esp)
-       call SYMBOL_NAME(syscall_trace)
+       movl %esp,%eax
+       xorl %edx,%edx
+       call SYMBOL_NAME(do_syscall_trace)
        movl ORIG_EAX(%esp),%eax
        cmpl $(NR_syscalls),%eax
-       jae tracesys_exit
-       call *SYMBOL_NAME(sys_call_table)(,%eax,4)
-       movl %eax,EAX(%esp)             # save the return value
-tracesys_exit:
-       call SYMBOL_NAME(syscall_trace)
-       jmp ret_from_sys_call
-badsys:
-       movl $-ENOSYS,EAX(%esp)
-       jmp ret_from_sys_call
+       jnae syscall_traced
+       jmp syscall_exit
 
+       # perform syscall exit tracing
        ALIGN
-ENTRY(ret_from_intr)
-       GET_CURRENT(%ebx)
-ret_from_exception:
-       movl EFLAGS(%esp),%eax          # mix EFLAGS and CS
-       movb CS(%esp),%al
-       testl $(VM_MASK | 3),%eax       # return to VM86 mode or non-supervisor?
-       jne ret_from_sys_call
-       jmp restore_all
+syscall_exit_work:
+       testb %ch,%ch                   # current->work.syscall_trace
+       jz work_pending
+       movl %esp,%eax
+       movl $1,%edx
+       call SYMBOL_NAME(do_syscall_trace)
+       jmp resume_userspace
 
        ALIGN
-reschedule:
-       call SYMBOL_NAME(schedule)      # test
-       jmp ret_from_sys_call
+syscall_badsys:
+       movl $-ENOSYS,EAX(%esp)
+       jmp resume_userspace
 
 ENTRY(divide_error)
        pushl $0                        # no error code
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/process.c linux-work-253p5/arch/i386/kernel/process.c
--- linux-2.5.3-pre5/arch/i386/kernel/process.c Fri Jan 25 14:52:14 2002
+++ linux-work-253p5/arch/i386/kernel/process.c Fri Jan 25 15:01:07 2002
@@ -89,7 +89,7 @@
 /*
  * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
+ * to poll the ->work.need_resched flag instead of waiting for the
  * cross-CPU IPI to arrive. Use this option with caution.
  */
 static void poll_idle (void)
@@ -102,15 +102,15 @@
         * Deal with another CPU just having chosen a thread to
         * run here:
         */
-       oldval = xchg(&current->need_resched, -1);
+       oldval = xchg(&current->work.need_resched, -1);
 
        if (!oldval)
                asm volatile(
                        "2:"
-                       "cmpl $-1, %0;"
+                       "cmpb $-1, %0;"
                        "rep; nop;"
                        "je 2b;"
-                       : :"m" (current->need_resched));
+                       : :"m" (current->work.need_resched));
 }
 
 /*
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/ptrace.c linux-work-253p5/arch/i386/kernel/ptrace.c
--- linux-2.5.3-pre5/arch/i386/kernel/ptrace.c  Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/ptrace.c  Fri Jan 25 15:01:07 2002
@@ -277,10 +277,18 @@
                ret = -EIO;
                if ((unsigned long) data > _NSIG)
                        break;
-               if (request == PTRACE_SYSCALL)
-                       child->ptrace |= PT_TRACESYS;
-               else
-                       child->ptrace &= ~PT_TRACESYS;
+               if (request == PTRACE_SYSCALL) {
+                       if (!(child->ptrace & PT_SYSCALLTRACE)) {
+                               child->ptrace |= PT_SYSCALLTRACE;
+                               child->work.syscall_trace++;
+                       }
+               }
+               else {
+                       if (child->ptrace & PT_SYSCALLTRACE) {
+                               child->ptrace &= ~PT_SYSCALLTRACE;
+                               child->work.syscall_trace--;
+                       }
+               }
                child->exit_code = data;
        /* make sure the single step bit is not set. */
                tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
@@ -315,7 +323,10 @@
                ret = -EIO;
                if ((unsigned long) data > _NSIG)
                        break;
-               child->ptrace &= ~PT_TRACESYS;
+               if (child->ptrace & PT_SYSCALLTRACE) {
+                       child->ptrace &= ~PT_SYSCALLTRACE;
+                       child->work.syscall_trace--;
+               }
                if ((child->ptrace & PT_DTRACE) == 0) {
                        /* Spurious delayed TF traps may occur */
                        child->ptrace |= PT_DTRACE;
@@ -439,10 +450,14 @@
        return ret;
 }
 
-asmlinkage void syscall_trace(void)
+/* notification of system call entry/exit
+ * - triggered by current->work.syscall_trace
+ */
+__attribute__((regparm(3)))
+void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
-       if ((current->ptrace & (PT_PTRACED|PT_TRACESYS)) !=
-                       (PT_PTRACED|PT_TRACESYS))
+       if ((current->ptrace & (PT_PTRACED|PT_SYSCALLTRACE)) !=
+                       (PT_PTRACED|PT_SYSCALLTRACE))
                return;
        /* the 0x80 provides a way for the tracing parent to distinguish
           between a syscall stop and SIGTRAP delivery */
@@ -461,3 +476,15 @@
                current->exit_code = 0;
        }
 }
+
+/* notification of userspace execution resumption
+ * - triggered by current->work.notify_resume
+ */
+__attribute__((regparm(3)))
+void do_notify_resume(struct pt_regs *regs, sigset_t *oldset,
+                     struct task_work work_pending)
+{
+       /* deal with pending signal delivery */
+       if (work_pending.sigpending)
+               do_signal(regs,oldset);
+}
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/signal.c linux-work-253p5/arch/i386/kernel/signal.c
--- linux-2.5.3-pre5/arch/i386/kernel/signal.c  Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/signal.c  Fri Jan 25 15:01:07 2002
@@ -28,8 +28,6 @@
 
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
-int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
-
 int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from)
 {
        if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/vm86.c linux-work-253p5/arch/i386/kernel/vm86.c
--- linux-2.5.3-pre5/arch/i386/kernel/vm86.c    Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/vm86.c    Fri Jan 25 15:01:07 2002
@@ -212,7 +212,7 @@
        info->regs.__null_ds = 0;
        info->regs.__null_es = 0;
 
-/* we are clearing fs,gs later just before "jmp ret_from_sys_call",
+/* we are clearing fs,gs later just before "jmp resume_userspace",
  * because starting with Linux 2.1.x they aren't no longer saved/restored
  */
 
@@ -255,7 +255,7 @@
        __asm__ __volatile__(
                "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
                "movl %0,%%esp\n\t"
-               "jmp ret_from_sys_call"
+               "jmp resume_userspace"
                : /* no outputs */
                :"r" (&info->regs), "b" (tsk) : "ax");
        /* we never return here */
@@ -268,7 +268,7 @@
        regs32 = save_v86_state(regs16);
        regs32->eax = retval;
        __asm__ __volatile__("movl %0,%%esp\n\t"
-               "jmp ret_from_sys_call"
+               "jmp resume_userspace"
                : : "r" (regs32), "b" (current));
 }
 
diff -uNr linux-2.5.3-pre5/fs/lockd/svc.c linux-work-253p5/fs/lockd/svc.c
--- linux-2.5.3-pre5/fs/lockd/svc.c     Tue Jan 22 09:05:58 2002
+++ linux-work-253p5/fs/lockd/svc.c     Fri Jan 25 15:01:07 2002
@@ -304,7 +304,7 @@
         * Wait for the lockd process to exit, but since we're holding
         * the lockd semaphore, we can't wait around forever ...
         */
-       current->sigpending = 0;
+       current->work.sigpending = 0;
        interruptible_sleep_on_timeout(&lockd_exit, HZ);
        if (nlmsvc_pid) {
                printk(KERN_WARNING
diff -uNr linux-2.5.3-pre5/fs/nfsd/export.c linux-work-253p5/fs/nfsd/export.c
--- linux-2.5.3-pre5/fs/nfsd/export.c   Tue Jan 22 09:05:58 2002
+++ linux-work-253p5/fs/nfsd/export.c   Fri Jan 25 15:01:07 2002
@@ -468,7 +468,7 @@
                return 0;
        }
 
-       current->sigpending = 0;
+       current->work.sigpending = 0;
        want_lock++;
        while (hash_count || hash_lock) {
                interruptible_sleep_on(&hash_wait);
diff -uNr linux-2.5.3-pre5/include/asm-i386/signal.h linux-work-253p5/include/asm-i386/signal.h
--- linux-2.5.3-pre5/include/asm-i386/signal.h  Thu Jan 24 14:53:26 2002
+++ linux-work-253p5/include/asm-i386/signal.h  Fri Jan 25 15:05:45 2002
@@ -2,6 +2,7 @@
 #define _ASMi386_SIGNAL_H
 
 #include
+#include
 
 /* Avoid too many header ordering problems.  */
 struct siginfo;
@@ -216,6 +217,8 @@
        return word;
 }
 
+extern int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
+
 #endif /* __KERNEL__ */
 
 #endif
diff -uNr linux-2.5.3-pre5/include/linux/init_task.h linux-work-253p5/include/linux/init_task.h
--- linux-2.5.3-pre5/include/linux/init_task.h  Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/include/linux/init_task.h  Fri Jan 25 15:12:17 2002
@@ -35,6 +35,14 @@
        siglock:        SPIN_LOCK_UNLOCKED      \
 }
 
+#define INIT_TASK_WORK \
+{ \
+       need_resched:   0,      \
+       syscall_trace:  0,      \
+       sigpending:     0,      \
+       notify_resume:  0,      \
+}
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -43,7 +51,7 @@
 {                                                              \
        state:          0,                                      \
        flags:          0,                                      \
-       sigpending:     0,                                      \
+       work:           INIT_TASK_WORK,                         \
        addr_limit:     KERNEL_DS,                              \
        exec_domain:    &default_exec_domain,                   \
        lock_depth:     -1,                                     \
diff -uNr linux-2.5.3-pre5/include/linux/sched.h linux-work-253p5/include/linux/sched.h
--- linux-2.5.3-pre5/include/linux/sched.h      Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/include/linux/sched.h      Fri Jan 25 15:05:45 2002
@@ -228,19 +228,29 @@
 
 typedef struct prio_array prio_array_t;
 
+/* this struct must occupy one 32-bit chunk so that it can be read in one go */
+struct task_work {
+       __s8    need_resched;
+       __u8    syscall_trace;  /* count of syscall interceptors */
+       __u8    sigpending;
+       __u8    notify_resume;  /* request for notification on
+                                  userspace execution resumption */
+} __attribute__((packed));
+
 struct task_struct {
        /*
         * offsets of these are hardcoded elsewhere - touch with care
         */
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        unsigned long flags;    /* per process flags, defined below */
-       int sigpending;
+       volatile struct task_work work;
+
        mm_segment_t addr_limit;        /* thread address space:
                                                0-0xBFFFFFFF for user-thead
                                                0-0xFFFFFFFF for kernel-thread
                                         */
        struct exec_domain *exec_domain;
-       volatile long need_resched;
+       long __pad;
        unsigned long ptrace;
 
        int lock_depth;         /* Lock depth */
@@ -381,7 +391,7 @@
  */
 #define PT_PTRACED     0x00000001
-#define PT_TRACESYS    0x00000002
+#define PT_SYSCALLTRACE        0x00000002      /* T if syscall_trace is +1 for ptrace() */
 #define PT_DTRACE      0x00000004      /* delayed trace (used on m68k, i386) */
 #define PT_TRACESYSGOOD        0x00000008
 #define PT_PTRACE_CAP  0x00000010      /* ptracer can follow suid-exec */
@@ -564,12 +574,12 @@
 static inline int signal_pending(struct task_struct *p)
 {
-       return (p->sigpending != 0);
+       return (p->work.sigpending != 0);
 }
 
 static inline int need_resched(void)
 {
-       return unlikely(current->need_resched != 0);
+       return unlikely(current->work.need_resched != 0);
 }
 
 static inline void cond_resched(void)
@@ -614,7 +624,7 @@
 
 static inline void
 recalc_sigpending(struct task_struct *t)
 {
-       t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+       t->work.sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
 }
 
 /* True if we are on the alternate signal stack. */
diff -uNr linux-2.5.3-pre5/kernel/fork.c linux-work-253p5/kernel/fork.c
--- linux-2.5.3-pre5/kernel/fork.c      Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/kernel/fork.c      Fri Jan 25 15:01:07 2002
@@ -631,7 +631,7 @@
        }
 
        spin_lock_init(&p->alloc_lock);
-       p->sigpending = 0;
+       p->work.sigpending = 0;
        init_sigpending(&p->pending);
 
        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
@@ -756,7 +756,7 @@
         * Let the child process run first, to avoid most of the
         * COW overhead when the child exec()s afterwards.
         */
-       current->need_resched = 1;
+       current->work.need_resched = 1;
 
 fork_out:
        return retval;
diff -uNr linux-2.5.3-pre5/kernel/sched.c linux-work-253p5/kernel/sched.c
--- linux-2.5.3-pre5/kernel/sched.c     Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/kernel/sched.c     Fri Jan 25 15:01:07 2002
@@ -176,9 +176,9 @@
 {
        int need_resched;
 
-       need_resched = p->need_resched;
+       need_resched = p->work.need_resched;
        wmb();
-       p->need_resched = 1;
+       p->work.need_resched = 1;
        if (!need_resched && (p->cpu != smp_processor_id()))
                smp_send_reschedule(p->cpu);
 }
@@ -483,7 +483,7 @@
                this_rq->nr_running++;
                enqueue_task(next, this_rq->active);
                if (next->prio < current->prio)
-                       current->need_resched = 1;
+                       current->work.need_resched = 1;
                if (!idle && --imbalance) {
                        if (array == busiest->expired) {
                                array = busiest->active;
@@ -528,7 +528,7 @@
                return idle_tick();
        /* Task might have expired already, but not scheduled off yet */
        if (p->array != rq->active) {
-               p->need_resched = 1;
+               p->work.need_resched = 1;
                return;
        }
        spin_lock(&rq->lock);
@@ -539,7 +539,7 @@
         */
        if ((p->policy == SCHED_RR) && !--p->time_slice) {
                p->time_slice = NICE_TO_TIMESLICE(p->__nice);
-               p->need_resched = 1;
+               p->work.need_resched = 1;
 
                /* put it at the end of the queue: */
                dequeue_task(p, rq->active);
@@ -559,7 +559,7 @@
                        p->sleep_avg--;
        if (!--p->time_slice) {
                dequeue_task(p, rq->active);
-               p->need_resched = 1;
+               p->work.need_resched = 1;
                p->prio = effective_prio(p);
                p->time_slice = NICE_TO_TIMESLICE(p->__nice);
                enqueue_task(p, TASK_INTERACTIVE(p) ?
                                                rq->active : rq->expired);
@@ -622,7 +622,7 @@
        next = list_entry(queue->next, task_t, run_list);
 
 switch_tasks:
-       prev->need_resched = 0;
+       prev->work.need_resched = 0;
 
        if (likely(prev != next)) {
                rq->nr_switches++;
@@ -1246,7 +1246,7 @@
        current->prio = MAX_PRIO;
        current->state = TASK_RUNNING;
        double_rq_unlock(this_rq, rq);
-       current->need_resched = 1;
+       current->work.need_resched = 1;
        __restore_flags(flags);
 }
 
diff -uNr linux-2.5.3-pre5/kernel/signal.c linux-work-253p5/kernel/signal.c
--- linux-2.5.3-pre5/kernel/signal.c    Tue Jan 22 09:06:00 2002
+++ linux-work-253p5/kernel/signal.c    Fri Jan 25 15:01:07 2002
@@ -105,7 +105,7 @@
 
 void
 flush_signals(struct task_struct *t)
 {
-       t->sigpending = 0;
+       t->work.sigpending = 0;
        flush_sigqueue(&t->pending);
 }
@@ -119,7 +119,7 @@
                if (atomic_dec_and_test(&sig->count))
                        kmem_cache_free(sigact_cachep, sig);
        }
-       tsk->sigpending = 0;
+       tsk->work.sigpending = 0;
        flush_sigqueue(&tsk->pending);
        spin_unlock_irq(&tsk->sigmask_lock);
 }
@@ -246,7 +246,7 @@
        if (current->notifier) {
                if (sigismember(current->notifier_mask, sig)) {
                        if (!(current->notifier)(current->notifier_data)) {
-                               current->sigpending = 0;
+                               current->work.sigpending = 0;
                                return 0;
                        }
                }
@@ -465,7 +465,7 @@
  */
 static inline void signal_wake_up(struct task_struct *t)
 {
-       t->sigpending = 1;
+       t->work.sigpending = 1;
 
 #ifdef CONFIG_SMP
        /*
diff -uNr linux-2.5.3-pre5/net/sunrpc/sched.c linux-work-253p5/net/sunrpc/sched.c
--- linux-2.5.3-pre5/net/sunrpc/sched.c Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/net/sunrpc/sched.c Fri Jan 25 15:01:07 2002
@@ -1109,7 +1109,7 @@
        unsigned long flags;
 
        while (all_tasks) {
-               current->sigpending = 0;
+               current->work.sigpending = 0;
                rpc_killall_tasks(NULL);
                __rpc_schedule();
                if (all_tasks) {
@@ -1183,7 +1183,7 @@
                 * Usually rpciod will exit very quickly, so we
                 * wait briefly before checking the process id.
                 */
-               current->sigpending = 0;
+               current->work.sigpending = 0;
                yield();
                /*
                 * Display a message if we're going to wait longer.
diff -uNr linux-2.5.3-pre5/net/sunrpc/svc.c linux-work-253p5/net/sunrpc/svc.c
--- linux-2.5.3-pre5/net/sunrpc/svc.c   Tue Jan 22 09:06:09 2002
+++ linux-work-253p5/net/sunrpc/svc.c   Fri Jan 25 15:01:08 2002
@@ -185,7 +185,7 @@
                                progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port);
 
        if (!port)
-               current->sigpending = 0;
+               current->work.sigpending = 0;
 
        for (i = 0; i < progp->pg_nvers; i++) {
                if (progp->pg_vers[i] == NULL)