2002-01-25 18:58:23

by David Howells

Subject: [PATCH] syscall latency improvement #1


Hi Linus,

The attached patch does the following to 2.5.3-pre5:

* consolidates various status items that are found in the lower reaches of
task_struct into one 32-bit word (sketched below), thus allowing them to be
tested atomically without the need to disable interrupts in entry.S.

* optimises the instructions in the system_call path in entry.S

* frees up a hole in the bottom part of the task_struct (on the 1st cache
line).

* improves base syscall latency by approximately 5.4% (dual PIII) or 3.6%
(dual Athlon) as measured by lmbench's "lat_syscall null" command against
the vanilla kernel.
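
For reference, here is a standalone sketch of the consolidated word described
in the first point above. It mirrors the struct task_work added in the sched.h
hunk of the patch, with <stdint.h> types standing in for the kernel's __s8/__u8
so that it compiles on its own; entry.S reads the whole word with a single movl
at offset 8 ("work").

#include <stdint.h>
#include <stdio.h>

/* Sketch only: the real definition lives in include/linux/sched.h in the
 * patch below.  The four flags are packed so that the whole set can be
 * loaded and tested in one go. */
struct task_work {
	int8_t  need_resched;
	uint8_t syscall_trace;	/* count of syscall interceptors */
	uint8_t sigpending;
	uint8_t notify_resume;	/* notify on return to userspace */
} __attribute__((packed));

int main(void)
{
	/* should print 4: all four flags fit in one 32-bit word */
	printf("sizeof(struct task_work) = %lu\n",
	       (unsigned long) sizeof(struct task_work));
	return 0;
}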

Most notable are the changes to the following files:

arch/i386/kernel/entry.S
include/linux/sched.h

David

diff -uNr linux-2.5.3-pre5/arch/i386/kernel/entry.S linux-work-253p5/arch/i386/kernel/entry.S
--- linux-2.5.3-pre5/arch/i386/kernel/entry.S Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/entry.S Fri Jan 25 15:01:07 2002
@@ -72,10 +72,13 @@
*/
state = 0
flags = 4
-sigpending = 8
+work = 8
+need_resched = work+0
+syscall_trace = work+1
+sigpending = work+2
+notify_resume = work+3
addr_limit = 12
exec_domain = 16
-need_resched = 20
tsk_ptrace = 24
processor = 52

@@ -151,7 +154,7 @@
call *%edx
addl $4, %esp
popl %eax
- jmp ret_from_sys_call
+ jmp resume_userspace

ENTRY(lcall27)
pushfl # We get a different stack layout with call gates,
@@ -172,7 +175,7 @@
call *%edx
addl $4, %esp
popl %eax
- jmp ret_from_sys_call
+ jmp resume_userspace


ENTRY(ret_from_fork)
@@ -180,9 +183,7 @@
call SYMBOL_NAME(schedule_tail)
addl $4, %esp
GET_CURRENT(%ebx)
- testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
- jne tracesys_exit
- jmp ret_from_sys_call
+ jmp syscall_exit

/*
* Return to user mode is not as complex as all this looks,
@@ -191,73 +192,99 @@
* less clear than it otherwise should be.
*/

+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ENTRY(ret_from_intr)
+ GET_CURRENT(%ebx)
+ret_from_exception:
+ movl EFLAGS(%esp),%eax # mix EFLAGS and CS
+ movb CS(%esp),%al
+ testl $(VM_MASK | 3),%eax
+ jz restore_all # returning to kernel-space or vm86-space
+ sti # we may have come from an interrupt handler
+ENTRY(resume_userspace)
+ movl work(%ebx),%ecx
+ andl $0xffff00ff,%ecx # current->work (ignoring syscall_trace)
+ jne work_pending
+ jmp restore_all
+
+ # system call handler stub
+ ALIGN
ENTRY(system_call)
pushl %eax # save orig_eax
SAVE_ALL
GET_CURRENT(%ebx)
- testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
- jne tracesys
cmpl $(NR_syscalls),%eax
- jae badsys
+ jae syscall_badsys
+ testb $0xff,syscall_trace(%ebx) # system call tracing in operation
+ jnz syscall_trace_entry
+syscall_traced:
call *SYMBOL_NAME(sys_call_table)(,%eax,4)
- movl %eax,EAX(%esp) # save the return value
-ENTRY(ret_from_sys_call)
- cli # need_resched and signals atomic test
- cmpl $0,need_resched(%ebx)
- jne reschedule
- cmpl $0,sigpending(%ebx)
- jne signal_return
+ movl %eax,EAX(%esp) # store the return value
+syscall_exit:
+ movl work(%ebx),%ecx
+ testl %ecx,%ecx # current->work
+ jne syscall_exit_work
restore_all:
RESTORE_ALL

+ # perform work that needs to be done immediately before resumption
ALIGN
-signal_return:
- sti # we can get here from an interrupt handler
+work_pending:
+ testb %cl,%cl # current->work.need_resched
+ jz work_notifysig
+work_resched:
+ call SYMBOL_NAME(schedule)
+ movl work(%ebx),%ecx
+ andl $0xffff00ff,%ecx # ignore the syscall trace counter
+ jz restore_all
+ testb %cl,%cl # current->work.need_resched
+ jnz work_resched
+
+work_notifysig: # deal with pending signals and notify-resume requests
testl $(VM_MASK),EFLAGS(%esp)
movl %esp,%eax
- jne v86_signal_return
+ jne work_notifysig_v86 # returning to kernel-space or vm86-space
xorl %edx,%edx
- call SYMBOL_NAME(do_signal)
+ call SYMBOL_NAME(do_notify_resume)
jmp restore_all

ALIGN
-v86_signal_return:
+work_notifysig_v86:
+ pushl %ecx
call SYMBOL_NAME(save_v86_state)
+ popl %ecx
movl %eax,%esp
xorl %edx,%edx
- call SYMBOL_NAME(do_signal)
+ call SYMBOL_NAME(do_notify_resume)
jmp restore_all

+ # perform syscall exit tracing
ALIGN
-tracesys:
+syscall_trace_entry:
movl $-ENOSYS,EAX(%esp)
- call SYMBOL_NAME(syscall_trace)
+ movl %esp,%eax
+ xorl %edx,%edx
+ call SYMBOL_NAME(do_syscall_trace)
movl ORIG_EAX(%esp),%eax
cmpl $(NR_syscalls),%eax
- jae tracesys_exit
- call *SYMBOL_NAME(sys_call_table)(,%eax,4)
- movl %eax,EAX(%esp) # save the return value
-tracesys_exit:
- call SYMBOL_NAME(syscall_trace)
- jmp ret_from_sys_call
-badsys:
- movl $-ENOSYS,EAX(%esp)
- jmp ret_from_sys_call
+ jnae syscall_traced
+ jmp syscall_exit

+ # perform syscall exit tracing
ALIGN
-ENTRY(ret_from_intr)
- GET_CURRENT(%ebx)
-ret_from_exception:
- movl EFLAGS(%esp),%eax # mix EFLAGS and CS
- movb CS(%esp),%al
- testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor?
- jne ret_from_sys_call
- jmp restore_all
+syscall_exit_work:
+ testb %ch,%ch # current->work.syscall_trace
+ jz work_pending
+ movl %esp,%eax
+ movl $1,%edx
+ call SYMBOL_NAME(do_syscall_trace)
+ jmp resume_userspace

ALIGN
-reschedule:
- call SYMBOL_NAME(schedule) # test
- jmp ret_from_sys_call
+syscall_badsys:
+ movl $-ENOSYS,EAX(%esp)
+ jmp resume_userspace

ENTRY(divide_error)
pushl $0 # no error code
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/process.c linux-work-253p5/arch/i386/kernel/process.c
--- linux-2.5.3-pre5/arch/i386/kernel/process.c Fri Jan 25 14:52:14 2002
+++ linux-work-253p5/arch/i386/kernel/process.c Fri Jan 25 15:01:07 2002
@@ -89,7 +89,7 @@

/*
* On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
+ * to poll the ->work.need_resched flag instead of waiting for the
* cross-CPU IPI to arrive. Use this option with caution.
*/
static void poll_idle (void)
@@ -102,15 +102,15 @@
* Deal with another CPU just having chosen a thread to
* run here:
*/
- oldval = xchg(&current->need_resched, -1);
+ oldval = xchg(&current->work.need_resched, -1);

if (!oldval)
asm volatile(
"2:"
- "cmpl $-1, %0;"
+ "cmpb $-1, %0;"
"rep; nop;"
"je 2b;"
- : :"m" (current->need_resched));
+ : :"m" (current->work.need_resched));
}

/*
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/ptrace.c linux-work-253p5/arch/i386/kernel/ptrace.c
--- linux-2.5.3-pre5/arch/i386/kernel/ptrace.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/ptrace.c Fri Jan 25 15:01:07 2002
@@ -277,10 +277,18 @@
ret = -EIO;
if ((unsigned long) data > _NSIG)
break;
- if (request == PTRACE_SYSCALL)
- child->ptrace |= PT_TRACESYS;
- else
- child->ptrace &= ~PT_TRACESYS;
+ if (request == PTRACE_SYSCALL) {
+ if (!(child->ptrace & PT_SYSCALLTRACE)) {
+ child->ptrace |= PT_SYSCALLTRACE;
+ child->work.syscall_trace++;
+ }
+ }
+ else {
+ if (child->ptrace & PT_SYSCALLTRACE) {
+ child->ptrace &= ~PT_SYSCALLTRACE;
+ child->work.syscall_trace--;
+ }
+ }
child->exit_code = data;
/* make sure the single step bit is not set. */
tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
@@ -315,7 +323,10 @@
ret = -EIO;
if ((unsigned long) data > _NSIG)
break;
- child->ptrace &= ~PT_TRACESYS;
+ if (child->ptrace & PT_SYSCALLTRACE) {
+ child->ptrace &= ~PT_SYSCALLTRACE;
+ child->work.syscall_trace--;
+ }
if ((child->ptrace & PT_DTRACE) == 0) {
/* Spurious delayed TF traps may occur */
child->ptrace |= PT_DTRACE;
@@ -439,10 +450,14 @@
return ret;
}

-asmlinkage void syscall_trace(void)
+/* notification of system call entry/exit
+ * - triggered by current->work.syscall_trace
+ */
+__attribute__((regparm(3)))
+void do_syscall_trace(struct pt_regs *regs, int entryexit)
{
- if ((current->ptrace & (PT_PTRACED|PT_TRACESYS)) !=
- (PT_PTRACED|PT_TRACESYS))
+ if ((current->ptrace & (PT_PTRACED|PT_SYSCALLTRACE)) !=
+ (PT_PTRACED|PT_SYSCALLTRACE))
return;
/* the 0x80 provides a way for the tracing parent to distinguish
between a syscall stop and SIGTRAP delivery */
@@ -461,3 +476,15 @@
current->exit_code = 0;
}
}
+
+/* notification of userspace execution resumption
+ * - triggered by current->work.notify_resume
+ */
+__attribute__((regparm(3)))
+void do_notify_resume(struct pt_regs *regs, sigset_t *oldset,
+ struct task_work work_pending)
+{
+ /* deal with pending signal delivery */
+ if (work_pending.sigpending)
+ do_signal(regs,oldset);
+}
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/signal.c linux-work-253p5/arch/i386/kernel/signal.c
--- linux-2.5.3-pre5/arch/i386/kernel/signal.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/signal.c Fri Jan 25 15:01:07 2002
@@ -28,8 +28,6 @@

#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))

-int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
-
int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from)
{
if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/vm86.c linux-work-253p5/arch/i386/kernel/vm86.c
--- linux-2.5.3-pre5/arch/i386/kernel/vm86.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/vm86.c Fri Jan 25 15:01:07 2002
@@ -212,7 +212,7 @@
info->regs.__null_ds = 0;
info->regs.__null_es = 0;

-/* we are clearing fs,gs later just before "jmp ret_from_sys_call",
+/* we are clearing fs,gs later just before "jmp resume_userspace",
* because starting with Linux 2.1.x they aren't no longer saved/restored
*/

@@ -255,7 +255,7 @@
__asm__ __volatile__(
"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
"movl %0,%%esp\n\t"
- "jmp ret_from_sys_call"
+ "jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "b" (tsk) : "ax");
/* we never return here */
@@ -268,7 +268,7 @@
regs32 = save_v86_state(regs16);
regs32->eax = retval;
__asm__ __volatile__("movl %0,%%esp\n\t"
- "jmp ret_from_sys_call"
+ "jmp resume_userspace"
: : "r" (regs32), "b" (current));
}

diff -uNr linux-2.5.3-pre5/fs/lockd/svc.c linux-work-253p5/fs/lockd/svc.c
--- linux-2.5.3-pre5/fs/lockd/svc.c Tue Jan 22 09:05:58 2002
+++ linux-work-253p5/fs/lockd/svc.c Fri Jan 25 15:01:07 2002
@@ -304,7 +304,7 @@
* Wait for the lockd process to exit, but since we're holding
* the lockd semaphore, we can't wait around forever ...
*/
- current->sigpending = 0;
+ current->work.sigpending = 0;
interruptible_sleep_on_timeout(&lockd_exit, HZ);
if (nlmsvc_pid) {
printk(KERN_WARNING
diff -uNr linux-2.5.3-pre5/fs/nfsd/export.c linux-work-253p5/fs/nfsd/export.c
--- linux-2.5.3-pre5/fs/nfsd/export.c Tue Jan 22 09:05:58 2002
+++ linux-work-253p5/fs/nfsd/export.c Fri Jan 25 15:01:07 2002
@@ -468,7 +468,7 @@
return 0;
}

- current->sigpending = 0;
+ current->work.sigpending = 0;
want_lock++;
while (hash_count || hash_lock) {
interruptible_sleep_on(&hash_wait);
diff -uNr linux-2.5.3-pre5/include/asm-i386/signal.h linux-work-253p5/include/asm-i386/signal.h
--- linux-2.5.3-pre5/include/asm-i386/signal.h Thu Jan 24 14:53:26 2002
+++ linux-work-253p5/include/asm-i386/signal.h Fri Jan 25 15:05:45 2002
@@ -2,6 +2,7 @@
#define _ASMi386_SIGNAL_H

#include <linux/types.h>
+#include <linux/linkage.h>

/* Avoid too many header ordering problems. */
struct siginfo;
@@ -216,6 +217,8 @@
return word;
}

+extern int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
+
#endif /* __KERNEL__ */

#endif
diff -uNr linux-2.5.3-pre5/include/linux/init_task.h linux-work-253p5/include/linux/init_task.h
--- linux-2.5.3-pre5/include/linux/init_task.h Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/include/linux/init_task.h Fri Jan 25 15:12:17 2002
@@ -35,6 +35,14 @@
siglock: SPIN_LOCK_UNLOCKED \
}

+#define INIT_TASK_WORK \
+{ \
+ need_resched: 0, \
+ syscall_trace: 0, \
+ sigpending: 0, \
+ notify_resume: 0, \
+}
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -43,7 +51,7 @@
{ \
state: 0, \
flags: 0, \
- sigpending: 0, \
+ work: INIT_TASK_WORK, \
addr_limit: KERNEL_DS, \
exec_domain: &default_exec_domain, \
lock_depth: -1, \
diff -uNr linux-2.5.3-pre5/include/linux/sched.h linux-work-253p5/include/linux/sched.h
--- linux-2.5.3-pre5/include/linux/sched.h Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/include/linux/sched.h Fri Jan 25 15:05:45 2002
@@ -228,19 +228,29 @@

typedef struct prio_array prio_array_t;

+/* this struct must occupy one 32-bit chunk so that it can be read in one go */
+struct task_work {
+ __s8 need_resched;
+ __u8 syscall_trace; /* count of syscall interceptors */
+ __u8 sigpending;
+ __u8 notify_resume; /* request for notification on
+ userspace execution resumption */
+} __attribute__((packed));
+
struct task_struct {
/*
* offsets of these are hardcoded elsewhere - touch with care
*/
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
unsigned long flags; /* per process flags, defined below */
- int sigpending;
+ volatile struct task_work work;
+
mm_segment_t addr_limit; /* thread address space:
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
struct exec_domain *exec_domain;
- volatile long need_resched;
+ long __pad;
unsigned long ptrace;

int lock_depth; /* Lock depth */
@@ -381,7 +391,7 @@
*/

#define PT_PTRACED 0x00000001
-#define PT_TRACESYS 0x00000002
+#define PT_SYSCALLTRACE 0x00000002 /* T if syscall_trace is +1 for ptrace() */
#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */
#define PT_TRACESYSGOOD 0x00000008
#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */
@@ -564,12 +574,12 @@

static inline int signal_pending(struct task_struct *p)
{
- return (p->sigpending != 0);
+ return (p->work.sigpending != 0);
}

static inline int need_resched(void)
{
- return unlikely(current->need_resched != 0);
+ return unlikely(current->work.need_resched != 0);
}

static inline void cond_resched(void)
@@ -614,7 +624,7 @@

static inline void recalc_sigpending(struct task_struct *t)
{
- t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+ t->work.sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
}

/* True if we are on the alternate signal stack. */
diff -uNr linux-2.5.3-pre5/kernel/fork.c linux-work-253p5/kernel/fork.c
--- linux-2.5.3-pre5/kernel/fork.c Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/kernel/fork.c Fri Jan 25 15:01:07 2002
@@ -631,7 +631,7 @@
}
spin_lock_init(&p->alloc_lock);

- p->sigpending = 0;
+ p->work.sigpending = 0;
init_sigpending(&p->pending);

p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
@@ -756,7 +756,7 @@
* Let the child process run first, to avoid most of the
* COW overhead when the child exec()s afterwards.
*/
- current->need_resched = 1;
+ current->work.need_resched = 1;

fork_out:
return retval;
diff -uNr linux-2.5.3-pre5/kernel/sched.c linux-work-253p5/kernel/sched.c
--- linux-2.5.3-pre5/kernel/sched.c Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/kernel/sched.c Fri Jan 25 15:01:07 2002
@@ -176,9 +176,9 @@
{
int need_resched;

- need_resched = p->need_resched;
+ need_resched = p->work.need_resched;
wmb();
- p->need_resched = 1;
+ p->work.need_resched = 1;
if (!need_resched && (p->cpu != smp_processor_id()))
smp_send_reschedule(p->cpu);
}
@@ -483,7 +483,7 @@
this_rq->nr_running++;
enqueue_task(next, this_rq->active);
if (next->prio < current->prio)
- current->need_resched = 1;
+ current->work.need_resched = 1;
if (!idle && --imbalance) {
if (array == busiest->expired) {
array = busiest->active;
@@ -528,7 +528,7 @@
return idle_tick();
/* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
- p->need_resched = 1;
+ p->work.need_resched = 1;
return;
}
spin_lock(&rq->lock);
@@ -539,7 +539,7 @@
*/
if ((p->policy == SCHED_RR) && !--p->time_slice) {
p->time_slice = NICE_TO_TIMESLICE(p->__nice);
- p->need_resched = 1;
+ p->work.need_resched = 1;

/* put it at the end of the queue: */
dequeue_task(p, rq->active);
@@ -559,7 +559,7 @@
p->sleep_avg--;
if (!--p->time_slice) {
dequeue_task(p, rq->active);
- p->need_resched = 1;
+ p->work.need_resched = 1;
p->prio = effective_prio(p);
p->time_slice = NICE_TO_TIMESLICE(p->__nice);
enqueue_task(p, TASK_INTERACTIVE(p) ? rq->active : rq->expired);
@@ -622,7 +622,7 @@
next = list_entry(queue->next, task_t, run_list);

switch_tasks:
- prev->need_resched = 0;
+ prev->work.need_resched = 0;

if (likely(prev != next)) {
rq->nr_switches++;
@@ -1246,7 +1246,7 @@
current->prio = MAX_PRIO;
current->state = TASK_RUNNING;
double_rq_unlock(this_rq, rq);
- current->need_resched = 1;
+ current->work.need_resched = 1;
__restore_flags(flags);
}

diff -uNr linux-2.5.3-pre5/kernel/signal.c linux-work-253p5/kernel/signal.c
--- linux-2.5.3-pre5/kernel/signal.c Tue Jan 22 09:06:00 2002
+++ linux-work-253p5/kernel/signal.c Fri Jan 25 15:01:07 2002
@@ -105,7 +105,7 @@
void
flush_signals(struct task_struct *t)
{
- t->sigpending = 0;
+ t->work.sigpending = 0;
flush_sigqueue(&t->pending);
}

@@ -119,7 +119,7 @@
if (atomic_dec_and_test(&sig->count))
kmem_cache_free(sigact_cachep, sig);
}
- tsk->sigpending = 0;
+ tsk->work.sigpending = 0;
flush_sigqueue(&tsk->pending);
spin_unlock_irq(&tsk->sigmask_lock);
}
@@ -246,7 +246,7 @@
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
if (!(current->notifier)(current->notifier_data)) {
- current->sigpending = 0;
+ current->work.sigpending = 0;
return 0;
}
}
@@ -465,7 +465,7 @@
*/
static inline void signal_wake_up(struct task_struct *t)
{
- t->sigpending = 1;
+ t->work.sigpending = 1;

#ifdef CONFIG_SMP
/*
diff -uNr linux-2.5.3-pre5/net/sunrpc/sched.c linux-work-253p5/net/sunrpc/sched.c
--- linux-2.5.3-pre5/net/sunrpc/sched.c Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/net/sunrpc/sched.c Fri Jan 25 15:01:07 2002
@@ -1109,7 +1109,7 @@
unsigned long flags;

while (all_tasks) {
- current->sigpending = 0;
+ current->work.sigpending = 0;
rpc_killall_tasks(NULL);
__rpc_schedule();
if (all_tasks) {
@@ -1183,7 +1183,7 @@
* Usually rpciod will exit very quickly, so we
* wait briefly before checking the process id.
*/
- current->sigpending = 0;
+ current->work.sigpending = 0;
yield();
/*
* Display a message if we're going to wait longer.
diff -uNr linux-2.5.3-pre5/net/sunrpc/svc.c linux-work-253p5/net/sunrpc/svc.c
--- linux-2.5.3-pre5/net/sunrpc/svc.c Tue Jan 22 09:06:09 2002
+++ linux-work-253p5/net/sunrpc/svc.c Fri Jan 25 15:01:08 2002
@@ -185,7 +185,7 @@
progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port);

if (!port)
- current->sigpending = 0;
+ current->work.sigpending = 0;

for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)


2002-01-25 22:30:44

by Robert Love

Subject: Re: [PATCH] syscall latency improvement #1

On Fri, 2002-01-25 at 13:54, David Howells wrote:

> The attached patch does the following to 2.5.3-pre5:
>
> * consolidates various status items that are found in the lower reaches of
> task_struct into one 32-bit word, thus allowing them to be tested
> atomically without the need to disable interrupts in entry.S.
>
> * optimises the instructions in the system_call path in entry.S
>
> * frees up a hole in the bottom part of the task_struct (on the 1st cache
> line).
>
> * improves base syscall latency by approximately 5.4% (dual PIII) or 3.6%
> (dual Athlon) as measured by lmbench's "lat_syscall null" command against
> the vanilla kernel.

Mmm, I like it. Ingo Molnar talked to me about this (he wants such a
feature, too) earlier. This is a real win.

This patch is beneficial to the kernel preemption patch. I suspect
other future ideas could be added now without hurting the common case,
too. I had planned to roll need_resched into our preemption counter,
and I probably still can even with only 1 byte.

Anyhow, I'm looking it over -- good code.

Robert Love

2002-01-25 23:11:28

by Paul Mackerras

Subject: Re: [PATCH] syscall latency improvement #1

David Howells writes:

> The attached patch does the following to 2.5.3-pre5:
>
> * consolidates various status items that are found in the lower reaches of
> task_struct into one 32-bit word, thus allowing them to be tested
> atomically without the need to disable interrupts in entry.S.

I'm no expert on the x86 arch-specific code, but it looks to me like
you have misunderstood what is going on there. It seems to me that
you have opened up a race by testing the need_resched and sigpending
flags with interrupts enabled. The race is that you can test those
flags and find them false, and then get an interrupt that sets one or
other of those flags. You will then return to the usermode process
and not act on the need_resched or sigpending until the next
interrupt. So in fact you will reschedule / deliver the signal
eventually, but with extra latency.

In other words, the "atomic" in the comment is not about testing
need_resched and sigpending at the same time, but about making the
process of testing those flags and returning to usermode atomic.

Paul.

2002-01-26 00:40:42

by Linus Torvalds

Subject: Re: [PATCH] syscall latency improvement #1


On Fri, 25 Jan 2002, David Howells wrote:
>
> * improves base syscall latency by approximately 5.4% (dual PIII) or 3.6%
> (dual Athlon) as measured by lmbench's "lat_syscall null" command against
> the vanilla kernel.

Looking at the code, I suspect that 99.9% of this "improvement" comes from
one thing, and one thing only: you removed the "cli" in the system call
return path.

Yes, you probably removed two other instructions too, but those other
instructions should have paired beautifully and should not be noticeable
in benchmarks.

The thing is, that "cli" was there for a reason, and we should think hard
about removing it. Removing the cli introduces a race condition where we
can otherwise return to user mode with "need_resched" or "signal_pending"
set.

Now, I don't disagree at all with putting those flags into a common word
and optimizing some of the code to test them at the same time, but I _do_
mind bad benchmarking.

Do we care about the race? Possibly not. It doesn't cause data corruption
or anything like that, but it can cause, for example, real-time processes
not getting scheduled when they should have been, or signals being
delayed.

In short:

- Try removing the "cli" from the standard tree, and I bet you'll get
basically the same speedups.

- this "atomically return to user mode and test flags" thing needs to be
discussed. I'm personally inclined to think that that "cli" is really
needed, but 5% on simple system calls is a strong argument.

NOTE! There are potentially other ways to do all of this, _without_ losing
atomicity. For example, you can move the "flags" value into the slot saved
for the CS segment (which, modulo vm86, will always be at a constant
offset on the stack), and make CS=0 be the work flag. That will cause the
CPU to trap atomically at the "iret".

(I doubt that is a good approach, as the trap itself is going to be
_quite_ expensive, on the order of several hundreds of cycles, and we'd
basically slow down signal handling and rescheduling by that amount. I'm
pointing it out more as a clever - as opposed to intelligent - alternate
approach that doesn't lose atomicity).

Linus

2002-01-26 01:04:49

by Andrew Morton

Subject: Re: [PATCH] syscall latency improvement #1

Linus Torvalds wrote:
>
> On Fri, 25 Jan 2002, David Howells wrote:
> >
> > * improves base syscall latency by approximately 5.4% (dual PIII) or 3.6%
> > (dual Athlon) as measured by lmbench's "lat_syscall null" command against
> > the vanilla kernel.
>
> Looking at the code, I suspect that 99.9% of this "improvement" comes from
> one thing, and one thing only: you removed the "cli" in the system call
> return path.

Before the cli was in the stock kernel, I had added it in the
low-latency patch. Careful testing showed that it added
13 machine cycles to a system call on a P3.

> ...
>
> - this "atomically return to user mode and test flags" thing needs to be
> discussed. I'm personally inclined to think that that "cli" is really
> needed, but 5% on simple system calls is a strong argument.

Correctness first, please. I bet there are many ways in
which we can speed the kernel up by more than 13*n_syscalls.

<thinks of one>

On Intel hardware an open-coded duff-device memcpy is faster
than copy_to_user for all alignments except mod32->mod32.
Sometimes up to 25% faster.
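
For illustration, a minimal sketch of what an open-coded Duff's-device copy
loop looks like. This is just the generic technique, not akpm's actual patch,
and it omits the alignment games and user-access checks that a real
copy_to_user replacement would need:

#include <stdio.h>
#include <string.h>

/* Classic Duff's device: unroll the byte-copy loop eight ways and jump
 * into the middle of the unrolled body to handle the remainder. */
static void duff_copy(unsigned char *to, const unsigned char *from,
		      size_t count)
{
	size_t n = (count + 7) / 8;

	if (!count)
		return;
	switch (count % 8) {
	case 0: do { *to++ = *from++;
	case 7:      *to++ = *from++;
	case 6:      *to++ = *from++;
	case 5:      *to++ = *from++;
	case 4:      *to++ = *from++;
	case 3:      *to++ = *from++;
	case 2:      *to++ = *from++;
	case 1:      *to++ = *from++;
		} while (--n > 0);
	}
}

int main(void)
{
	unsigned char src[100], dst[100];

	memset(src, 0xAB, sizeof(src));
	duff_copy(dst, src, sizeof(src));
	printf("copies match: %d\n", memcmp(src, dst, sizeof(dst)) == 0);
	return 0;
}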

<thinks of another>

p = malloc(4096)
read(fd, p, 4096)

the kernel memsets the faulted-in page to zero and then
immediately copies the pagecache data onto it. Removal
of the memset speeds up read() by ~10%.

<thinks of another>

s/inline//g

<thinks of another>

ext[23] directory inode allocation policy (aargh
it's horrid)

<thinks of another>

page pre-zeroing in the idle thread.

There are many ways of speeding up the kernel. Let's
concentrate on the biggies.

> NOTE! There are potentially other ways to do all of this, _without_ losing
> atomicity. For example, you can move the "flags" value into the slot saved
> for the CS segment (which, modulo vm86, will always be at a constant
> offset on the stack), and make CS=0 be the work flag. That will cause the
> CPU to trap atomically at the "iret".

Ingo's low-latency patch put markers around the critical code section,
and inspected the return EIP on the way back out of the interrupt.
If it falls inside the racy region, do special stuff.

-

2002-01-26 01:21:27

by Linus Torvalds

Subject: Re: [PATCH] syscall latency improvement #1


On Fri, 25 Jan 2002, Andrew Morton wrote:
> >
> > Looking at the code, I suspect that 99.9% of this "improvement" comes from
> > one thing, and one thing only: you removed the "cli" in the system call
> > return path.
>
> Before the cli was in the stock kernel, I had added it in the
> low-latency patch. Careful testing showed that it added
> 13 machine cycles to a system call on a P3.

That sounds about right. The empty system call path is basically dominated
by the trap/iret costs, and is on the order of 200 cycles or so on most
CPU's. So 13 cycles would account for the roughly 5% improvement.

Linus

2002-01-26 01:55:27

by Linus Torvalds

Subject: Re: [PATCH] syscall latency improvement #1


On 26 Jan 2002, Andi Kleen wrote:
>
> It doesn't explain the Athlon speedups. On athlon cli is ~4 cycles.

.. and it probably serializes the instruction stream.

Look at the patch.

The _only_ thing it does for the system call path is:

- remove the "cli"

- change

cmpl $0,..
jne
cmp $0,..
jne

into

movl ..,reg
testl reg,reg
jne

and the latter may be worth a cycle (or two, if the CPU happens to like
the second form better for some other reason), but it's certainly not
noticeable.

A 3.4% improvement is equivalent to something like 9 cycles, so the "cli"
being faster on Athlon than on a PIII certainly explains why it's less
noticeable on the Athlon, but it still makes me suspect that the _real_
cost of the cli is on the order of 8 cycles.

It should be eminently testable. Just remove the cli from the standard
kernel, and do before-and-after tests.

Linus

2002-01-26 02:04:09

by Andi Kleen

Subject: Re: [PATCH] syscall latency improvement #1

On Fri, Jan 25, 2002 at 05:53:57PM -0800, Linus Torvalds wrote:
>
> On 26 Jan 2002, Andi Kleen wrote:
> >
> > It doesn't explain the Athlon speedups. On athlon cli is ~4 cycles.
>
> .. and it probably serializes the instruction stream.

I have word from AMD engineering that it doesn't stall the pipeline
or serialize.

(I asked the question during the design of the x86-64 syscall code)

-Andi

2002-01-26 02:15:53

by Linus Torvalds

Subject: Re: [PATCH] syscall latency improvement #1


On Sat, 26 Jan 2002, Andi Kleen wrote:
> On Fri, Jan 25, 2002 at 05:53:57PM -0800, Linus Torvalds wrote:
> >
> > On 26 Jan 2002, Andi Kleen wrote:
> > >
> > > It doesn't explain the Athlon speedups. On athlon cli is ~4 cycles.
> >
> > .. and it probably serializes the instruction stream.
>
> I have word from AMD engineering that it doesn't stall the pipeline
> or serializes.

Note that it may not be the "cli" itself - the "iret" may be slower if it
has to enable interrupts that were disabled before. Ie the iret microcode
may have the equivalent of

/* Did eflags change? */
if ((new_eflags ^ old_eflags) & IF_MASK)
.. do sti/cli as appropriate ..

which would mean that the "cli" itself may take 4 cycles, but the "sti"
implicit in the iret will _also_ take 4 cycles and is optimized away when
not needed.

Which would add up to the 8 cycles needed for a ~3.4% speedup (this is
assuming the baseline is something like 250 cycles per system call, I've
not checked that assumption).

Linus

2002-01-26 02:18:37

by Andrew Morton

Subject: Re: [PATCH] syscall latency improvement #1

Linus Torvalds wrote:
>
> It should be eminently testable. Just remove the cli from the standard
> kernel, and do before-and-after tests.
>

#include <unistd.h>

main()
{
	int i = 100 * 1000 * 1000;

	while (i--)
		getpid();
}

With cli:
./a.out 22.05s user 15.31s system 99% cpu 37.361 total

without cli:
./a.out 18.29s user 17.42s system 99% cpu 35.731 total


That's 4.6%. Intel P3.

It's also 306 cycles per iteration. So the cli added 14 cycles.
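
For what it's worth, a minimal sketch of the arithmetic behind those figures
(the clock speed isn't stated above, so it is back-derived from the quoted
306 cycles per iteration):

#include <stdio.h>

int main(void)
{
	const double iters       = 100e6;	/* getpid() calls */
	const double with_cli    = 37.361;	/* seconds, total */
	const double without_cli = 35.731;	/* seconds, total */
	const double cycles_per_iter = 306.0;	/* quoted above */

	double ns_per_iter = without_cli / iters * 1e9;		/* ~357 ns */
	double clock_hz    = cycles_per_iter / (ns_per_iter * 1e-9);
	double delta_ns    = (with_cli - without_cli) / iters * 1e9;

	printf("implied clock: %.0f MHz\n", clock_hz / 1e6);
	printf("cli cost:      %.1f ns (~%.0f cycles)\n",
	       delta_ns, delta_ns * 1e-9 * clock_hz);
	printf("slowdown:      %.1f%%\n",
	       (with_cli - without_cli) / without_cli * 100.0);
	return 0;
}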

-

2002-01-26 02:27:16

by Andi Kleen

Subject: Re: [PATCH] syscall latency improvement #1

On Fri, Jan 25, 2002 at 06:14:25PM -0800, Linus Torvalds wrote:
>
> On Sat, 26 Jan 2002, Andi Kleen wrote:
> > On Fri, Jan 25, 2002 at 05:53:57PM -0800, Linus Torvalds wrote:
> > >
> > > On 26 Jan 2002, Andi Kleen wrote:
> > > >
> > > > It doesn't explain the Athlon speedups. On athlon cli is ~4 cycles.
> > >
> > > .. and it probably serializes the instruction stream.
> >
> > I have word from AMD engineering that it doesn't stall the pipeline
> > or serializes.
>
> Note that it may not be the "cli" itself - the "iret" may be slower if it
> has to enable interrupts that were disabled before. Ie the iret microcode
> may have the equivalent of

[...]

Yes, that could explain it. I ignored it on x86-64 because it always uses
SYSCALL/SYSRET (at least for 64bit) :)

The real fix for that would be support of SYSENTER/SYSCALL on 32bit too
(more likely SYSENTER because it's supported by Athlons and SYSCALL is too
broken on K6 to be usable)

An int $0x80 does an awful lot of locked cycles, for example, and IRET is
also not exactly a speed daemon and is very complex.

SYSENTER/SYSEXIT would likely be a much bigger win than nano-optimizations of
a few cycles around this.
-Andi

2002-01-26 02:39:46

by Dave Jones

Subject: Re: [PATCH] syscall latency improvement #1

On Sat, 26 Jan 2002, Andi Kleen wrote:

> The real fix for that would be support of SYSENTER/SYSCALL on 32bit too
> (more likely SYSENTER because it's supported by Athlons and SYSCALL is too
> broken on K6 to be usable)

There's an implementation at http://fy.chalmers.se/~appro/linux/sysenter.c
Haven't looked at it long enough to see how good/bad it is..

--
| Dave Jones. http://www.codemonkey.org.uk
| SuSE Labs

2002-01-26 02:46:36

by Davide Libenzi

Subject: Re: [PATCH] syscall latency improvement #1

On Fri, 25 Jan 2002, Linus Torvalds wrote:

>
> On Sat, 26 Jan 2002, Andi Kleen wrote:
> > On Fri, Jan 25, 2002 at 05:53:57PM -0800, Linus Torvalds wrote:
> > >
> > > On 26 Jan 2002, Andi Kleen wrote:
> > > >
> > > > It doesn't explain the Athlon speedups. On athlon cli is ~4 cycles.
> > >
> > > .. and it probably serializes the instruction stream.
> >
> > I have word from AMD engineering that it doesn't stall the pipeline
> > or serializes.
>
> Note that it may not be the "cli" itself - the "iret" may be slower if it
> has to enable interrupts that were disabled before. Ie the iret microcode
> may have the equivalent of
>
> /* Did eflags change? */
> if ((new_eflags ^ old_eflags) & IF_MASK)
> .. do sti/cli as appropriate ..
>
> which would mean that the "cli" itself may take 4 cycles, but the "sti"
> implicit in the iret will _also_ take 4 cycles and is optimized away when
> not needed.
>
> Which would add up to the 8 cycles needed for a ~3.4% speedup (this is
> assuming the baseline is something like 250 cycles per system call, I've
> not checked that assumption).

Guys, why don't you use rdtsc to discover where the perf improvement comes from?
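
A minimal user-space sketch of such a measurement (it assumes gcc-style inline
asm on x86 and that getpid() really enters the kernel on every call, as it did
with the libc of the day; the loop count is arbitrary):

#include <stdio.h>
#include <unistd.h>

static inline unsigned long long rdtsc(void)
{
	unsigned int lo, hi;

	__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
	return ((unsigned long long) hi << 32) | lo;
}

int main(void)
{
	enum { LOOPS = 1000000 };
	unsigned long long start, end;
	int i;

	start = rdtsc();
	for (i = 0; i < LOOPS; i++)
		getpid();
	end = rdtsc();

	printf("~%llu cycles per getpid()\n", (end - start) / LOOPS);
	return 0;
}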




- Davide


2002-01-26 03:02:08

by Robert Love

Subject: Re: [PATCH] syscall latency improvement #1

On Fri, 2002-01-25 at 21:10, Andrew Morton wrote:

> With cli:
> ./a.out 22.05s user 15.31s system 99% cpu 37.361 total
>
> without cli:
> ./a.out 18.29s user 17.42s system 99% cpu 35.731 total
>
>
> That's 4.6%. Intel P3.

Same program, AMD Athlon MP 1600 (booted UP), kernel 2.5.3-pre5.

with cli: real 0m19.706s user 0m11.400s sys 0m8.290s
without cli: real 0m19.449s user 0m10.630s sys 0m8.820s

That is a 1.3% improvement.

Robert Love

2002-01-26 03:16:11

by Robert Love

Subject: Re: [PATCH] syscall latency improvement #1

On Fri, 2002-01-25 at 22:06, Robert Love wrote:

> Same program, AMD Athlon MP 1600 (booted UP), kernel 2.5.3-pre5.
>
> with cli: real 0m19.706s user 0m11.400s sys 0m8.290s
> without cli: real 0m19.449s user 0m10.630s sys 0m8.820s
>
> That is 1.3% improvement.

And let me add, with David Howells' patch:

patch: real 0m19.305s user 0m8.130s sys 0m11.180s

Which is 0.7% faster than without cli, and 2.07% faster than the stock
(with cli) kernel. This is the average of three runs, btw.

Robert Love

2002-01-26 01:24:47

by Andi Kleen

Subject: Re: [PATCH] syscall latency improvement #1

Linus Torvalds <[email protected]> writes:

> Looking at the code, I suspect that 99.9% of this "improvement" comes from
> one thing, and one thing only: you removed the "cli" in the system call
> return path.

It doesn't explain the Athlon speedups. On athlon cli is ~4 cycles.

-Andi

2002-01-26 04:04:48

by Jamie Lokier

Subject: Re: [PATCH] syscall latency improvement #1

Andrew Morton wrote:
> > NOTE! There are potentially other ways to do all of this, _without_ losing
> > atomicity. For example, you can move the "flags" value into the slot saved
> > for the CS segment (which, modulo vm86, will always be at a constant
> > offset on the stack), and make CS=0 be the work flag. That will cause the
> > CPU to trap atomically at the "iret".
>
> Ingo's low-latency patch put markers around the critical code section,
> and inspected the return EIP on the way back out of the interrupt.
> If it falls inside the racy region, do special stuff.

Latency tests showed that approach fixed the problem as well as the cli did. It's
just _much_ uglier to read, is all.

Although it saves the cli from syscalls and interrupts, it adds back a
small cost to interrupts. Fortunately, syscall latency is far more
important than interrupt latency.

If we're going to micro-optimise the system calls, then markers are
definitely the way to fix the return path race IMHO.

-- Jamie

2002-01-26 10:08:19

by Nigel Gamble

Subject: Re: [PATCH] syscall latency improvement #1

On 25 Jan 2002, Robert Love wrote:
> Mmm, I like it. Ingo Molnar talked to me about this (he wants such a
> feature, too) earlier. This is a real win.
>
> This patch is beneficial to the kernel preemption patch.

Note that with a fully preemptible kernel, there is no need to test
need_resched on return from system call, since any needed reschedule
should already have been done. If the need_resched was set by an
interrupt handler, the preempt_schedule on return from interrupt (or on
exit from a non-preemptible region) will have done the reschedule. And if
need_resched was set because one process woke up another (higher
priority) process, we can do the schedule() immediately, unless we are
in a non-preemptible region in which case it will happen on exit from
that region. I don't think we do an immediate schedule on wakeup yet
but, with the existing preemption patch, that would make the test on
syscall exit completely redundant (which may enable the cli instruction
to be safely removed).

Nigel Gamble [email protected]
Mountain View, CA, USA. http://www.nrg.org/

2002-01-26 18:27:41

by Alan

Subject: Re: [PATCH] syscall latency improvement #1

> NOTE! There are potentially other ways to do all of this, _without_ losing
> atomicity. For example, you can move the "flags" value into the slot saved
> for the CS segment (which, modulo vm86, will always be at a constant
> offset on the stack), and make CS=0 be the work flag. That will cause the
> CPU to trap atomically at the "iret".

Is the test even needed? Suppose we instead patch the return stack when we
set need_resched/sigpending, and do it on the rare occasion we set the
value.

2002-01-27 20:04:23

by Jamie Lokier

Subject: Re: [PATCH] syscall latency improvement #1

Alan Cox wrote:
> > NOTE! There are potentially other ways to do all of this, _without_ losing
> > atomicity. For example, you can move the "flags" value into the slot saved
> > for the CS segment (which, modulo vm86, will always be at a constant
> > offset on the stack), and make CS=0 be the work flag. That will cause the
> > CPU to trap atomically at the "iret".
>
> Is the test even needed. Suppose we instead patch the return stack if we
> set need_resched/sigpending, and do it on the rare occassion we set the
> value.

Yes, that's what you'll find in one of Ingo Molnar's low latency patches.

-- Jamie

2002-01-28 10:20:17

by Denis Vlasenko

Subject: Re: [PATCH] syscall latency improvement #1

> <thinks of another>
>
> s/inline//g

I like this.

Plain grep turned up hundreds of 'inline' hits (I didn't even count).
I'd like to write a script which will narrow it to:

.... inline .... {
more than 2 lines
}

But my knowledge of grep/sed/perl/awk/??? magic is really not up to the task.

Plan:
* replace those with big_inline
* #define it to 'inline' or to '' (nothing) and compare kernel sizes
* make it a CONFIG_xxx option if it's worth the trouble
--
vda

2002-01-28 10:38:01

by Andrew Morton

Subject: Re: [PATCH] syscall latency improvement #1

Denis Vlasenko wrote:
>
> > <thinks of another>
> >
> > s/inline//g
>
> I like this.

Well, it's a fairly small optimisation, but it's easy.

I did a patch a while back: http://www.zip.com.au/~akpm/linux/2.4/2.4.17-pre1/inline.patch
This is purely against core kernel files:

drivers/block/ll_rw_blk.c | 18 ++-------------
fs/binfmt_elf.c | 4 +--
fs/block_dev.c | 2 -
fs/dcache.c | 8 +++---
fs/inode.c | 6 ++---
fs/locks.c | 8 +++---
fs/namei.c | 14 ++++++------
fs/namespace.c | 42 ++++++++++++++++++++++++++++++++++++
fs/open.c | 4 +--
fs/read_write.c | 2 -
fs/stat.c | 2 -
fs/super.c | 2 -
include/linux/fs_struct.h | 53 +++-------------------------------------------
kernel/exit.c | 10 ++++----
kernel/fork.c | 4 +--
kernel/module.c | 2 -
kernel/sched.c | 6 ++---
kernel/signal.c | 3 --
kernel/sys.c | 2 -
kernel/timer.c | 2 -
lib/rwsem.c | 4 +--
mm/filemap.c | 4 +--
mm/highmem.c | 2 -
mm/memory.c | 2 -
mm/mmap.c | 4 +--
mm/slab.c | 14 ++++++------

And it reduces the kernel image by 11 kbytes. That's not much
RAM, but it's a lot of cache. It's almost all hot-path stuff.

> ...
> * replace those with big_inline
> * #define it to 'inline' or to '' (nothing) and compare kernel sizes
> * make it CONFIG_xxx option if it worth the trouble

The first patch should be against Documentation/CodingStyle.
What are we trying to achieve here? What are the guidelines
for when-to and when-to-not? I'd say:

- If a function has a single call site and is static then it
is always correct to inline.

- If a function is very small (20-30 bytes) then inlining
is correct even if it has many call sites.

- If a function is less-small, and has only one or two
*commonly called* call sites, then inlining is OK.

- If a function is a leaf function, then it is more inlinable
than a function which makes another function call.

fs/inode.c:__sync_one() violates all the above quite
outrageously :)
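
A tiny compilable illustration of those guidelines (all names invented, nothing
to do with any real kernel code):

#include <stdio.h>

struct buf {
	unsigned int flags;
	unsigned int len;
};
#define BUF_DIRTY 0x1u

/* Tiny leaf with many potential callers: cheaper inlined than called. */
static inline int buf_is_dirty(const struct buf *b)
{
	return (b->flags & BUF_DIRTY) != 0;
}

/* Larger, calls other code, called from many places: keep it out of line
 * so each call site costs one "call" instead of a full copy of the body. */
static int buf_flush(struct buf *b)
{
	if (!buf_is_dirty(b))
		return 0;
	/* ... pretend the actual write-out happens here ... */
	b->flags &= ~BUF_DIRTY;
	b->len = 0;
	return 1;
}

int main(void)
{
	struct buf b = { BUF_DIRTY, 4096 };

	printf("flushed: %d\n", buf_flush(&b));
	return 0;
}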

-

2002-01-28 15:26:20

by Jeff Dike

Subject: Re: [PATCH] syscall latency improvement #1

[email protected] said:
> - If a function is very small (20-30 bytes) then inlining
> is correct even if it has many call sites.

- Or if it collapses to something very small due to constant propagation
and other optimizations.

Maybe this was intended, but I think it's worth making it explicit.

Jeff

2002-01-29 00:53:41

by Rusty Russell

Subject: Re: [PATCH] syscall latency improvement #1

On Mon, 28 Jan 2002 02:30:04 -0800
Andrew Morton <[email protected]> wrote:

> Denis Vlasenko wrote:
> >
> > > <thinks of another>
> > >
> > > s/inline//g
> >
> > I like this.
>
> Well, it's a fairly small optimisation, but it's easy.
>
> I did a patch a while back: http://www.zip.com.au/~akpm/linux/2.4/2.4.17-pre1/inline.patch
> This is purely against core kernel files:

You *really* want to do the headers, too. As a hack you could move all the
inlines into kernel/inline.c, and see what happens then.

It's the headers that make it messy as a config option (you don't want
non-inline functions in your .h file, because having multiple copies loses
the cache advantage).

Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-01-29 09:01:11

by Denis Vlasenko

Subject: Re: [PATCH] syscall latency improvement #1

On 28 January 2002 08:30, Andrew Morton wrote:
> > > s/inline//g
> >
> > I like this.
>
> Well, it's a fairly small optimisation, but it's easy.
>
> I did a patch a while back:
> http://www.zip.com.au/~akpm/linux/2.4/2.4.17-pre1/inline.patch This is
> purely against core kernel files:

[snip]

How did you find the big inlines? Care to share the hunter script with me (+ lkml)?

> The first patch should be against Documentation/CodingStyle.
> What are we trying to achieve here? What are the guidelines
> for when-to and when-to-not? I'd say:
>
> - If a function has a single call site and is static then it
> is always correct to inline.

And what if later you (or someone else!) add another call site? You may forget
to remove the inline. It adds maintenance trouble while not buying much speed:
if the func is big, the inline gains are small; if it's small, it should be
inlined regardless of the number of call sites.

> - If a function is very small (20-30 bytes) then inlining
> is correct even if it has many call sites.

Small in asm code, _not_ in C. Sometimes seemingly small C explodes into
horrendous asm.

> - If a function is less-small, and has only one or two
> *commonly called* call sites, then inlining is OK.

How much less small? We can have both:

inline int f_inlined() { ... }
int f() { return f_inlined(); }
...
f_inlined(); /* we need speed here */
f(); /* we save space here */

> - If a function is a leaf function, then it is more inlinable
> than a function which makes another function call.

100%. Especially when some "cute small functions" called inside happen to be
inlined too (or are macros).

> fs/inode.c:__sync_one() violates all the above quite
> outrageously :)

Somehow I feel there are more :-)
--
vda

2002-01-31 20:37:03

by Pavel Machek

Subject: Re: [PATCH] syscall latency improvement #1

Hi!

> The first patch should be against Documentation/CodingStyle.
> What are we trying to achieve here? What are the guidelines
> for when-to and when-to-not? I'd say:
>
> - If a function has a single call site and is static then it
> is always correct to inline.

Yep, but gcc should figure this out itself. No point helping it.

Pavel
--
Philips Velo 1: 1"x4"x8", 300gram, 60, 12MB, 40bogomips, linux, mutt,
details at http://atrey.karlin.mff.cuni.cz/~pavel/velo/index.html.

2002-02-21 07:10:22

by Cameron Simpson

Subject: Re: syscall latency improvement #1

On 10:59 29 Jan 2002, Denis Vlasenko <[email protected]> wrote:
| > - If a function has a single call site and is static then it
| > is always correct to inline.

I'm thinking: any decent compiler will inline this on its own, _without_
an inline keyword. So _don't_ use inline here.

| And what if later you (or someone else!) add another call? You may forget to
| remove inline. It adds maintenance trouble while not buying much of speed:

Indeed. And handled by the null case of "no inline" used above - the
compiler will get this right if you leave out the inline keywords,
while adding them causes the above issue.

| if func is big, inline gains are small, if it's small, it should be inlined
| regardless of number of call sites.

Wasn't that case #2? Inline when func < some small number of bytes?
--
Cameron Simpson, DoD#743 [email protected] http://www.zip.com.au/~cs/

My father was a despatch rider during the last war. He rode BSAs but, for
reasons I still don't understand, he never bothered to tell me that they
were useless, unreliable piles of shit. - Grant Roff, _Two Wheels_ Nov96