2002-01-11 10:33:41

by Rainer Keller

Subject: [PATCH] Small optimization for UP in sched and prefetch

diff -ur linux-2.4.17/include/asm-i386/processor.h linux-2.4.17-mine/include/asm-i386/processor.h
--- linux-2.4.17/include/asm-i386/processor.h Thu Nov 22 20:46:19 2001
+++ linux-2.4.17-mine/include/asm-i386/processor.h Fri Jan 11 11:31:49 2002
@@ -478,8 +478,8 @@

#define cpu_relax() rep_nop()

-/* Prefetch instructions for Pentium III and AMD Athlon */
-#ifdef CONFIG_MPENTIUMIII
+/* Prefetch instructions for Pentium III, Pentium 4 and AMD Athlon */
+#if defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUM4)

#define ARCH_HAS_PREFETCH
extern inline void prefetch(const void *x)
@@ -502,7 +502,12 @@
{
__asm__ __volatile__ ("prefetchw (%0)" : : "r"(x));
}
-#define spin_lock_prefetch(x) prefetchw(x)
+
+#ifndef CONFIG_SMP
+#define spin_lock_prefetch(x) do { } while(0)
+#else
+#define spin_lock_prefetch(x) prefetchw(x)
+#endif

#endif

diff -ur linux-2.4.17/include/linux/prefetch.h linux-2.4.17-mine/include/linux/prefetch.h
--- linux-2.4.17/include/linux/prefetch.h Thu Nov 22 20:46:19 2001
+++ linux-2.4.17-mine/include/linux/prefetch.h Thu Jan 10 13:21:39 2002
@@ -10,6 +10,7 @@
#ifndef _LINUX_PREFETCH_H
#define _LINUX_PREFETCH_H

+#include <linux/config.h>
#include <asm/processor.h>
#include <asm/cache.h>

@@ -26,7 +27,9 @@

prefetch(x) - prefetches the cacheline at "x" for read
prefetchw(x) - prefetches the cacheline at "x" for write
- spin_lock_prefetch(x) - prefectches the spinlock *x for taking
+ spin_lock_prefetch(x) - prefetches the spinlock *x for taking,
+ if on SMP, otherwise not needed
+ (except for debugging reasons -- slow anyway).

there is also PREFETCH_STRIDE which is the architecure-prefered
"lookahead" size for prefetching streamed operations.
@@ -50,7 +53,11 @@

#ifndef ARCH_HAS_SPINLOCK_PREFETCH
#define ARCH_HAS_SPINLOCK_PREFETCH
+#ifndef CONFIG_SMP
+#define spin_lock_prefetch(x) do { } while(0)
+#else
#define spin_lock_prefetch(x) prefetchw(x)
+#endif
#endif

#ifndef PREFETCH_STRIDE
diff -ur linux-2.4.17/kernel/sched.c linux-2.4.17-mine/kernel/sched.c
--- linux-2.4.17/kernel/sched.c Fri Dec 21 18:42:04 2001
+++ linux-2.4.17-mine/kernel/sched.c Fri Jan 11 11:30:43 2002
@@ -117,11 +117,13 @@
#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
#define can_schedule(p,cpu) \
((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
+#define processor_of_tsk(tsk) (tsk)->processor

#else

#define idle_task(cpu) (&init_task)
#define can_schedule(p,cpu) (1)
+#define processor_of_tsk(tsk) (0)

#endif

@@ -172,7 +174,7 @@
#ifdef CONFIG_SMP
/* Give a largish advantage to the same processor... */
/* (this is equivalent to penalizing other processors) */
- if (p->processor == this_cpu)
+ if (processor_of_tsk(p) == this_cpu)
weight += PROC_CHANGE_PENALTY;
#endif

@@ -221,7 +223,7 @@
* shortcut if the woken up task's last CPU is
* idle now.
*/
- best_cpu = p->processor;
+ best_cpu = processor_of_tsk(p);
if (can_schedule(p, best_cpu)) {
tsk = idle_task(best_cpu);
if (cpu_curr(best_cpu) == tsk) {
@@ -295,18 +297,18 @@
tsk = target_tsk;
if (tsk) {
if (oldest_idle != -1ULL) {
- best_cpu = tsk->processor;
+ best_cpu = processor_of_tsk(tsk);
goto send_now_idle;
}
tsk->need_resched = 1;
- if (tsk->processor != this_cpu)
- smp_send_reschedule(tsk->processor);
+ if (processor_of_tsk(tsk) != this_cpu)
+ smp_send_reschedule(processor_of_tsk(tsk));
}
return;


#else /* UP */
- int this_cpu = smp_processor_id();
+ const int this_cpu = smp_processor_id();
struct task_struct *tsk;

tsk = cpu_curr(this_cpu);
@@ -559,7 +561,7 @@
if (!current->active_mm) BUG();
need_resched_back:
prev = current;
- this_cpu = prev->processor;
+ this_cpu = processor_of_tsk(prev);

if (unlikely(in_interrupt())) {
printk("Scheduling in interrupt\n");
@@ -1311,7 +1313,7 @@
}
sched_data->curr = current;
sched_data->last_schedule = get_cycles();
- clear_bit(current->processor, &wait_init_idle);
+ clear_bit(processor_of_tsk(current), &wait_init_idle);
}

extern void init_timervecs (void);


Attachments:
patch_prefetch_sched-2.4.17_second.diff (4.00 kB)
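
The sched.c half of the patch relies on processor_of_tsk() collapsing to the
constant 0 on uniprocessor builds, so the field load and the cross-CPU
comparisons fold away at compile time; likewise spin_lock_prefetch() becomes a
no-op because a UP spinlock (without debugging) expands to nothing, so there is
no lock word worth warming. A minimal standalone sketch of the constant-folding
effect, written as an ordinary user-space program; the struct, helper and file
names are hypothetical and only stand in for the kernel's:

/* up_fold_demo.c - illustration only, not kernel code. */
#include <stdio.h>

struct task { int processor; };

#ifdef CONFIG_SMP
#define processor_of_tsk(tsk)	((tsk)->processor)	/* SMP: real field lookup */
#else
#define processor_of_tsk(tsk)	(0)			/* UP: always CPU 0, a compile-time constant */
#endif

static void send_reschedule(int cpu)
{
	printf("reschedule IPI to cpu %d\n", cpu);
}

int main(void)
{
	struct task t = { 0 };
	int this_cpu = 0;

	/* Without CONFIG_SMP the condition below is the constant 0 != 0, so
	 * the call is dead code and the compiler drops it together with the
	 * load of t.processor. */
	if (processor_of_tsk(&t) != this_cpu)
		send_reschedule(processor_of_tsk(&t));

	return 0;
}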

2002-01-11 10:51:35

by Jeff Garzik

Subject: Re: [PATCH] Small optimization for UP in sched and prefetch

Rainer Keller wrote:
> PS: Because of the use of prefetch in include/linux/list.h, the memory
> prefetch is triggered 137 times in my configuration...

We need to merge __builtin_prefetch support into the kernel, because
gcc 3.1 recently got support for it. It would be nice, at least for
future prefetch-related patches, to perhaps call __builtin_prefetch and
have the headers substitute a prefetch if the compiler does not support
it.

Jeff



--
Jeff Garzik | Alternate titles for LOTR:
Building 1024 | Fast Times at Uruk-Hai
MandrakeSoft | The Took, the Elf, His Daughter and Her Lover
| Samwise Gamgee: International Hobbit of Mystery
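
A minimal sketch of the substitution described above, assuming a GCC version
check along the lines of the one take 3 adds later in this thread; the
prefetchnta mnemonic on the fallback read side is an assumption rather than a
quote of the kernel header, and the file name is hypothetical:

/* prefetch_fallback.c - illustrative sketch, not the kernel's header. */
#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
/* gcc 3.1+ knows __builtin_prefetch(addr, rw); it emits a real prefetch
 * instruction only when the selected -march has one, otherwise nothing. */
#define prefetch(x)	__builtin_prefetch((x), 0)	/* for read */
#define prefetchw(x)	__builtin_prefetch((x), 1)	/* for write */
#else
/* Older compilers: hand-written instructions, guarded in the real header
 * by the CPU config options as in the patch above. */
static inline void prefetch(const void *x)
{
	__asm__ __volatile__ ("prefetchnta (%0)" : : "r" (x));
}
static inline void prefetchw(const void *x)
{
	__asm__ __volatile__ ("prefetchw (%0)" : : "r" (x));
}
#endif

int main(void)
{
	static int buf[1024];

	prefetch(&buf[0]);	/* hint: about to read */
	prefetchw(&buf[512]);	/* hint: about to write */
	return buf[0] + buf[512];
}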

2002-01-11 17:07:43

by Rainer Keller

Subject: Re: [PATCH] Small optimization for UP in sched and prefetch (take 3)

diff -ur linux-2.4.17/include/asm-i386/processor.h linux-2.4.17-mine/include/asm-i386/processor.h
--- linux-2.4.17/include/asm-i386/processor.h Thu Nov 22 20:46:19 2001
+++ linux-2.4.17-mine/include/asm-i386/processor.h Fri Jan 11 13:27:30 2002
@@ -478,8 +478,14 @@

#define cpu_relax() rep_nop()

-/* Prefetch instructions for Pentium III and AMD Athlon */
-#ifdef CONFIG_MPENTIUMIII
+/*
+ * If we don't have a compiler which offers __builtin_prefetch, do it
+ * ourselves, provided the processor supports it.
+ */
+#ifndef HAVE_builtin_prefetch
+
+/* Prefetch instructions for Pentium III, Pentium 4 and AMD Athlon */
+#if defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUM4)

#define ARCH_HAS_PREFETCH
extern inline void prefetch(const void *x)
@@ -502,8 +508,14 @@
{
__asm__ __volatile__ ("prefetchw (%0)" : : "r"(x));
}
-#define spin_lock_prefetch(x) prefetchw(x)

+#ifndef CONFIG_SMP
+#define spin_lock_prefetch(x) do { } while(0)
+#else
+#define spin_lock_prefetch(x) prefetchw(x)
#endif
+
+#endif /* CONFIG_MPENTIUMIII || CONFIG_MPENTIUM4 */
+#endif /* HAVE_builtin_prefetch */

#endif /* __ASM_I386_PROCESSOR_H */
diff -ur linux-2.4.17/include/linux/compiler.h linux-2.4.17-mine/include/linux/compiler.h
--- linux-2.4.17/include/linux/compiler.h Tue Sep 18 23:12:45 2001
+++ linux-2.4.17-mine/include/linux/compiler.h Fri Jan 11 13:19:40 2002
@@ -1,6 +1,8 @@
#ifndef __LINUX_COMPILER_H
#define __LINUX_COMPILER_H

+#include <linux/config.h>
+
/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
a mechanism by which the user can annotate likely branch directions and
expect the blocks to be reordered appropriately. Define __builtin_expect
@@ -12,5 +14,24 @@

#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
+
+
+/*
+ * Starting with GCC 3.1, __builtin_prefetch support was added, i.e. we're
+ * no longer dependent on the information in include/asm/processor.h.
+ */
+#if __GNUC__ == 3 && __GNUC_MINOR__ >= 1
+#define HAVE_builtin_prefetch
+
+#define prefetch(x) __builtin_prefetch((x))
+#define prefetchw(x) __builtin_prefetch((x), 2)
+
+#ifndef CONFIG_SMP
+#define spin_lock_prefetch(x) do { } while(0)
+#else
+#define spin_lock_prefetch(x) prefetchw(x)
+#endif
+
+#endif

#endif /* __LINUX_COMPILER_H */
diff -ur linux-2.4.17/include/linux/prefetch.h linux-2.4.17-mine/include/linux/prefetch.h
--- linux-2.4.17/include/linux/prefetch.h Thu Nov 22 20:46:19 2001
+++ linux-2.4.17-mine/include/linux/prefetch.h Fri Jan 11 12:44:30 2002
@@ -10,6 +10,8 @@
#ifndef _LINUX_PREFETCH_H
#define _LINUX_PREFETCH_H

+#include <linux/config.h>
+#include <linux/compiler.h>
#include <asm/processor.h>
#include <asm/cache.h>

@@ -26,7 +28,9 @@

prefetch(x) - prefetches the cacheline at "x" for read
prefetchw(x) - prefetches the cacheline at "x" for write
- spin_lock_prefetch(x) - prefectches the spinlock *x for taking
+ spin_lock_prefetch(x) - prefetches the spinlock *x for taking,
+ if on SMP, otherwise not needed
+ (except for debugging reasons -- slow anyway).

there is also PREFETCH_STRIDE which is the architecure-prefered
"lookahead" size for prefetching streamed operations.
@@ -37,7 +41,9 @@
* These cannot be do{}while(0) macros. See the mental gymnastics in
* the loop macro.
*/
-
+
+#ifndef HAVE_builtin_prefetch
+
#ifndef ARCH_HAS_PREFETCH
#define ARCH_HAS_PREFETCH
static inline void prefetch(const void *x) {;}
@@ -50,11 +56,17 @@

#ifndef ARCH_HAS_SPINLOCK_PREFETCH
#define ARCH_HAS_SPINLOCK_PREFETCH
+#ifndef CONFIG_SMP
+#define spin_lock_prefetch(x) do { } while(0)
+#else
#define spin_lock_prefetch(x) prefetchw(x)
#endif
+#endif

#ifndef PREFETCH_STRIDE
#define PREFETCH_STRIDE (4*L1_CACHE_BYTES)
#endif
+
+#endif /* HAVE_builtin_prefetch */

#endif
diff -ur linux-2.4.17/kernel/sched.c linux-2.4.17-mine/kernel/sched.c
--- linux-2.4.17/kernel/sched.c Fri Dec 21 18:42:04 2001
+++ linux-2.4.17-mine/kernel/sched.c Fri Jan 11 11:30:43 2002
@@ -117,11 +117,13 @@
#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
#define can_schedule(p,cpu) \
((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
+#define processor_of_tsk(tsk) (tsk)->processor

#else

#define idle_task(cpu) (&init_task)
#define can_schedule(p,cpu) (1)
+#define processor_of_tsk(tsk) (0)

#endif

@@ -172,7 +174,7 @@
#ifdef CONFIG_SMP
/* Give a largish advantage to the same processor... */
/* (this is equivalent to penalizing other processors) */
- if (p->processor == this_cpu)
+ if (processor_of_tsk(p) == this_cpu)
weight += PROC_CHANGE_PENALTY;
#endif

@@ -221,7 +223,7 @@
* shortcut if the woken up task's last CPU is
* idle now.
*/
- best_cpu = p->processor;
+ best_cpu = processor_of_tsk(p);
if (can_schedule(p, best_cpu)) {
tsk = idle_task(best_cpu);
if (cpu_curr(best_cpu) == tsk) {
@@ -295,18 +297,18 @@
tsk = target_tsk;
if (tsk) {
if (oldest_idle != -1ULL) {
- best_cpu = tsk->processor;
+ best_cpu = processor_of_tsk(tsk);
goto send_now_idle;
}
tsk->need_resched = 1;
- if (tsk->processor != this_cpu)
- smp_send_reschedule(tsk->processor);
+ if (processor_of_tsk(tsk) != this_cpu)
+ smp_send_reschedule(processor_of_tsk(tsk));
}
return;


#else /* UP */
- int this_cpu = smp_processor_id();
+ const int this_cpu = smp_processor_id();
struct task_struct *tsk;

tsk = cpu_curr(this_cpu);
@@ -559,7 +561,7 @@
if (!current->active_mm) BUG();
need_resched_back:
prev = current;
- this_cpu = prev->processor;
+ this_cpu = processor_of_tsk(prev);

if (unlikely(in_interrupt())) {
printk("Scheduling in interrupt\n");
@@ -1311,7 +1313,7 @@
}
sched_data->curr = current;
sched_data->last_schedule = get_cycles();
- clear_bit(current->processor, &wait_init_idle);
+ clear_bit(processor_of_tsk(current), &wait_init_idle);
}

extern void init_timervecs (void);


Attachments:
patch_prefetch_sched-2.4.17_third.diff (5.79 kB)
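
For reference, __builtin_prefetch takes the address plus two optional
compile-time-constant arguments: per the GCC documentation the second selects
read (0, the default) or write (1), so a write prefetch is spelled
__builtin_prefetch(x, 1), and the third is a temporal-locality hint from 0 (no
reuse expected) to 3 (the default, keep the line in all cache levels). A small
user-space usage sketch with hypothetical names:

/* builtin_prefetch_args.c - usage sketch, not kernel code. */
#include <stddef.h>

static long sum_with_lookahead(const long *a, size_t n)
{
	long s = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		/* Prefetch a line we will read a few iterations from now:
		 * rw = 0 (read), locality = 3 (expect further reuse). */
		if (i + 16 < n)
			__builtin_prefetch(&a[i + 16], 0, 3);
		s += a[i];
	}
	return s;
}

int main(void)
{
	static long data[4096];

	return (int)sum_with_lookahead(data, 4096);
}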

2002-01-11 19:56:59

by Robert Love

Subject: Re: [PATCH] Small optimization for UP in sched and prefetch

On Fri, 2002-01-11 at 05:33, Rainer Keller wrote:

> -/* Prefetch instructions for Pentium III and AMD Athlon */
> -#ifdef CONFIG_MPENTIUMIII
> +/* Prefetch instructions for Pentium III, Pentium 4 and AMD Athlon */
> +#if defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUM4)

If we really intend to check for the AMD Athlon as well, we need to add
CONFIG_MK7, too. Since the Athlon does have this prefetch, it would make
sense. Otherwise, the comment is wrong.

Anyhow, good patch and I can't see it not being safe for 2.4.

Robert Love

2002-01-11 20:03:09

by Dave Jones

Subject: Re: [PATCH] Small optimization for UP in sched and prefetch

On 11 Jan 2002, Robert Love wrote:

> if we really intend to check for the use of the AMD Athlon as well, we
> need to add CONFIG_MK7, too. Since the Athlon does have this prefetch,
> it would make sense. Otherwise, the comment is wrong.

It's handled a few lines further down in a CONFIG_X86_USE_3DNOW block,
which means that CyrixIIIs can use them too.

--
| Dave Jones. http://www.codemonkey.org.uk
| SuSE Labs
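
The CONFIG_X86_USE_3DNOW section referred to above guards the 3DNow! variants
of the instructions, which the Athlon, K6-2/3 and Cyrix III all implement. A
rough sketch of its shape follows; it is an approximation for illustration, not
a verbatim quote of the 2.4.17 header (where it sits as an #elif chained after
the Pentium III/4 block):

/* Approximate shape of the 3DNow! prefetch block in asm-i386/processor.h. */
#ifdef CONFIG_X86_USE_3DNOW

#define ARCH_HAS_PREFETCH
#define ARCH_HAS_PREFETCHW
#define ARCH_HAS_SPINLOCK_PREFETCH

/* 3DNow! prefetch/prefetchw: prefetch for read and for write respectively. */
extern inline void prefetch(const void *x)
{
	__asm__ __volatile__ ("prefetch (%0)" : : "r" (x));
}

extern inline void prefetchw(const void *x)
{
	__asm__ __volatile__ ("prefetchw (%0)" : : "r" (x));
}

#define spin_lock_prefetch(x)	prefetchw(x)

#endif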

2002-01-11 20:04:50

by Robert Love

Subject: Re: [PATCH] Small optimization for UP in sched and prefetch

On Fri, 2002-01-11 at 15:02, Dave Jones wrote:

> It's handled a few lines further down in a CONFIG_X86_USE_3DNOW block,
> which means that CyrixIIIs can use them too.

Ah, my mistake. I should have read the source and not just the patch.

Robert Love