2004-09-18 23:31:58

by Christoph Lameter

Subject: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

Changelog
* Make cmpxchg and cmpxchg8b generally available on i386.
* Provide emulation of cmpxchg suitable for UP if built and
run on a 386.
* Provide emulation of cmpxchg8b suitable for UP if built
and run on a 386 or 486.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linus/include/asm-i386/system.h
===================================================================
--- linus.orig/include/asm-i386/system.h 2004-09-18 14:25:23.000000000 -0700
+++ linus/include/asm-i386/system.h 2004-09-18 14:56:59.000000000 -0700
@@ -203,77 +203,6 @@
__set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
__set_64bit(ptr, ll_low(value), ll_high(value)) )

-/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
- */
-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
-{
- switch (size) {
- case 1:
- __asm__ __volatile__("xchgb %b0,%1"
- :"=q" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 2:
- __asm__ __volatile__("xchgw %w0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 4:
- __asm__ __volatile__("xchgl %0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- }
- return x;
-}
-
-/*
- * Atomic compare and exchange. Compare OLD with MEM, if identical,
- * store NEW in MEM. Return the initial value in MEM. Success is
- * indicated by comparing RETURN with OLD.
- */
-
-#ifdef CONFIG_X86_CMPXCHG
-#define __HAVE_ARCH_CMPXCHG 1
-#endif
-
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
-#define cmpxchg(ptr,o,n)\
- ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
- (unsigned long)(n),sizeof(*(ptr))))
-
#ifdef __KERNEL__
struct alt_instr {
__u8 *instr; /* original instruction */
Index: linus/arch/i386/Kconfig
===================================================================
--- linus.orig/arch/i386/Kconfig 2004-09-18 14:25:23.000000000 -0700
+++ linus/arch/i386/Kconfig 2004-09-18 14:56:59.000000000 -0700
@@ -345,6 +345,11 @@
depends on !M386
default y

+config X86_CMPXCHG8B
+ bool
+ depends on !M386 && !M486
+ default y
+
config X86_XADD
bool
depends on !M386
Index: linus/include/asm-i386/processor.h
===================================================================
--- linus.orig/include/asm-i386/processor.h 2004-09-18 14:25:23.000000000 -0700
+++ linus/include/asm-i386/processor.h 2004-09-18 14:56:59.000000000 -0700
@@ -657,4 +657,137 @@

#define cache_line_size() (boot_cpu_data.x86_cache_alignment)

+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
+ * but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
+static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
+{
+ switch (size) {
+ case 1:
+ __asm__ __volatile__("xchgb %b0,%1"
+ :"=q" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 2:
+ __asm__ __volatile__("xchgw %w0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 4:
+ __asm__ __volatile__("xchgl %0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ }
+ return x;
+}
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+#ifdef CONFIG_X86_CMPXCHG
+#define __HAVE_ARCH_CMPXCHG 1
+#endif
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+#ifndef CONFIG_X86_CMPXCHG
+ /*
+ * Check if the kernel was compiled for an old cpu but the
+ * currently running cpu can do cmpxchg after all
+ */
+ unsigned long flags;
+
+ /* All CPUs except 386 support CMPXCHG */
+ if (cpu_data->x86 > 3) goto have_cmpxchg;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ switch (size) {
+ case 1:
+ prev = * (u8 *)ptr;
+ if (prev == old) *(u8 *)ptr = new;
+ break;
+ case 2:
+ prev = * (u16 *)ptr;
+ if (prev == old) *(u16 *)ptr = new;
+ break;
+ case 4:
+ prev = *(u32 *)ptr;
+ if (prev == old) *(u32 *)ptr = new;
+ break;
+ }
+ local_irq_restore(flags);
+ return prev;
+have_cmpxchg:
+#endif
+ switch (size) {
+ case 1:
+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return prev;
+}
+
+static inline unsigned long long cmpxchg8b(volatile unsigned long long *ptr,
+ unsigned long long old, unsigned long long newv)
+{
+ unsigned long long prev;
+#ifndef CONFIG_X86_CMPXCHG8B
+ unsigned long flags;
+
+ /*
+ * Check if the kernel was compiled for an old cpu but
+ * we are really running on a cpu capable of cmpxchg8b
+ */
+
+ if (cpu_has(cpu_data, X86_FEATURE_CX8)) goto have_cmpxchg8b;
+
+ /* Poor man's cmpxchg8b for 386 and 486. Not suitable for SMP */
+ local_irq_save(flags);
+ prev = *ptr;
+ if (prev == old) *ptr = newv;
+ local_irq_restore(flags);
+ return prev;
+
+have_cmpxchg8b:
+#endif
+
+ __asm__ __volatile__(
+ LOCK_PREFIX "cmpxchg8b %4\n"
+ : "=A" (prev)
+ : "0" (old), "c" ((unsigned long)(newv >> 32)),
+ "b" ((unsigned long)(newv & 0xffffffffLL)), "m" (ptr)
+ : "memory");
+ return prev ;
+}
+
+#define cmpxchg(ptr,o,n)\
+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
+ (unsigned long)(n),sizeof(*(ptr))))
+
#endif /* __ASM_I386_PROCESSOR_H */


2004-09-19 12:12:00

by Andi Kleen

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

On Sun, Sep 19, 2004 at 02:30:37PM +0300, Denis Vlasenko wrote:
> Far too large for inline

It's much smaller than it looks - the switch will be optimized away by the
compiler. For the X86_CMPXCHG case it is even a single instruction.
For the other case it should be < 10 instructions, which is still reasonable.
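
As a minimal sketch of why the switch folds away (the caller below is
hypothetical, not from the patch): sizeof(*(ptr)) is a compile-time constant,
so gcc keeps only the matching case of __cmpxchg().

	/* assumes <asm/system.h>; illustration only */
	static unsigned long counter;

	static int bump_if_unchanged(unsigned long old)
	{
		/* only the 4-byte arm survives; on a CONFIG_X86_CMPXCHG
		   kernel this compiles to a single lock cmpxchgl */
		return cmpxchg(&counter, old, old + 1) == old;
	}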

-Andi

2004-09-20 15:46:30

by Christoph Lameter

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

On Sun, 19 Sep 2004, Denis Vlasenko wrote:

> Far too large for inline
> Ditto.

Umm... The code was inline before, and for the non-80386 case it's the same
size as before.

2004-09-20 20:51:46

by Christoph Lameter

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

On Mon, 20 Sep 2004, Denis Vlasenko wrote:

> I think it shouldn't be this way.
>
> OTOH for !CONFIG_386 case it makes perfect sense to have it inlined.

Would the following revised patch be acceptable?

Index: linux-2.6.9-rc2/include/asm-i386/system.h
===================================================================
--- linux-2.6.9-rc2.orig/include/asm-i386/system.h 2004-09-12 22:31:26.000000000 -0700
+++ linux-2.6.9-rc2/include/asm-i386/system.h 2004-09-20 13:44:49.000000000 -0700
@@ -240,7 +240,24 @@
*/

#ifdef CONFIG_X86_CMPXCHG
+
#define __HAVE_ARCH_CMPXCHG 1
+#define cmpxchg(ptr,o,n)\
+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
+ (unsigned long)(n),sizeof(*(ptr))))
+
+#else
+
+/*
+ * Building a kernel capable of running on the 80386. It may be necessary to
+ * simulate the cmpxchg on the 80386 CPU.
+ */
+
+extern unsigned long cmpxchg_386(volatile void *, unsigned long, unsigned long, int);
+
+#define cmpxchg(ptr,o,n)\
+ ((__typeof__(*(ptr)))cmpxchg_386((ptr),(unsigned long)(o),\
+ (unsigned long)(n),sizeof(*(ptr))))
#endif

static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -270,10 +287,32 @@
return old;
}

-#define cmpxchg(ptr,o,n)\
- ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
- (unsigned long)(n),sizeof(*(ptr))))
-
+static inline unsigned long long __cmpxchg8b(volatile unsigned long long *ptr,
+ unsigned long long old, unsigned long long newv)
+{
+ unsigned long long prev;
+ __asm__ __volatile__(
+ LOCK_PREFIX "cmpxchg8b %4\n"
+ : "=A" (prev)
+ : "0" (old), "c" ((unsigned long)(newv >> 32)),
+ "b" ((unsigned long)(newv & 0xffffffffLL)), "m" (ptr)
+ : "memory");
+ return prev ;
+}
+
+#ifdef CONFIG_X86_CMPXCHG8B
+#define cmpxchg8b __cmpxchg8b
+#else
+/*
+ * Building a kernel capable of running on 80486 and 80386. Both
+ * do not support cmpxchg8b. Call a function that emulates the
+ * instruction if necessary.
+ */
+extern unsigned long long cmpxchg8b_486(unsigned long long *,
+ unsigned long long, unsigned long long);
+#define cmpxchg8b cmpxchg8b_486
+#endif
+
#ifdef __KERNEL__
struct alt_instr {
__u8 *instr; /* original instruction */
Index: linux-2.6.9-rc2/arch/i386/Kconfig
===================================================================
--- linux-2.6.9-rc2.orig/arch/i386/Kconfig 2004-09-20 08:57:47.000000000 -0700
+++ linux-2.6.9-rc2/arch/i386/Kconfig 2004-09-20 10:11:45.000000000 -0700
@@ -345,6 +345,11 @@
depends on !M386
default y

+config X86_CMPXCHG8B
+ bool
+ depends on !M386 && !M486
+ default y
+
config X86_XADD
bool
depends on !M386
Index: linux-2.6.9-rc2/arch/i386/kernel/cpu/intel.c
===================================================================
--- linux-2.6.9-rc2.orig/arch/i386/kernel/cpu/intel.c 2004-09-12 22:31:59.000000000 -0700
+++ linux-2.6.9-rc2/arch/i386/kernel/cpu/intel.c 2004-09-20 13:44:08.000000000 -0700
@@ -415,5 +415,68 @@
return 0;
}

+#ifndef CONFIG_X86_CMPXCHG
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ /*
+ * Check if the kernel was compiled for an old cpu but the
+ * currently running cpu can do cmpxchg after all
+ */
+ unsigned long flags;
+
+ /* All CPUs except 386 support CMPXCHG */
+ if (cpu_data->x86 > 3) return __cmpxchg(ptr, old, new, size);
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ switch (size) {
+ case 1:
+ prev = * (u8 *)ptr;
+ if (prev == old) *(u8 *)ptr = new;
+ break;
+ case 2:
+ prev = * (u16 *)ptr;
+ if (prev == old) *(u16 *)ptr = new;
+ break;
+ case 4:
+ prev = *(u32 *)ptr;
+ if (prev == old) *(u32 *)ptr = new;
+ break;
+ }
+ local_irq_restore(flags);
+ return prev;
+}
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG8B
+unsigned long long cmpxchg8b_486(unsigned long long *ptr,
+ unsigned long long old, unsigned long long newv)
+{
+ unsigned long long prev;
+ unsigned long flags;
+
+ /*
+ * Check if the kernel was compiled for an old cpu but
+ * we are really running on a cpu capable of cmpxchg8b
+ */
+
+ if (cpu_has(cpu_data, X86_FEATURE_CX8)) return __cmpxchg8b(ptr, old, newv);
+
+ /* Poor man's cmpxchg8b for 386 and 486. Not suitable for SMP */
+ local_irq_save(flags);
+ prev = *ptr;
+ if (prev == old) *ptr = newv;
+ local_irq_restore(flags);
+ return prev;
+}
+#endif
+
// arch_initcall(intel_cpu_init);

2004-09-20 21:01:26

by Andi Kleen

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

On Mon, Sep 20, 2004 at 01:49:20PM -0700, Christoph Lameter wrote:
> On Mon, 20 Sep 2004, Denis Vlasenko wrote:
>
> > I think it shouldn't be this way.
> >
> > OTOH for !CONFIG_386 case it makes perfect sense to have it inlined.
>
> Would the following revised patch be acceptable?

You would need an EXPORT_SYMBOL at least. But to be honest your
original patch was much simpler and nicer and cmpxchg is not called
that often that it really matters. I would just ignore Denis'
suggestion and stay with the old patch.

-Andi

2004-09-21 15:45:54

by Andi Kleen

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

On Tue, Sep 21, 2004 at 06:41:25PM +0300, Denis Vlasenko wrote:
> On Monday 20 September 2004 23:57, Andi Kleen wrote:
> > On Mon, Sep 20, 2004 at 01:49:20PM -0700, Christoph Lameter wrote:
> > > On Mon, 20 Sep 2004, Denis Vlasenko wrote:
> > >
> > > > I think it shouldn't be this way.
> > > >
> > > > OTOH for !CONFIG_386 case it makes perfect sense to have it inlined.
> > >
> > > Would the following revised patch be acceptable?
> >
> > You would need an EXPORT_SYMBOL at least. But to be honest your
> > original patch was much simpler and nicer and cmpxchg is not called
> > that often that it really matters. I would just ignore Denis'
> > suggestion and stay with the old patch.
>
> A bit faster approach (for CONFIG_386 case) would be using

It's actually slower. Many x86 CPUs cannot predict indirect jumps
and those that do cannot predict them as well as a test and jump.

-Andi

2004-09-21 20:14:37

by Andi Kleen

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

> Looks like indirect jump is only slightly slower (on this CPU).

K7/K8 can predict indirect jumps. But most P3 and P4s can't (except for
the new Prescotts and Centrinos). And in all cases their jump predictor works
worse.

-Andi

2004-09-23 07:18:01

by Andy Lutomirski

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

Andi Kleen wrote:
> On Tue, Sep 21, 2004 at 06:41:25PM +0300, Denis Vlasenko wrote:
>
>>On Monday 20 September 2004 23:57, Andi Kleen wrote:
>>
>>>On Mon, Sep 20, 2004 at 01:49:20PM -0700, Christoph Lameter wrote:
>>>
>>>>On Mon, 20 Sep 2004, Denis Vlasenko wrote:
>>>>
>>>>
>>>>>I think it shouldn't be this way.
>>>>>
>>>>>OTOH for !CONFIG_386 case it makes perfect sense to have it inlined.
>>>>
>>>>Would the following revised patch be acceptable?
>>>
>>>You would need an EXPORT_SYMBOL at least. But to be honest your
>>>original patch was much simpler and nicer and cmpxchg is not called
>>>that often that it really matters. I would just ignore Denis'
>>>suggestion and stay with the old patch.
>>
>>A bit faster approach (for CONFIG_386 case) would be using
>
>
> It's actually slower. Many x86 CPUs cannot predict indirect jumps
> and those that do cannot predict them as well as a test and jump.

Wouldn't alternative_input() choosing between a cmpxchg and a call be
the way to go here? Or is the overhead too high in an inline function?

(No patch included since I don't pretend to understand gcc's asm syntax
at all.)

--Andy

2004-09-23 09:03:55

by Andi Kleen

Subject: Re: page fault scalability patch V8: [4/7] universally available cmpxchg on i386

> Wouldn't alternative_input() choosing between a cmpxchg and a call be
> the way to go here? Or is the overhead too high in an inline function?

It would, if you want the absolute micro-optimization, yes. The disadvantage
is that you would waste some more space on nops in the !CONFIG_I386 case.
I personally don't think it matters much; Christoph's original
code was just fine.

-Andi

2004-09-27 19:07:22

by Christoph Lameter

Subject: page fault scalability patch V9: [0/7] overview

Signed-off-by: Christoph Lameter <[email protected]>

Changes from V8->V9 of this patch:
- Verify that mm->rss is changed to atomic on all arches
- Fixes to the i386 cmpxchg support. Make it as small as possible
by using a function instead of inlining.
- Patches against 2.6.9-rc2-bk15

This is a series of patches that increases the scalability of
the page fault handler for SMP. Typical performance increases in the page
fault rate are:

2 CPUs -> 10%
4 CPUs -> 50%
8 CPUs -> 70%

With a high number of CPUs (16..512) we are seeing the page fault rate
roughly doubling.

The performance increase is accomplished by avoiding the use of the
page_table_lock spinlock (but not mm->mmap_sem!) through new atomic
operations on pte's (ptep_xchg, ptep_cmpxchg) and on pmd and pgd's
(pgd_test_and_populate, pmd_test_and_populate).
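
The core idea, roughly sketched (simplified from the do_anonymous_page()
changes in patch 2/7 below; error paths, pte_unmap and page locking omitted):
build the new pte value first and install it only if the entry is still
unchanged, instead of holding the page_table_lock across the fault.

	entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)), vma);

	if (!ptep_cmpxchg(vma, addr, page_table, orig_entry, entry)) {
		/* someone else instantiated the pte first; back out */
		page_cache_release(page);
		return VM_FAULT_MINOR;
	}
	lru_cache_add_active(page);
	page_add_anon_rmap(page, vma, addr);
	atomic_inc(&mm->mm_rss);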

The page table lock can be avoided in the following situations:

1. An empty pte or pmd entry is populated

This is safe since the swapper may only depopulate them and the
swapper code has been changed to never set a pte to be empty until the
page has been evicted. The population of an empty pte is frequent
if a process touches newly allocated memory.

2. Modifications of flags in a pte entry (write/accessed).

These modifications are done by the CPU or by low level handlers
on various platforms also bypassing the page_table_lock. So this
seems to be safe too.

The patchset is composed of 7 patches:

1/7: Make mm->rss atomic

The page table lock is used to protect mm->rss and the first patch
makes rss atomic so that it may be changed without holding the
page_table_lock.
Generic atomic variables are only 32 bit under Linux. However, 32 bits is
sufficient for rss even on a 64 bit machine since rss counts pages, which
still allows up to 2^(31+12) = 8 terabytes of memory to be in use by a
single process. A 64 bit atomic would of course be better.

2/7: Avoid page_table_lock in handle_mm_fault

This patch defers the acquisition of the page_table_lock as much as
possible and uses atomic operations for allocating anonymous memory.
These atomic operations are simulated by acquiring the page_table_lock
for very small time frames if an architecture does not define
__HAVE_ARCH_ATOMIC_TABLE_OPS. It also changes the swapper so that a
pte will not be set to empty if a page is in transition to swap.
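
For architectures without such operations, the generic fallback (from the
asm-generic/pgtable.h part of this patch, shown in full further below) is
essentially:

	#ifndef __HAVE_ARCH_PTEP_CMPXCHG
	#define ptep_cmpxchg(__vma, __addr, __ptep, __oldval, __newval)	\
	({								\
		int __rc;						\
		spin_lock(&__vma->vm_mm->page_table_lock);		\
		__rc = pte_same(*(__ptep), __oldval);			\
		if (__rc) set_pte(__ptep, __newval);			\
		spin_unlock(&__vma->vm_mm->page_table_lock);		\
		__rc;							\
	})
	#endif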

If only the first two patches are applied then the time that the page_table_lock
is held is simply reduced. The lock may then be acquired multiple
times during a page fault.

The remaining patches introduce the necessary atomic pte operations to avoid
the page_table_lock.

3/7: Atomic pte operations for ia64

This patch adds atomic pte operations to the IA64 platform. The page_table_lock
will then no longer be acquired for page faults that create pte's for anonymous
memory.

4/7: Make cmpxchg generally available on i386

The atomic operations on the page table rely heavily on cmpxchg instructions.
This patch adds emulations for cmpxchg and cmpxchg8b for old 80386 and 80486
cpus. The emulations are only included if a kernel is built for these old
cpus. The emulation is skipped in favor of the real cmpxchg instructions if a
kernel built for a 386 or 486 is then run on a more recent cpu.

This patch may be used independently of the other patches.

5/7: Atomic pte operations for i386

Add atomic PTE operations for i386. A generally available cmpxchg (last patch)
must be available for this patch.

6/7: Atomic pte operation for x86_64

Add atomic pte operations for x86_64. This has not been tested yet since I have
no x86_64 machine.

7/7: Atomic pte operations for s390

Add atomic PTE operations for S390. This has also not been tested yet since I have
no S/390 available. Feedback from the S/390 people seems to indicate though that
the way it is done is fine.

2004-10-15 19:05:34

by Christoph Lameter

Subject: page fault scalability patch V10: [1/7] make rss atomic

Changelog
* Make mm->rss atomic, so that rss may be incremented or decremented
without holding the page table lock.
* Prerequisite for page table scalability patch

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.9-rc4/kernel/fork.c
===================================================================
--- linux-2.6.9-rc4.orig/kernel/fork.c 2004-10-10 19:57:03.000000000 -0700
+++ linux-2.6.9-rc4/kernel/fork.c 2004-10-14 12:22:14.000000000 -0700
@@ -296,7 +296,7 @@
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
mm->map_count = 0;
- mm->rss = 0;
+ atomic_set(&mm->mm_rss, 0);
cpus_clear(mm->cpu_vm_mask);
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
Index: linux-2.6.9-rc4/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc4.orig/include/linux/sched.h 2004-10-10 19:57:03.000000000 -0700
+++ linux-2.6.9-rc4/include/linux/sched.h 2004-10-14 12:22:14.000000000 -0700
@@ -213,9 +213,10 @@
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
+ atomic_t mm_rss; /* Number of pages used by this mm struct */
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
- spinlock_t page_table_lock; /* Protects task page tables and mm->rss */
+ spinlock_t page_table_lock; /* Protects task page tables */

struct list_head mmlist; /* List of all active mm's. These are globally strung
* together off init_mm.mmlist, and are protected
@@ -225,7 +226,7 @@
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
- unsigned long rss, total_vm, locked_vm, shared_vm;
+ unsigned long total_vm, locked_vm, shared_vm;
unsigned long exec_vm, stack_vm, reserved_vm, def_flags;

unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
Index: linux-2.6.9-rc4/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.9-rc4.orig/fs/proc/task_mmu.c 2004-10-10 19:57:06.000000000 -0700
+++ linux-2.6.9-rc4/fs/proc/task_mmu.c 2004-10-14 12:22:14.000000000 -0700
@@ -21,7 +21,7 @@
"VmLib:\t%8lu kB\n",
(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
- mm->rss << (PAGE_SHIFT-10),
+ (unsigned long)atomic_read(&mm->mm_rss) << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib);
return buffer;
@@ -38,7 +38,7 @@
*shared = mm->shared_vm;
*text = (mm->end_code - mm->start_code) >> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm - *text;
- *resident = mm->rss;
+ *resident = atomic_read(&mm->mm_rss);
return mm->total_vm;
}

Index: linux-2.6.9-rc4/mm/mmap.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/mmap.c 2004-10-10 19:58:06.000000000 -0700
+++ linux-2.6.9-rc4/mm/mmap.c 2004-10-14 12:22:14.000000000 -0700
@@ -1847,7 +1847,7 @@
vma = mm->mmap;
mm->mmap = mm->mmap_cache = NULL;
mm->mm_rb = RB_ROOT;
- mm->rss = 0;
+ atomic_set(&mm->mm_rss, 0);
mm->total_vm = 0;
mm->locked_vm = 0;

Index: linux-2.6.9-rc4/include/asm-generic/tlb.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-generic/tlb.h 2004-10-10 19:56:36.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-generic/tlb.h 2004-10-14 12:22:14.000000000 -0700
@@ -88,11 +88,11 @@
{
int freed = tlb->freed;
struct mm_struct *mm = tlb->mm;
- int rss = mm->rss;
+ int rss = atomic_read(&mm->mm_rss);

if (rss < freed)
freed = rss;
- mm->rss = rss - freed;
+ atomic_set(&mm->mm_rss, rss - freed);
tlb_flush_mmu(tlb, start, end);

/* keep the page table cache within bounds */
Index: linux-2.6.9-rc4/fs/binfmt_flat.c
===================================================================
--- linux-2.6.9-rc4.orig/fs/binfmt_flat.c 2004-10-10 19:56:36.000000000 -0700
+++ linux-2.6.9-rc4/fs/binfmt_flat.c 2004-10-14 12:22:14.000000000 -0700
@@ -650,7 +650,7 @@
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
}

if (flags & FLAT_FLAG_KTRACE)
Index: linux-2.6.9-rc4/fs/exec.c
===================================================================
--- linux-2.6.9-rc4.orig/fs/exec.c 2004-10-10 19:57:30.000000000 -0700
+++ linux-2.6.9-rc4/fs/exec.c 2004-10-14 12:22:14.000000000 -0700
@@ -319,7 +319,7 @@
pte_unmap(pte);
goto out;
}
- mm->rss++;
+ atomic_inc(&mm->mm_rss);
lru_cache_add_active(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
Index: linux-2.6.9-rc4/mm/memory.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/memory.c 2004-10-10 19:57:50.000000000 -0700
+++ linux-2.6.9-rc4/mm/memory.c 2004-10-14 12:22:14.000000000 -0700
@@ -325,7 +325,7 @@
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
get_page(page);
- dst->rss++;
+ atomic_inc(&dst->mm_rss);
set_pte(dst_pte, pte);
page_dup_rmap(page);
cont_copy_pte_range_noset:
@@ -1096,7 +1096,7 @@
page_table = pte_offset_map(pmd, address);
if (likely(pte_same(*page_table, pte))) {
if (PageReserved(old_page))
- ++mm->rss;
+ atomic_inc(&mm->mm_rss);
else
page_remove_rmap(old_page);
break_cow(vma, new_page, address, page_table);
@@ -1378,7 +1378,7 @@
if (vm_swap_full())
remove_exclusive_swap_page(page);

- mm->rss++;
+ atomic_inc(&mm->mm_rss);
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1443,7 +1443,7 @@
spin_unlock(&mm->page_table_lock);
goto out;
}
- mm->rss++;
+ atomic_inc(&mm->mm_rss);
entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
vma->vm_page_prot)),
vma);
@@ -1552,7 +1552,7 @@
/* Only go through if we didn't race with anybody else... */
if (pte_none(*page_table)) {
if (!PageReserved(new_page))
- ++mm->rss;
+ atomic_inc(&mm->mm_rss);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
Index: linux-2.6.9-rc4/fs/binfmt_som.c
===================================================================
--- linux-2.6.9-rc4.orig/fs/binfmt_som.c 2004-10-10 19:57:30.000000000 -0700
+++ linux-2.6.9-rc4/fs/binfmt_som.c 2004-10-14 12:22:14.000000000 -0700
@@ -259,7 +259,7 @@
create_som_tables(bprm);

current->mm->start_stack = bprm->p;
- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);

#if 0
printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
Index: linux-2.6.9-rc4/mm/fremap.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/fremap.c 2004-10-10 19:56:40.000000000 -0700
+++ linux-2.6.9-rc4/mm/fremap.c 2004-10-14 12:22:14.000000000 -0700
@@ -38,7 +38,7 @@
set_page_dirty(page);
page_remove_rmap(page);
page_cache_release(page);
- mm->rss--;
+ atomic_dec(&mm->mm_rss);
}
}
} else {
@@ -86,7 +86,7 @@

zap_pte(mm, vma, addr, pte);

- mm->rss++;
+ atomic_inc(&mm->mm_rss);
flush_icache_page(vma, page);
set_pte(pte, mk_pte(page, prot));
page_add_file_rmap(page);
Index: linux-2.6.9-rc4/mm/swapfile.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/swapfile.c 2004-10-10 19:57:07.000000000 -0700
+++ linux-2.6.9-rc4/mm/swapfile.c 2004-10-14 12:22:14.000000000 -0700
@@ -430,7 +430,7 @@
unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
swp_entry_t entry, struct page *page)
{
- vma->vm_mm->rss++;
+ atomic_inc(&vma->vm_mm->mm_rss);
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
page_add_anon_rmap(page, vma, address);
Index: linux-2.6.9-rc4/fs/binfmt_aout.c
===================================================================
--- linux-2.6.9-rc4.orig/fs/binfmt_aout.c 2004-10-10 19:57:06.000000000 -0700
+++ linux-2.6.9-rc4/fs/binfmt_aout.c 2004-10-14 12:22:14.000000000 -0700
@@ -309,7 +309,7 @@
(current->mm->start_brk = N_BSSADDR(ex));
current->mm->free_area_cache = current->mm->mmap_base;

- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
current->mm->mmap = NULL;
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
Index: linux-2.6.9-rc4/arch/ia64/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/ia64/mm/hugetlbpage.c 2004-10-10 19:57:59.000000000 -0700
+++ linux-2.6.9-rc4/arch/ia64/mm/hugetlbpage.c 2004-10-14 12:22:14.000000000 -0700
@@ -65,7 +65,7 @@
{
pte_t entry;

- mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss);
if (write_access) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
@@ -108,7 +108,7 @@
ptepage = pte_page(entry);
get_page(ptepage);
set_pte(dst_pte, entry);
- dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &dst->mm_rss);
addr += HPAGE_SIZE;
}
return 0;
@@ -249,7 +249,7 @@
put_page(page);
pte_clear(pte);
}
- mm->rss -= (end - start) >> PAGE_SHIFT;
+ atomic_sub((end - start) >> PAGE_SHIFT, &mm->mm_rss);
flush_tlb_range(vma, start, end);
}

Index: linux-2.6.9-rc4/fs/proc/array.c
===================================================================
--- linux-2.6.9-rc4.orig/fs/proc/array.c 2004-10-10 19:58:06.000000000 -0700
+++ linux-2.6.9-rc4/fs/proc/array.c 2004-10-14 12:22:14.000000000 -0700
@@ -388,7 +388,7 @@
jiffies_to_clock_t(task->it_real_value),
start_time,
vsize,
- mm ? mm->rss : 0, /* you might want to shift this left 3 */
+ mm ? (unsigned long)atomic_read(&mm->mm_rss) : 0, /* you might want to shift this left 3 */
task->rlim[RLIMIT_RSS].rlim_cur,
mm ? mm->start_code : 0,
mm ? mm->end_code : 0,
Index: linux-2.6.9-rc4/fs/binfmt_elf.c
===================================================================
--- linux-2.6.9-rc4.orig/fs/binfmt_elf.c 2004-10-10 19:57:50.000000000 -0700
+++ linux-2.6.9-rc4/fs/binfmt_elf.c 2004-10-14 12:22:14.000000000 -0700
@@ -716,7 +716,7 @@

/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
current->mm->free_area_cache = current->mm->mmap_base;
retval = setup_arg_pages(bprm, executable_stack);
if (retval < 0) {
Index: linux-2.6.9-rc4/mm/rmap.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/rmap.c 2004-10-10 19:58:49.000000000 -0700
+++ linux-2.6.9-rc4/mm/rmap.c 2004-10-14 12:22:14.000000000 -0700
@@ -262,7 +262,7 @@
pte_t *pte;
int referenced = 0;

- if (!mm->rss)
+ if (!atomic_read(&mm->mm_rss))
goto out;
address = vma_address(page, vma);
if (address == -EFAULT)
@@ -501,7 +501,7 @@
pte_t pteval;
int ret = SWAP_AGAIN;

- if (!mm->rss)
+ if (!atomic_read(&mm->mm_rss))
goto out;
address = vma_address(page, vma);
if (address == -EFAULT)
@@ -580,7 +580,7 @@
BUG_ON(pte_file(*pte));
}

- mm->rss--;
+ atomic_dec(&mm->mm_rss);
page_remove_rmap(page);
page_cache_release(page);

@@ -680,7 +680,7 @@

page_remove_rmap(page);
page_cache_release(page);
- mm->rss--;
+ atomic_dec(&mm->mm_rss);
(*mapcount)--;
}

@@ -779,7 +779,7 @@
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
continue;
cursor = (unsigned long) vma->vm_private_data;
- while (vma->vm_mm->rss &&
+ while (atomic_read(&vma->vm_mm->mm_rss) &&
cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
try_to_unmap_cluster(cursor, &mapcount, vma);
Index: linux-2.6.9-rc4/include/asm-ia64/tlb.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-ia64/tlb.h 2004-10-10 19:57:44.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-ia64/tlb.h 2004-10-14 12:22:14.000000000 -0700
@@ -161,11 +161,11 @@
{
unsigned long freed = tlb->freed;
struct mm_struct *mm = tlb->mm;
- unsigned long rss = mm->rss;
+ unsigned long rss = atomic_read(&mm->mm_rss);

if (rss < freed)
freed = rss;
- mm->rss = rss - freed;
+ atomic_sub(freed, &mm->mm_rss);
/*
* Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
* tlb->end_addr.
Index: linux-2.6.9-rc4/include/asm-arm/tlb.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-arm/tlb.h 2004-10-10 19:56:40.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-arm/tlb.h 2004-10-14 12:22:14.000000000 -0700
@@ -54,11 +54,11 @@
{
struct mm_struct *mm = tlb->mm;
unsigned long freed = tlb->freed;
- int rss = mm->rss;
+ int rss = atomic_read(&mm->mm_rss);

if (rss < freed)
freed = rss;
- mm->rss = rss - freed;
+ atomic_sub(freed, &mm->mm_rss);

if (freed) {
flush_tlb_mm(mm);
Index: linux-2.6.9-rc4/include/asm-arm26/tlb.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-arm26/tlb.h 2004-10-10 19:58:56.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-arm26/tlb.h 2004-10-14 12:22:14.000000000 -0700
@@ -35,11 +35,11 @@
{
struct mm_struct *mm = tlb->mm;
unsigned long freed = tlb->freed;
- int rss = mm->rss;
+ int rss = atomic_read(&mm->mm_rss);

if (rss < freed)
freed = rss;
- mm->rss = rss - freed;
+ atomic_sub(freed, &mm->mm_rss);

if (freed) {
flush_tlb_mm(mm);
Index: linux-2.6.9-rc4/include/asm-sparc64/tlb.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-sparc64/tlb.h 2004-10-10 19:58:24.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-sparc64/tlb.h 2004-10-14 12:22:14.000000000 -0700
@@ -80,11 +80,11 @@
{
unsigned long freed = mp->freed;
struct mm_struct *mm = mp->mm;
- unsigned long rss = mm->rss;
+ unsigned long rss = atomic_read(&mm->mm_rss);

if (rss < freed)
freed = rss;
- mm->rss = rss - freed;
+ atomic_sub(freed, &mm->mm_rss);

tlb_flush_mmu(mp);

Index: linux-2.6.9-rc4/arch/sh/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/sh/mm/hugetlbpage.c 2004-10-10 19:58:06.000000000 -0700
+++ linux-2.6.9-rc4/arch/sh/mm/hugetlbpage.c 2004-10-14 12:22:14.000000000 -0700
@@ -62,7 +62,7 @@
unsigned long i;
pte_t entry;

- mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss);

if (write_access)
entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
@@ -115,7 +115,7 @@
pte_val(entry) += PAGE_SIZE;
dst_pte++;
}
- dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &dst->mm_rss);
addr += HPAGE_SIZE;
}
return 0;
@@ -206,7 +206,7 @@
pte++;
}
}
- mm->rss -= (end - start) >> PAGE_SHIFT;
+ atomic_sub((end - start) >> PAGE_SHIFT, &mm->mm_rss);
flush_tlb_range(vma, start, end);
}

Index: linux-2.6.9-rc4/arch/x86_64/ia32/ia32_aout.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/x86_64/ia32/ia32_aout.c 2004-10-10 19:58:56.000000000 -0700
+++ linux-2.6.9-rc4/arch/x86_64/ia32/ia32_aout.c 2004-10-14 12:22:14.000000000 -0700
@@ -309,7 +309,7 @@
(current->mm->start_brk = N_BSSADDR(ex));
current->mm->free_area_cache = TASK_UNMAPPED_BASE;

- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
current->mm->mmap = NULL;
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
Index: linux-2.6.9-rc4/arch/ppc64/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/ppc64/mm/hugetlbpage.c 2004-10-10 19:57:59.000000000 -0700
+++ linux-2.6.9-rc4/arch/ppc64/mm/hugetlbpage.c 2004-10-14 12:22:14.000000000 -0700
@@ -125,7 +125,7 @@
hugepte_t entry;
int i;

- mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss);
entry = mk_hugepte(page, write_access);
for (i = 0; i < HUGEPTE_BATCH_SIZE; i++)
set_hugepte(ptep+i, entry);
@@ -287,7 +287,7 @@
/* This is the first hugepte in a batch */
ptepage = hugepte_page(entry);
get_page(ptepage);
- dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &dst->mm_rss);
}
set_hugepte(dst_pte, entry);

@@ -410,7 +410,7 @@
}
put_cpu();

- mm->rss -= (end - start) >> PAGE_SHIFT;
+ atomic_sub((end - start) >> PAGE_SHIFT, &mm->mm_rss);
}

int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
Index: linux-2.6.9-rc4/arch/sh64/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/sh64/mm/hugetlbpage.c 2004-10-10 19:57:30.000000000 -0700
+++ linux-2.6.9-rc4/arch/sh64/mm/hugetlbpage.c 2004-10-14 12:22:14.000000000 -0700
@@ -62,7 +62,7 @@
unsigned long i;
pte_t entry;

- mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss);

if (write_access)
entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
@@ -115,7 +115,7 @@
pte_val(entry) += PAGE_SIZE;
dst_pte++;
}
- dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &dst->mm_rss);
addr += HPAGE_SIZE;
}
return 0;
@@ -206,7 +206,7 @@
pte++;
}
}
- mm->rss -= (end - start) >> PAGE_SHIFT;
+ atomic_sub((end - start) >> PAGE_SHIFT, &mm->mm_rss);
flush_tlb_range(vma, start, end);
}

Index: linux-2.6.9-rc4/arch/sparc64/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/sparc64/mm/hugetlbpage.c 2004-10-10 19:58:07.000000000 -0700
+++ linux-2.6.9-rc4/arch/sparc64/mm/hugetlbpage.c 2004-10-14 12:22:14.000000000 -0700
@@ -59,7 +59,7 @@
unsigned long i;
pte_t entry;

- mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss);

if (write_access)
entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
@@ -112,7 +112,7 @@
pte_val(entry) += PAGE_SIZE;
dst_pte++;
}
- dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &dst->mm_rss);
addr += HPAGE_SIZE;
}
return 0;
@@ -203,7 +203,7 @@
pte++;
}
}
- mm->rss -= (end - start) >> PAGE_SHIFT;
+ atomic_sub((end - start) >> PAGE_SHIFT, &mm->mm_rss);
flush_tlb_range(vma, start, end);
}

Index: linux-2.6.9-rc4/arch/mips/kernel/irixelf.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/mips/kernel/irixelf.c 2004-10-10 19:58:56.000000000 -0700
+++ linux-2.6.9-rc4/arch/mips/kernel/irixelf.c 2004-10-14 12:22:14.000000000 -0700
@@ -686,7 +686,7 @@
/* Do this so that we can load the interpreter, if need be. We will
* change some of these later.
*/
- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
setup_arg_pages(bprm, EXSTACK_DEFAULT);
current->mm->start_stack = bprm->p;

Index: linux-2.6.9-rc4/arch/m68k/atari/stram.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/m68k/atari/stram.c 2004-10-10 19:58:24.000000000 -0700
+++ linux-2.6.9-rc4/arch/m68k/atari/stram.c 2004-10-14 12:22:14.000000000 -0700
@@ -635,7 +635,7 @@
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
get_page(page);
- ++vma->vm_mm->rss;
+ atomic_inc(&vma->vm_mm->mm_rss);
}

static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir,
Index: linux-2.6.9-rc4/arch/i386/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/i386/mm/hugetlbpage.c 2004-10-10 19:58:49.000000000 -0700
+++ linux-2.6.9-rc4/arch/i386/mm/hugetlbpage.c 2004-10-14 12:22:14.000000000 -0700
@@ -42,7 +42,7 @@
{
pte_t entry;

- mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss);
if (write_access) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
@@ -82,7 +82,7 @@
ptepage = pte_page(entry);
get_page(ptepage);
set_pte(dst_pte, entry);
- dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &dst->mm_rss);
addr += HPAGE_SIZE;
}
return 0;
@@ -218,7 +218,7 @@
page = pte_page(pte);
put_page(page);
}
- mm->rss -= (end - start) >> PAGE_SHIFT;
+ atomic_sub((end - start) >> PAGE_SHIFT, &mm->mm_rss);
flush_tlb_range(vma, start, end);
}

Index: linux-2.6.9-rc4/arch/sparc64/kernel/binfmt_aout32.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/sparc64/kernel/binfmt_aout32.c 2004-10-10 19:57:59.000000000 -0700
+++ linux-2.6.9-rc4/arch/sparc64/kernel/binfmt_aout32.c 2004-10-14 12:22:14.000000000 -0700
@@ -239,7 +239,7 @@
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));

- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
current->mm->mmap = NULL;
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;

2004-10-15 19:07:13

by Christoph Lameter

Subject: page fault scalability patch V10: [2/7] defer/omit taking page_table_lock

Changelog
* Increase parallelism in SMP configurations by deferring
the acquisition of page_table_lock in handle_mm_fault
* Anonymous memory page faults bypass the page_table_lock
through the use of atomic page table operations
* Swapper does not set pte to empty in transition to swap
* Simulate atomic page table operations using the
page_table_lock if an arch does not define
__HAVE_ARCH_ATOMIC_TABLE_OPS. This still provides
a performance benefit since the page_table_lock
is held for shorter periods of time.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.9-rc4/mm/memory.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/memory.c 2004-10-14 12:22:14.000000000 -0700
+++ linux-2.6.9-rc4/mm/memory.c 2004-10-14 12:22:14.000000000 -0700
@@ -1314,8 +1314,7 @@
}

/*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We hold the mm semaphore
*/
static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
@@ -1327,15 +1326,13 @@
int ret = VM_FAULT_MINOR;

pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte while
- * we released the page table lock.
+ * Back out if somebody else faulted in this pte
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1406,14 +1403,12 @@
}

/*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
+ * We are called with the MM semaphore held.
*/
static int
do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+ unsigned long addr, pte_t orig_entry)
{
pte_t entry;
struct page * page = ZERO_PAGE(addr);
@@ -1425,7 +1420,6 @@
if (write_access) {
/* Allocate our own private page. */
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
@@ -1434,30 +1428,39 @@
goto no_mem;
clear_user_highpage(page, addr);

- spin_lock(&mm->page_table_lock);
+ lock_page(page);
page_table = pte_offset_map(pmd, addr);

- if (!pte_none(*page_table)) {
- pte_unmap(page_table);
- page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
- goto out;
- }
- atomic_inc(&mm->mm_rss);
entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
vma->vm_page_prot)),
vma);
- lru_cache_add_active(page);
mark_page_accessed(page);
- page_add_anon_rmap(page, vma, addr);
}

- set_pte(page_table, entry);
+ /* update the entry */
+ if (!ptep_cmpxchg(vma, addr, page_table, orig_entry, entry)) {
+ if (write_access) {
+ pte_unmap(page_table);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ goto out;
+ }
+ if (write_access) {
+ /*
+ * The following two functions are safe to use without
+ * the page_table_lock but do they need to come before
+ * the cmpxchg?
+ */
+ lru_cache_add_active(page);
+ page_add_anon_rmap(page, vma, addr);
+ atomic_inc(&mm->mm_rss);
+ unlock_page(page);
+ }
pte_unmap(page_table);

/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
- spin_unlock(&mm->page_table_lock);
out:
return VM_FAULT_MINOR;
no_mem:
@@ -1473,12 +1476,12 @@
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * This is called with the MM semaphore held.
*/
static int
do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+ unsigned long address, int write_access, pte_t *page_table,
+ pmd_t *pmd, pte_t orig_entry)
{
struct page * new_page;
struct address_space *mapping = NULL;
@@ -1489,9 +1492,8 @@

if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
+ pmd, write_access, address, orig_entry);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);

if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -1589,7 +1591,7 @@
* nonlinear vmas.
*/
static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+ unsigned long address, int write_access, pte_t *pte, pmd_t *pmd, pte_t entry)
{
unsigned long pgoff;
int err;
@@ -1602,13 +1604,12 @@
if (!vma->vm_ops || !vma->vm_ops->populate ||
(write_access && !(vma->vm_flags & VM_SHARED))) {
pte_clear(pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
}

pgoff = pte_to_pgoff(*pte);

pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);

err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
@@ -1627,49 +1628,49 @@
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
+ * Note that kswapd only ever _removes_ pages, never adds them.
+ * We need to ensure that this case is handled properly.
+ *
* The adding of pages is protected by the MM semaphore (which we hold),
* so we don't need to worry about a page being suddenly been added into
* our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t *pte, pmd_t *pmd)
{
pte_t entry;
+ pte_t new_entry;

entry = *pte;
if (!pte_present(entry)) {
/*
* If it truly wasn't present, we know that kswapd
* and the PTE updates will not touch it later. So
- * drop the lock.
+ * no need to acquire the page_table_lock.
*/
if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
+ return do_file_page(mm, vma, address, write_access, pte, pmd, entry);
return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
}

+ /*
+ * This is the case in which we may only update some bits in the pte.
+ */
+ new_entry = pte_mkyoung(entry);
if (write_access) {
- if (!pte_write(entry))
+ if (!pte_write(entry)) {
+ /* do_wp_page expects us to hold the page_table_lock */
+ spin_lock(&mm->page_table_lock);
return do_wp_page(mm, vma, address, pte, pmd, entry);
-
- entry = pte_mkdirty(entry);
+ }
+ new_entry = pte_mkdirty(new_entry);
}
- entry = pte_mkyoung(entry);
- ptep_set_access_flags(vma, address, pte, entry, write_access);
- update_mmu_cache(vma, address, entry);
+ if (ptep_cmpxchg(vma, address, pte, entry, new_entry))
+ update_mmu_cache(vma, address, new_entry);
pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
return VM_FAULT_MINOR;
}

@@ -1687,22 +1688,42 @@

inc_page_state(pgfault);

- if (is_vm_hugetlb_page(vma))
+ if (unlikely(is_vm_hugetlb_page(vma)))
return VM_FAULT_SIGBUS; /* mapping truncation does this. */

/*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
+ * We rely on the mmap_sem and the SMP-safe atomic PTE updates
+ * to synchronize with kswapd.
*/
- spin_lock(&mm->page_table_lock);
- pmd = pmd_alloc(mm, pgd, address);
+ if (unlikely(pgd_none(*pgd))) {
+ pmd_t *new = pmd_alloc_one(mm, address);
+ if (!new) return VM_FAULT_OOM;
+
+ /* Ensure that the update is done in an atomic way */
+ if (!pgd_test_and_populate(mm, pgd, new)) pmd_free(new);
+ }
+
+ pmd = pmd_offset(pgd, address);
+
+ if (likely(pmd)) {
+ pte_t *pte;
+
+ if (!pmd_present(*pmd)) {
+ struct page *new;

- if (pmd) {
- pte_t * pte = pte_alloc_map(mm, pmd, address);
- if (pte)
+ new = pte_alloc_one(mm, address);
+ if (!new) return VM_FAULT_OOM;
+
+ if (!pmd_test_and_populate(mm, pmd, new))
+ pte_free(new);
+ else
+ inc_page_state(nr_page_table_pages);
+ }
+
+ pte = pte_offset_map(pmd, address);
+ if (likely(pte))
return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
}
- spin_unlock(&mm->page_table_lock);
return VM_FAULT_OOM;
}

Index: linux-2.6.9-rc4/include/asm-generic/pgtable.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-generic/pgtable.h 2004-10-10 19:57:30.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-generic/pgtable.h 2004-10-14 12:22:14.000000000 -0700
@@ -126,4 +126,75 @@
#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr)
#endif

+#ifndef __HAVE_ARCH_ATOMIC_TABLE_OPS
+/*
+ * If atomic page table operations are not available then use
+ * the page_table_lock to ensure some form of locking.
+ * Note though that low level operations as well as the
+ * page table handling of the cpu may bypass all locking.
+ */
+
+#ifndef __HAVE_ARCH_PTEP_XCHG_FLUSH
+#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
+({ \
+ pte_t __pte; \
+ spin_lock(&__vma->vm_mm->page_table_lock); \
+ __pte = *(__ptep); \
+ set_pte(__ptep, __pteval); \
+ flush_tlb_page(__vma, __address); \
+ spin_unlock(&__vma->vm_mm->page_table_lock); \
+ __pte; \
+})
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CMPXCHG
+#define ptep_cmpxchg(__vma, __addr, __ptep, __oldval, __newval) \
+({ \
+ int __rc; \
+ spin_lock(&__vma->vm_mm->page_table_lock); \
+ __rc = pte_same(*(__ptep), __oldval); \
+ if (__rc) set_pte(__ptep, __newval); \
+ spin_unlock(&__vma->vm_mm->page_table_lock); \
+ __rc; \
+})
+#endif
+
+#ifndef __HAVE_ARCH_PGP_TEST_AND_POPULATE
+#define pgd_test_and_populate(__mm, __pgd, __pmd) \
+({ \
+ int __rc; \
+ spin_lock(&__mm->page_table_lock); \
+ __rc = !pgd_present(*(__pgd)); \
+ if (__rc) pgd_populate(__mm, __pgd, __pmd); \
+ spin_unlock(&__mm->page_table_lock); \
+ __rc; \
+})
+#endif
+
+#ifndef __HAVE_PMD_TEST_AND_POPULATE
+#define pmd_test_and_populate(__mm, __pmd, __page) \
+({ \
+ int __rc; \
+ spin_lock(&__mm->page_table_lock); \
+ __rc = !pmd_present(*(__pmd)); \
+ if (__rc) pmd_populate(__mm, __pmd, __page); \
+ spin_unlock(&__mm->page_table_lock); \
+ __rc; \
+})
+#endif
+
+#else
+
+#ifndef __HAVE_ARCH_PTEP_XCHG_FLUSH
+#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
+({ \
+ pte_t __pte = ptep_xchg((__vma)->vm_mm, __ptep, __pteval); \
+ flush_tlb_page(__vma, __address); \
+ __pte; \
+})
+
+#endif
+
+#endif
+
#endif /* _ASM_GENERIC_PGTABLE_H */
Index: linux-2.6.9-rc4/mm/rmap.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/rmap.c 2004-10-14 12:22:14.000000000 -0700
+++ linux-2.6.9-rc4/mm/rmap.c 2004-10-14 12:22:14.000000000 -0700
@@ -420,7 +420,10 @@
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the mm->page_table_lock if page
+ * is pointing to something that is known by the vm.
+ * The lock does not need to be held if page is pointing
+ * to a newly allocated page.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
@@ -562,11 +565,6 @@

/* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);

if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
@@ -576,10 +574,14 @@
*/
BUG_ON(!PageSwapCache(page));
swap_duplicate(entry);
- set_pte(pte, swp_entry_to_pte(entry));
+ pteval = ptep_xchg_flush(vma, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
- }
+ } else
+ pteval = ptep_clear_flush(vma, address, pte);

+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
atomic_dec(&mm->mm_rss);
page_remove_rmap(page);
page_cache_release(page);
@@ -666,15 +668,24 @@
if (ptep_clear_flush_young(vma, address, pte))
continue;

- /* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);
+ /*
+ * There would be a race here with the handle_mm_fault code that
+ * bypasses the page_table_lock to allow fast creation of ptes
+ * if we zapped the pte before putting something into it.
+ * On the other hand we need to capture the dirty flag from the
+ * value we replace. The dirty flag may be set by the processor,
+ * so we better use an atomic operation here.
+ */

/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
- set_pte(pte, pgoff_to_pte(page->index));
+ pteval = ptep_xchg_flush(vma, address, pte, pgoff_to_pte(page->index));
+ else
+ pteval = ptep_get_and_clear(pte);

- /* Move the dirty bit to the physical page now the pte is gone. */
+ /* Move the dirty bit to the physical page now that the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);

2004-10-15 19:11:43

by Christoph Lameter

Subject: page fault scalability patch V10: [6/7] x86_64 atomic pte operations

Changelog
* Provide atomic pte operations for x86_64

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.9-rc4/include/asm-x86_64/pgalloc.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-x86_64/pgalloc.h 2004-10-10 19:57:59.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-x86_64/pgalloc.h 2004-10-15 11:20:36.000000000 -0700
@@ -7,16 +7,26 @@
#include <linux/threads.h>
#include <linux/mm.h>

+#define PMD_NONE 0
+#define PGD_NONE 0
+
#define pmd_populate_kernel(mm, pmd, pte) \
set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
#define pgd_populate(mm, pgd, pmd) \
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pmd)))
+#define pgd_test_and_populate(mm, pgd, pmd) \
+ (cmpxchg((unsigned long *)pgd, PGD_NONE, _PAGE_TABLE | __pa(pmd)) == PGD_NONE)

static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
{
set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
}

+static inline int pmd_test_and_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+ return cmpxchg((unsigned long *)pmd, PMD_NONE, _PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)) == PMD_NONE;
+}
+
extern __inline__ pmd_t *get_pmd(void)
{
return (pmd_t *)get_zeroed_page(GFP_KERNEL);
Index: linux-2.6.9-rc4/include/asm-x86_64/pgtable.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-x86_64/pgtable.h 2004-10-10 19:58:23.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-x86_64/pgtable.h 2004-10-15 11:21:27.000000000 -0700
@@ -436,6 +436,11 @@
#define kc_offset_to_vaddr(o) \
(((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))

+
+#define ptep_xchg(addr,xp,newval) __pte(xchg(&(xp)->pte, pte_val(newval)))
+#define ptep_cmpxchg(mm,addr,xp,oldval,newval) (cmpxchg(&(xp)->pte, pte_val(oldval), pte_val(newval)) == pte_val(oldval))
+#define __HAVE_ARCH_ATOMIC_TABLE_OPS
+
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR

2004-10-15 19:08:59

by Christoph Lameter

Subject: page fault scalability patch V10: [4/7] cmpxchg for 386 and 486

Changelog
* Make cmpxchg and cmpxchg8b generally available on i386.
* Provide emulation of cmpxchg suitable for UP if built and
run on a 386.
* Provide emulation of cmpxchg8b suitable for UP if built
and run on a 386 or 486.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.9-rc4/include/asm-i386/system.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-i386/system.h 2004-10-10 19:56:39.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-i386/system.h 2004-10-14 11:32:35.000000000 -0700
@@ -240,7 +240,24 @@
*/

#ifdef CONFIG_X86_CMPXCHG
+
#define __HAVE_ARCH_CMPXCHG 1
+#define cmpxchg(ptr,o,n)\
+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
+ (unsigned long)(n),sizeof(*(ptr))))
+
+#else
+
+/*
+ * Building a kernel capable of running on an 80386. It may be necessary to
+ * emulate cmpxchg at run time on a real 80386 CPU.
+ */
+
+extern unsigned long cmpxchg_386(volatile void *, unsigned long, unsigned long, int);
+
+#define cmpxchg(ptr,o,n)\
+ ((__typeof__(*(ptr)))cmpxchg_386((ptr),(unsigned long)(o),\
+ (unsigned long)(n),sizeof(*(ptr))))
#endif

static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -270,10 +287,32 @@
return old;
}

-#define cmpxchg(ptr,o,n)\
- ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
- (unsigned long)(n),sizeof(*(ptr))))
-
+static inline unsigned long long __cmpxchg8b(volatile unsigned long long *ptr,
+ unsigned long long old, unsigned long long newv)
+{
+ unsigned long long prev;
+ __asm__ __volatile__(
+ LOCK_PREFIX "cmpxchg8b %4\n"
+ : "=A" (prev)
+ : "0" (old), "c" ((unsigned long)(newv >> 32)),
+ "b" ((unsigned long)(newv & 0xffffffffLL)), "m" (ptr)
+ : "memory");
+ return prev ;
+}
+
+#ifdef CONFIG_X86_CMPXCHG8B
+#define cmpxchg8b __cmpxchg8b
+#else
+/*
+ * Building a kernel capable of running on the 80486 and 80386. Neither
+ * supports cmpxchg8b, so call a function that emulates the
+ * instruction if necessary.
+ */
+extern unsigned long long cmpxchg8b_486(unsigned long long *,
+ unsigned long long, unsigned long long);
+#define cmpxchg8b cmpxchg8b_486
+#endif
+
#ifdef __KERNEL__
struct alt_instr {
__u8 *instr; /* original instruction */
Index: linux-2.6.9-rc4/arch/i386/Kconfig
===================================================================
--- linux-2.6.9-rc4.orig/arch/i386/Kconfig 2004-10-10 19:57:06.000000000 -0700
+++ linux-2.6.9-rc4/arch/i386/Kconfig 2004-10-14 11:32:35.000000000 -0700
@@ -345,6 +345,11 @@
depends on !M386
default y

+config X86_CMPXCHG8B
+ bool
+ depends on !M386 && !M486
+ default y
+
config X86_XADD
bool
depends on !M386
Index: linux-2.6.9-rc4/arch/i386/kernel/cpu/intel.c
===================================================================
--- linux-2.6.9-rc4.orig/arch/i386/kernel/cpu/intel.c 2004-10-10 19:57:16.000000000 -0700
+++ linux-2.6.9-rc4/arch/i386/kernel/cpu/intel.c 2004-10-14 11:32:35.000000000 -0700
@@ -415,5 +415,65 @@
return 0;
}

+#ifndef CONFIG_X86_CMPXCHG
+unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ unsigned long flags;
+ /*
+ * Check if the kernel was compiled for an old cpu but the
+ * currently running cpu can do cmpxchg after all.
+ * All CPUs except the 386 support cmpxchg.
+ */
+ if (cpu_data->x86 > 3) return __cmpxchg(ptr, old, new, size);
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ switch (size) {
+ case 1:
+ prev = * (u8 *)ptr;
+ if (prev == old) *(u8 *)ptr = new;
+ break;
+ case 2:
+ prev = * (u16 *)ptr;
+ if (prev == old) *(u16 *)ptr = new;
+ break;
+ case 4:
+ prev = *(u32 *)ptr;
+ if (prev == old) *(u32 *)ptr = new;
+ break;
+ }
+ local_irq_restore(flags);
+ return prev;
+}
+
+EXPORT_SYMBOL(cmpxchg_386);
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG8B
+unsigned long long cmpxchg8b_486(unsigned long long *ptr,
+ unsigned long long old, unsigned long long newv)
+{
+ unsigned long long prev;
+ unsigned long flags;
+
+ /*
+ * Check if the kernel was compiled for an old cpu but
+ * we are running really on a cpu capable of cmpxchg8b
+ */
+
+ if (cpu_has(cpu_data, X86_FEATURE_CX8)) return __cmpxchg8b(ptr, old, newv);
+
+ /* Poor man's cmpxchg8b for 386 and 486. Not suitable for SMP */
+ local_irq_save(flags);
+ prev = *ptr;
+ if (prev == old) *ptr = newv;
+ local_irq_restore(flags);
+ return prev;
+}
+
+EXPORT_SYMBOL(cmpxchg8b_486);
+#endif
+
// arch_initcall(intel_cpu_init);

2004-10-15 19:15:14

by Christoph Lameter

[permalink] [raw]
Subject: page fault scalability patch V10: [7/7] s/390 atomic pte operations

Changelog
* Provide atomic pte operations for s390

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.9-rc4/include/asm-s390/pgtable.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-s390/pgtable.h 2004-10-10 19:58:24.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-s390/pgtable.h 2004-10-14 12:22:14.000000000 -0700
@@ -567,6 +567,17 @@
return pte;
}

+#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
+({ \
+ struct mm_struct *__mm = __vma->vm_mm; \
+ pte_t __pte; \
+ spin_lock(&__mm->page_table_lock); \
+ __pte = ptep_clear_flush(__vma, __address, __ptep); \
+ set_pte(__ptep, __pteval); \
+ spin_unlock(&__mm->page_table_lock); \
+ __pte; \
+})
+
static inline void ptep_set_wrprotect(pte_t *ptep)
{
pte_t old_pte = *ptep;
@@ -778,6 +789,19 @@

#define kern_addr_valid(addr) (1)

+/* Atomic PTE operations */
+#define __HAVE_ARCH_ATOMIC_TABLE_OPS
+
+static inline pte_t ptep_xchg(struct mm_struct *mm, unsigned long address, pte_t *ptep, pte_t pteval)
+{
+ return __pte(xchg(ptep, pte_val(pteval)));
+}
+
+static inline int ptep_cmpxchg (struct mm_struct *mm, unsigned long address, pte_t *ptep, pte_t oldval, pte_t newval)
+{
+ return cmpxchg(ptep, pte_val(oldval), pte_val(newval)) == pte_val(oldval);
+}
+
/*
* No page table caches to initialise
*/
@@ -791,6 +815,7 @@
#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+#define __HAVE_ARCH_PTEP_XCHG_FLUSH
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define __HAVE_ARCH_PTEP_MKDIRTY
#define __HAVE_ARCH_PTE_SAME
Index: linux-2.6.9-rc4/include/asm-s390/pgalloc.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-s390/pgalloc.h 2004-10-10 19:58:06.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-s390/pgalloc.h 2004-10-14 12:22:14.000000000 -0700
@@ -97,6 +97,10 @@
pgd_val(*pgd) = _PGD_ENTRY | __pa(pmd);
}

+static inline int pgd_test_and_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd)
+{
+ return cmpxchg(pgd, _PAGE_TABLE_INV, _PGD_ENTRY | __pa(pmd)) == _PAGE_TABLE_INV;
+}
#endif /* __s390x__ */

static inline void
@@ -119,6 +123,18 @@
pmd_populate_kernel(mm, pmd, (pte_t *)((page-mem_map) << PAGE_SHIFT));
}

+static inline int
+pmd_test_and_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page)
+{
+ int rc;
+ spin_lock(&mm->page_table_lock);
+
+ rc=pte_same(*pmd, _PAGE_INVALID_EMPTY);
+ if (rc) pmd_populate(mm, pmd, page);
+ spin_unlock(&mm->page_table_lock);
+ return rc;
+}
+
/*
* page table entry allocation/free routines.
*/

2004-10-15 19:21:36

by Christoph Lameter

[permalink] [raw]
Subject: page fault scalability patch V10: [0/7] overview

Signed-off-by: Christoph Lameter <[email protected]>

Changes from V9->V10 of this patch:
- generic: fixes and updates
- S390: changes after feedback from Martin Schwidefsky
- x86_64: tested and now works fine.
- i386: stable
- ia64: stable. Added support for pte_locking necessary
for a planned parallelization of COW.

This is a series of patches that increases the scalability of
the page fault handler for SMP. Typical performance increases in the page
fault rate are:

2 CPUs -> 30%
4 CPUs -> 45%
8 CPUs -> 60%

With a high number of CPUs (16..512) we are seeing the page fault rate
roughly doubling.

The performance increase is accomplished by avoiding the use of the
page_table_lock spinlock (but not mm->mmap_sem!) through new atomic
operations on pte's (ptep_xchg, ptep_cmpxchg) and on pmd and pgd's
(pgd_test_and_populate, pmd_test_and_populate).
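
A condensed illustration of the pattern (a sketch of the do_anonymous_page()
changes in patch 2/7, not additional code): the pte value is read first, and
the update succeeds only if the entry is still unchanged, so a concurrent
modification simply makes the fault back out.

	pte_t orig_entry = *page_table;		/* snapshot taken before any checks */
	pte_t new_entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
					vma->vm_page_prot)), vma);

	if (!ptep_cmpxchg(vma, addr, page_table, orig_entry, new_entry)) {
		/* lost the race: somebody else installed a pte meanwhile */
		page_cache_release(page);
		return VM_FAULT_MINOR;
	}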

The page table lock can be avoided in the following situations:

1. An empty pte or pmd entry is populated

This is safe since the swapper may only depopulate them and the
swapper code has been changed to never set a pte to be empty until the
page has been evicted. The population of an empty pte is frequent
if a process touches newly allocated memory.

2. Modifications of flags in a pte entry (write/accessed).

These modifications are done by the CPU or by low level handlers
on various platforms that also bypass the page_table_lock, so this
seems to be safe too.

The patchset is composed of 7 patches:

1/7: Make mm->rss atomic

The page table lock is used to protect mm->rss and the first patch
makes rss atomic so that it may be changed without holding the
page_table_lock.
Generic atomic variables are only 32 bit under Linux. However, 32 bits
is sufficient for rss even on a 64 bit machine since rss counts pages,
which still allows up to 2^(31+12) = 8 terabytes of memory to be in use
by a single process. A 64 bit atomic would of course be better.
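
For illustration only, with the counter made atomic (it appears as mm->mm_rss
in the diffs later in this thread) the updates reduce to the usual atomic
primitives and need no lock:

	atomic_inc(&mm->mm_rss);	/* one more page mapped */
	atomic_dec(&mm->mm_rss);	/* one page unmapped */
	/* readers use atomic_read(&mm->mm_rss), also without any lock */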

2/7: Avoid page_table_lock in handle_mm_fault

This patch defers the acquisition of the page_table_lock as much as
possible and uses atomic operations for allocating anonymous memory.
These atomic operations are simulated by acquiring the page_table_lock
for very small time frames if an architecture does not define
__HAVE_ARCH_ATOMIC_TABLE_OPS. It also changes the swapper so that a
pte will not be set to empty if a page is in transition to swap.
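
For reference, the fallback looks roughly like the asm-generic/pgtable.h hunk
in the revised 2/7 patch further down in this thread: the compare-and-exchange
is emulated by briefly taking the page_table_lock.

	#define ptep_cmpxchg(__vma, __addr, __ptep, __oldval, __newval)	\
	({								\
		int __rc;						\
		spin_lock(&(__vma)->vm_mm->page_table_lock);		\
		__rc = pte_same(*(__ptep), __oldval);			\
		if (__rc)						\
			set_pte(__ptep, __newval);			\
		spin_unlock(&(__vma)->vm_mm->page_table_lock);		\
		__rc;							\
	})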

If only the first two patches are applied then the time that the page_table_lock
is held is simply reduced. The lock may then be acquired multiple
times during a page fault.

The remaining patches introduce the necessary atomic pte operations to avoid
the page_table_lock.

3/7: Atomic pte operations for ia64

4/7: Make cmpxchg generally available on i386

The atomic operations on the page table rely heavily on cmpxchg instructions.
This patch adds emulations for cmpxchg and cmpxchg8b for old 80386 and 80486
cpus. The emulations are only included if the kernel is built for these old
cpus, and they are bypassed in favor of the real cmpxchg instructions if a
kernel built for a 386 or 486 is then run on a more recent cpu.

This patch may be used independently of the other patches.
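
Callers use the normal cmpxchg() interface either way; for example, a simple
retry loop (illustrative only, not part of the patch) would be:

	/* increments *counter atomically on cmpxchg-capable CPUs; on a kernel
	 * built for 386 the call is routed to the UP-only cmpxchg_386() */
	static inline void ulong_inc(unsigned long *counter)
	{
		unsigned long old;

		do {
			old = *counter;
		} while (cmpxchg(counter, old, old + 1) != old);
	}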

5/7: Atomic pte operations for i386

The generally available cmpxchg (previous patch) is required for this patch to
preserve the ability to build kernels for 386 and 486.

6/7: Atomic pte operation for x86_64

7/7: Atomic pte operations for s390

2004-10-15 19:22:36

by Christoph Lameter

[permalink] [raw]
Subject: page fault scalability patch V10: [5/7] i386 atomic pte operations

Changelog
* Atomic pte operations for i386

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.9-rc4/include/asm-i386/pgtable.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-i386/pgtable.h 2004-10-10 19:58:24.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-i386/pgtable.h 2004-10-14 11:32:37.000000000 -0700
@@ -412,6 +412,7 @@
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define __HAVE_ARCH_PTEP_MKDIRTY
#define __HAVE_ARCH_PTE_SAME
+#define __HAVE_ARCH_ATOMIC_TABLE_OPS
#include <asm-generic/pgtable.h>

#endif /* _I386_PGTABLE_H */
Index: linux-2.6.9-rc4/include/asm-i386/pgtable-3level.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-i386/pgtable-3level.h 2004-10-10 19:58:41.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-i386/pgtable-3level.h 2004-10-14 11:32:37.000000000 -0700
@@ -6,7 +6,8 @@
* tables on PPro+ CPUs.
*
* Copyright (C) 1999 Ingo Molnar <[email protected]>
- */
+ * August 26, 2004 added ptep_cmpxchg and ptep_xchg <[email protected]>
+*/

#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
@@ -141,4 +142,26 @@
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })

+/* Atomic PTE operations */
+static inline pte_t ptep_xchg(struct mm_struct *mm, pte_t *ptep, pte_t newval)
+{
+ pte_t res;
+
+ /* xchg acts as a barrier before the setting of the high bits.
+ * (But we also have a cmpxchg8b. Why not use that? (cl))
+ */
+ res.pte_low = xchg(&ptep->pte_low, newval.pte_low);
+ res.pte_high = ptep->pte_high;
+ ptep->pte_high = newval.pte_high;
+
+ return res;
+}
+
+
+static inline int ptep_cmpxchg(struct mm_struct *mm, unsigned long address, pte_t *ptep, pte_t oldval, pte_t newval)
+{
+ return cmpxchg8b((unsigned long long *)ptep, pte_val(oldval), pte_val(newval)) == pte_val(oldval);
+}
+
+
#endif /* _I386_PGTABLE_3LEVEL_H */
Index: linux-2.6.9-rc4/include/asm-i386/pgtable-2level.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-i386/pgtable-2level.h 2004-10-10 19:58:05.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-i386/pgtable-2level.h 2004-10-14 11:32:37.000000000 -0700
@@ -82,4 +82,8 @@
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })

+/* Atomic PTE operations */
+#define ptep_xchg(mm,xp,a) __pte(xchg(&(xp)->pte_low, (a).pte_low))
+#define ptep_cmpxchg(mm,a,xp,oldpte,newpte) (cmpxchg(&(xp)->pte_low, (oldpte).pte_low, (newpte).pte_low)==(oldpte).pte_low)
+
#endif /* _I386_PGTABLE_2LEVEL_H */
Index: linux-2.6.9-rc4/include/asm-i386/pgalloc.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-i386/pgalloc.h 2004-10-10 19:57:02.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-i386/pgalloc.h 2004-10-14 11:32:37.000000000 -0700
@@ -7,6 +7,8 @@
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */

+#define PMD_NONE 0L
+
#define pmd_populate_kernel(mm, pmd, pte) \
set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))

@@ -16,6 +18,19 @@
((unsigned long long)page_to_pfn(pte) <<
(unsigned long long) PAGE_SHIFT)));
}
+
+/* Atomic version */
+static inline int pmd_test_and_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+#ifdef CONFIG_X86_PAE
+ return cmpxchg8b( ((unsigned long long *)pmd), PMD_NONE, _PAGE_TABLE +
+ ((unsigned long long)page_to_pfn(pte) <<
+ (unsigned long long) PAGE_SHIFT) ) == PMD_NONE;
+#else
+ return cmpxchg( (unsigned long *)pmd, PMD_NONE, _PAGE_TABLE + (page_to_pfn(pte) << PAGE_SHIFT)) == PMD_NONE;
+#endif
+}
+
/*
* Allocate and free page tables.
*/
@@ -49,6 +64,7 @@
#define pmd_free(x) do { } while (0)
#define __pmd_free_tlb(tlb,x) do { } while (0)
#define pgd_populate(mm, pmd, pte) BUG()
+#define pgd_test_and_populate(mm, pmd, pte) ({ BUG(); 1; })

#define check_pgt_cache() do { } while (0)

2004-10-15 19:21:05

by Christoph Lameter

[permalink] [raw]
Subject: page fault scalability patch V10: [3/7] IA64 atomic pte operations

Changelog
* Provide atomic pte operations for ia64
* pte lock operations
* Enhanced parallelism in page fault handler if applied together
with the generic patch

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.9-rc4/include/asm-ia64/pgalloc.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-ia64/pgalloc.h 2004-10-10 19:56:40.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-ia64/pgalloc.h 2004-10-14 11:32:38.000000000 -0700
@@ -34,6 +34,10 @@
#define pmd_quicklist (local_cpu_data->pmd_quick)
#define pgtable_cache_size (local_cpu_data->pgtable_cache_sz)

+/* Empty entries of PMD and PGD */
+#define PMD_NONE 0
+#define PGD_NONE 0
+
static inline pgd_t*
pgd_alloc_one_fast (struct mm_struct *mm)
{
@@ -78,12 +82,19 @@
preempt_enable();
}

+
static inline void
pgd_populate (struct mm_struct *mm, pgd_t *pgd_entry, pmd_t *pmd)
{
pgd_val(*pgd_entry) = __pa(pmd);
}

+/* Atomic populate */
+static inline int
+pgd_test_and_populate (struct mm_struct *mm, pgd_t *pgd_entry, pmd_t *pmd)
+{
+ return ia64_cmpxchg8_acq(pgd_entry,__pa(pmd), PGD_NONE) == PGD_NONE;
+}

static inline pmd_t*
pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
@@ -132,6 +143,13 @@
pmd_val(*pmd_entry) = page_to_phys(pte);
}

+/* Atomic populate */
+static inline int
+pmd_test_and_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
+{
+ return ia64_cmpxchg8_acq(pmd_entry, page_to_phys(pte), PMD_NONE) == PMD_NONE;
+}
+
static inline void
pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte)
{
Index: linux-2.6.9-rc4/include/asm-ia64/pgtable.h
===================================================================
--- linux-2.6.9-rc4.orig/include/asm-ia64/pgtable.h 2004-10-10 19:57:17.000000000 -0700
+++ linux-2.6.9-rc4/include/asm-ia64/pgtable.h 2004-10-14 12:21:06.000000000 -0700
@@ -30,6 +30,8 @@
#define _PAGE_P_BIT 0
#define _PAGE_A_BIT 5
#define _PAGE_D_BIT 6
+#define _PAGE_IG_BITS 53
+#define _PAGE_LOCK_BIT (_PAGE_IG_BITS+3) /* bit 56. Aligned to 8 bits */

#define _PAGE_P (1 << _PAGE_P_BIT) /* page present bit */
#define _PAGE_MA_WB (0x0 << 2) /* write back memory attribute */
@@ -58,6 +60,7 @@
#define _PAGE_PPN_MASK (((__IA64_UL(1) << IA64_MAX_PHYS_BITS) - 1) & ~0xfffUL)
#define _PAGE_ED (__IA64_UL(1) << 52) /* exception deferral */
#define _PAGE_PROTNONE (__IA64_UL(1) << 63)
+#define _PAGE_LOCK (__IA64_UL(1) << _PAGE_LOCK_BIT)

/* Valid only for a PTE with the present bit cleared: */
#define _PAGE_FILE (1 << 1) /* see swap & file pte remarks below */
@@ -270,6 +273,8 @@
#define pte_dirty(pte) ((pte_val(pte) & _PAGE_D) != 0)
#define pte_young(pte) ((pte_val(pte) & _PAGE_A) != 0)
#define pte_file(pte) ((pte_val(pte) & _PAGE_FILE) != 0)
+#define pte_locked(pte) ((pte_val(pte) & _PAGE_LOCK)!=0)
+
/*
* Note: we convert AR_RWX to AR_RX and AR_RW to AR_R by clearing the 2nd bit in the
* access rights:
@@ -281,8 +286,15 @@
#define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A))
#define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D))
#define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D))
+#define pte_mkunlocked(pte) (__pte(pte_val(pte) & ~_PAGE_LOCK))

/*
+ * Lock functions for pte's
+ */
+#define ptep_lock(ptep) test_and_set_bit(_PAGE_LOCK_BIT, ptep)
+#define ptep_unlock(ptep) { clear_bit(_PAGE_LOCK_BIT,ptep); smp_mb__after_clear_bit(); }
+#define ptep_unlock_set(ptep, val) set_pte(ptep, pte_mkunlocked(val))
+/*
* Macro to a page protection value as "uncacheable". Note that "protection" is really a
* misnomer here as the protection value contains the memory attribute bits, dirty bits,
* and various other bits as well.
@@ -342,7 +354,6 @@
#define pte_unmap_nested(pte) do { } while (0)

/* atomic versions of the some PTE manipulations: */
-
static inline int
ptep_test_and_clear_young (pte_t *ptep)
{
@@ -414,6 +425,18 @@
#endif
}

+static inline pte_t
+ptep_xchg (struct mm_struct *mm, pte_t *ptep, pte_t pteval)
+{
+ return __pte(xchg((long *) ptep, pteval.pte));
+}
+
+static inline int
+ptep_cmpxchg (struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t oldval, pte_t newval)
+{
+ return ia64_cmpxchg8_acq(&ptep->pte, newval.pte, oldval.pte) == oldval.pte;
+}
+
static inline int
pte_same (pte_t a, pte_t b)
{
@@ -558,6 +581,8 @@
#define __HAVE_ARCH_PTEP_MKDIRTY
#define __HAVE_ARCH_PTE_SAME
#define __HAVE_ARCH_PGD_OFFSET_GATE
+#define __HAVE_ARCH_ATOMIC_TABLE_OPS
+#define __HAVE_ARCH_LOCK_TABLE_OPS
#include <asm-generic/pgtable.h>

#endif /* _ASM_IA64_PGTABLE_H */

2004-10-15 21:52:07

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: page fault scalability patch V10: [2/7] defer/omit taking page_table_lock


Hi Christoph,

Nice work!

On Fri, Oct 15, 2004 at 12:04:53PM -0700, Christoph Lameter wrote:
> Changelog
> * Increase parallelism in SMP configurations by deferring
> the acquisition of page_table_lock in handle_mm_fault
> * Anonymous memory page faults bypass the page_table_lock
> through the use of atomic page table operations
> * Swapper does not set pte to empty in transition to swap
> * Simulate atomic page table operations using the
> page_table_lock if an arch does not define
> __HAVE_ARCH_ATOMIC_TABLE_OPS. This still provides
> a performance benefit since the page_table_lock
> is held for shorter periods of time.
>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> Index: linux-2.6.9-rc4/mm/memory.c
> ===================================================================
> --- linux-2.6.9-rc4.orig/mm/memory.c 2004-10-14 12:22:14.000000000 -0700
> +++ linux-2.6.9-rc4/mm/memory.c 2004-10-14 12:22:14.000000000 -0700
> @@ -1314,8 +1314,7 @@
> }
>
> /*
> - * We hold the mm semaphore and the page_table_lock on entry and
> - * should release the pagetable lock on exit..
> + * We hold the mm semaphore
> */
> static int do_swap_page(struct mm_struct * mm,
> struct vm_area_struct * vma, unsigned long address,
> @@ -1327,15 +1326,13 @@
> int ret = VM_FAULT_MINOR;
>
> pte_unmap(page_table);
> - spin_unlock(&mm->page_table_lock);
> page = lookup_swap_cache(entry);
> if (!page) {
> swapin_readahead(entry, address, vma);
> page = read_swap_cache_async(entry, vma, address);
> if (!page) {
> /*
> - * Back out if somebody else faulted in this pte while
> - * we released the page table lock.
> + * Back out if somebody else faulted in this pte
> */
> spin_lock(&mm->page_table_lock);
> page_table = pte_offset_map(pmd, address);

The comment above, which is a few lines down on do_swap_page() is now
bogus (the "while we released the page table lock").

/*
* Back out if somebody else faulted in this pte while we
* released the page table lock.
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
if (unlikely(!pte_same(*page_table, orig_pte))) {

> @@ -1406,14 +1403,12 @@
> }
>
> /*
> - * We are called with the MM semaphore and page_table_lock
> - * spinlock held to protect against concurrent faults in
> - * multithreaded programs.
> + * We are called with the MM semaphore held.
> */
> static int
> do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
> pte_t *page_table, pmd_t *pmd, int write_access,
> - unsigned long addr)
> + unsigned long addr, pte_t orig_entry)
> {
> pte_t entry;
> struct page * page = ZERO_PAGE(addr);
> @@ -1425,7 +1420,6 @@
> if (write_access) {
> /* Allocate our own private page. */
> pte_unmap(page_table);
> - spin_unlock(&mm->page_table_lock);
>
> if (unlikely(anon_vma_prepare(vma)))
> goto no_mem;
> @@ -1434,30 +1428,39 @@
> goto no_mem;
> clear_user_highpage(page, addr);
>
> - spin_lock(&mm->page_table_lock);
> + lock_page(page);

Question: Why do you need to hold the pagelock now?

I can't seem to figure that out myself.

> page_table = pte_offset_map(pmd, addr);
>
> - if (!pte_none(*page_table)) {
> - pte_unmap(page_table);
> - page_cache_release(page);
> - spin_unlock(&mm->page_table_lock);
> - goto out;
> - }
> - atomic_inc(&mm->mm_rss);
> entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
> vma->vm_page_prot)),
> vma);
> - lru_cache_add_active(page);
> mark_page_accessed(page);
> - page_add_anon_rmap(page, vma, addr);
> }
>
> - set_pte(page_table, entry);
> + /* update the entry */
> + if (!ptep_cmpxchg(vma, addr, page_table, orig_entry, entry)) {
> + if (write_access) {
> + pte_unmap(page_table);
> + unlock_page(page);
> + page_cache_release(page);
> + }
> + goto out;
> + }
> + if (write_access) {
> + /*
> + * The following two functions are safe to use without
> + * the page_table_lock but do they need to come before
> + * the cmpxchg?
> + */

They do need to come after AFAICS - from the point they are in the reverse map
and the page is on the LRU, try_to_unmap() can come in and try to
unmap the pte (now that we don't hold the page_table_lock anymore).

> + lru_cache_add_active(page);
> + page_add_anon_rmap(page, vma, addr);
> + atomic_inc(&mm->mm_rss);
> + unlock_page(page);
> + }
> pte_unmap(page_table);
>
> /* No need to invalidate - it was non-present before */
> update_mmu_cache(vma, addr, entry);
> - spin_unlock(&mm->page_table_lock);
> out:
> return VM_FAULT_MINOR;
> no_mem:
> @@ -1473,12 +1476,12 @@
> * As this is called only for pages that do not currently exist, we
> * do not need to flush old virtual caches or the TLB.
> *
> - * This is called with the MM semaphore held and the page table
> - * spinlock held. Exit with the spinlock released.
> + * This is called with the MM semaphore held.
> */
> static int
> do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> - unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
> + unsigned long address, int write_access, pte_t *page_table,
> + pmd_t *pmd, pte_t orig_entry)
> {
> struct page * new_page;
> struct address_space *mapping = NULL;
> @@ -1489,9 +1492,8 @@
>
> if (!vma->vm_ops || !vma->vm_ops->nopage)
> return do_anonymous_page(mm, vma, page_table,
> - pmd, write_access, address);
> + pmd, write_access, address, orig_entry);
> pte_unmap(page_table);
> - spin_unlock(&mm->page_table_lock);
>
> if (vma->vm_file) {
> mapping = vma->vm_file->f_mapping;
> @@ -1589,7 +1591,7 @@
> * nonlinear vmas.
> */
> static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
> - unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
> + unsigned long address, int write_access, pte_t *pte, pmd_t *pmd, pte_t entry)
> {
> unsigned long pgoff;
> int err;
> @@ -1602,13 +1604,12 @@
> if (!vma->vm_ops || !vma->vm_ops->populate ||
> (write_access && !(vma->vm_flags & VM_SHARED))) {
> pte_clear(pte);
> - return do_no_page(mm, vma, address, write_access, pte, pmd);
> + return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
> }
>
> pgoff = pte_to_pgoff(*pte);
>
> pte_unmap(pte);
> - spin_unlock(&mm->page_table_lock);
>
> err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
> if (err == -ENOMEM)
> @@ -1627,49 +1628,49 @@
> * with external mmu caches can use to update those (ie the Sparc or
> * PowerPC hashed page tables that act as extended TLBs).
> *
> - * Note the "page_table_lock". It is to protect against kswapd removing
> - * pages from under us. Note that kswapd only ever _removes_ pages, never
> - * adds them. As such, once we have noticed that the page is not present,
> - * we can drop the lock early.
> - *
> + * Note that kswapd only ever _removes_ pages, never adds them.
> + * We need to insure to handle that case properly.
> + *
> * The adding of pages is protected by the MM semaphore (which we hold),
> * so we don't need to worry about a page being suddenly been added into
> * our VM.
> - *
> - * We enter with the pagetable spinlock held, we are supposed to
> - * release it when done.
> */
> static inline int handle_pte_fault(struct mm_struct *mm,
> struct vm_area_struct * vma, unsigned long address,
> int write_access, pte_t *pte, pmd_t *pmd)
> {
> pte_t entry;
> + pte_t new_entry;
>
> entry = *pte;
> if (!pte_present(entry)) {
> /*
> * If it truly wasn't present, we know that kswapd
> * and the PTE updates will not touch it later. So
> - * drop the lock.
> + * no need to acquire the page_table_lock.
> */
> if (pte_none(entry))
> - return do_no_page(mm, vma, address, write_access, pte, pmd);
> + return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
> if (pte_file(entry))
> - return do_file_page(mm, vma, address, write_access, pte, pmd);
> + return do_file_page(mm, vma, address, write_access, pte, pmd, entry);
> return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
> }

I wonder what happens if kswapd, through try_to_unmap_one(), unmaps the
pte right here?

Aren't we going to proceed with the "pte_mkyoung(entry)" of a potentially
now unmapped pte? Isn't that case possible now?

2004-10-18 16:00:27

by Christoph Lameter

[permalink] [raw]
Subject: Re: page fault scalability patch V10: [2/7] defer/omit taking page_table_lock

On Fri, 15 Oct 2004, Marcelo Tosatti wrote:

> The comment above, which is a few lines down on do_swap_page() is now
> bogus (the "while we released the page table lock").

Ok. I also modified the second occurence of that comment.

> > if (write_access) {
> > /* Allocate our own private page. */
> > pte_unmap(page_table);
> > - spin_unlock(&mm->page_table_lock);
> >
> > if (unlikely(anon_vma_prepare(vma)))
> > goto no_mem;
> > @@ -1434,30 +1428,39 @@
> > goto no_mem;
> > clear_user_highpage(page, addr);
> >
> > - spin_lock(&mm->page_table_lock);
> > + lock_page(page);
>
> Question: Why do you need to hold the pagelock now?
>
> I can't seem to figure that out myself.

Hmm.. I cannot see a good reason for it either. The page is new
and thus no references can exist yet. Removed.

> > + if (write_access) {
> > + /*
> > + * The following two functions are safe to use without
> > + * the page_table_lock but do they need to come before
> > + * the cmpxchg?
> > + */
>
> They do need to come after AFAICS - from the point they are in the reverse map
> and the page is on the LRU, try_to_unmap() can come in and try to
> unmap the pte (now that we don't hold the page_table_lock anymore).

Ahh. Thanks.

> > - return do_no_page(mm, vma, address, write_access, pte, pmd);
> > + return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
> > if (pte_file(entry))
> > - return do_file_page(mm, vma, address, write_access, pte, pmd);
> > + return do_file_page(mm, vma, address, write_access, pte, pmd, entry);
> > return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
> > }
>
> I wonder what happens if kswapd, through try_to_unmap_one(), unmaps the
> pte right here?
>
> Aren't we going to proceed with the "pte_mkyoung(entry)" of a potentially
> now unmapped pte? Isn't that case possible now?

The pte value is saved in entry. If the pte is unmapped then its value is
changed. Thus the cmpxchg will fail and the page fault handler will return
without doing anything.

try_to_unmap_one was modified to handle the pte's in an atomic way using
ptep_xchg.
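
In other words (a condensed sketch of the handle_pte_fault() hunk in the
revised patch below), the flag update is only applied if the pte still holds
the value observed at fault time:

	entry = *pte;			/* value observed at fault time */
	new_entry = pte_mkyoung(entry);
	if (write_access)
		new_entry = pte_mkdirty(new_entry);

	/* if try_to_unmap_one() replaced the pte meanwhile, the cmpxchg
	 * fails and the fault simply returns; it is retaken if needed */
	if (ptep_cmpxchg(vma, address, pte, entry, new_entry))
		update_mmu_cache(vma, address, new_entry);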

2004-10-19 05:27:51

by Christoph Lameter

[permalink] [raw]
Subject: [revised] page fault scalability patch V10: [2/7] defer/omit taking page_table_lock

Here is an updated version following up on Marcelo's feedback....

Index: linux-2.6.9-final/mm/memory.c
===================================================================
--- linux-2.6.9-final.orig/mm/memory.c 2004-10-18 08:43:49.000000000 -0700
+++ linux-2.6.9-final/mm/memory.c 2004-10-18 09:00:10.000000000 -0700
@@ -1314,8 +1314,7 @@
}

/*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We hold the mm semaphore
*/
static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
@@ -1327,15 +1326,13 @@
int ret = VM_FAULT_MINOR;

pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte while
- * we released the page table lock.
+ * Back out if somebody else faulted in this pte
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1358,8 +1355,7 @@
lock_page(page);

/*
- * Back out if somebody else faulted in this pte while we
- * released the page table lock.
+ * Back out if somebody else faulted in this pte
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1406,14 +1402,12 @@
}

/*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
+ * We are called with the MM semaphore held.
*/
static int
do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+ unsigned long addr, pte_t orig_entry)
{
pte_t entry;
struct page * page = ZERO_PAGE(addr);
@@ -1425,7 +1419,6 @@
if (write_access) {
/* Allocate our own private page. */
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
@@ -1434,30 +1427,36 @@
goto no_mem;
clear_user_highpage(page, addr);

- spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, addr);

- if (!pte_none(*page_table)) {
- pte_unmap(page_table);
- page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
- goto out;
- }
- atomic_inc(&mm->mm_rss);
entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
vma->vm_page_prot)),
vma);
- lru_cache_add_active(page);
mark_page_accessed(page);
- page_add_anon_rmap(page, vma, addr);
}

- set_pte(page_table, entry);
+ /* update the entry */
+ if (!ptep_cmpxchg(vma, addr, page_table, orig_entry, entry)) {
+ if (write_access) {
+ pte_unmap(page_table);
+ page_cache_release(page);
+ }
+ goto out;
+ }
+ if (write_access) {
+ /*
+ * These two functions must come after the cmpxchg
+ * because if the page is on the LRU then try_to_unmap may come
+ * in and unmap the pte.
+ */
+ lru_cache_add_active(page);
+ page_add_anon_rmap(page, vma, addr);
+ atomic_inc(&mm->mm_rss);
+ }
pte_unmap(page_table);

/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
- spin_unlock(&mm->page_table_lock);
out:
return VM_FAULT_MINOR;
no_mem:
@@ -1473,12 +1472,12 @@
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * This is called with the MM semaphore held.
*/
static int
do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+ unsigned long address, int write_access, pte_t *page_table,
+ pmd_t *pmd, pte_t orig_entry)
{
struct page * new_page;
struct address_space *mapping = NULL;
@@ -1489,9 +1488,8 @@

if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
+ pmd, write_access, address, orig_entry);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);

if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -1589,7 +1587,7 @@
* nonlinear vmas.
*/
static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+ unsigned long address, int write_access, pte_t *pte, pmd_t *pmd, pte_t entry)
{
unsigned long pgoff;
int err;
@@ -1602,13 +1600,12 @@
if (!vma->vm_ops || !vma->vm_ops->populate ||
(write_access && !(vma->vm_flags & VM_SHARED))) {
pte_clear(pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
}

pgoff = pte_to_pgoff(*pte);

pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);

err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
@@ -1627,49 +1624,49 @@
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
+ * Note that kswapd only ever _removes_ pages, never adds them.
+ * We need to ensure that this case is handled properly.
+ *
* The adding of pages is protected by the MM semaphore (which we hold),
* so we don't need to worry about a page being suddenly been added into
* our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t *pte, pmd_t *pmd)
{
pte_t entry;
+ pte_t new_entry;

entry = *pte;
if (!pte_present(entry)) {
/*
* If it truly wasn't present, we know that kswapd
* and the PTE updates will not touch it later. So
- * drop the lock.
+ * no need to acquire the page_table_lock.
*/
if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
+ return do_file_page(mm, vma, address, write_access, pte, pmd, entry);
return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
}

+ /*
+ * This is the case in which we may only update some bits in the pte.
+ */
+ new_entry = pte_mkyoung(entry);
if (write_access) {
- if (!pte_write(entry))
+ if (!pte_write(entry)) {
+ /* do_wp_page expects us to hold the page_table_lock */
+ spin_lock(&mm->page_table_lock);
return do_wp_page(mm, vma, address, pte, pmd, entry);
-
- entry = pte_mkdirty(entry);
+ }
+ new_entry = pte_mkdirty(new_entry);
}
- entry = pte_mkyoung(entry);
- ptep_set_access_flags(vma, address, pte, entry, write_access);
- update_mmu_cache(vma, address, entry);
+ if (ptep_cmpxchg(vma, address, pte, entry, new_entry))
+ update_mmu_cache(vma, address, new_entry);
pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
return VM_FAULT_MINOR;
}

@@ -1687,22 +1684,42 @@

inc_page_state(pgfault);

- if (is_vm_hugetlb_page(vma))
+ if (unlikely(is_vm_hugetlb_page(vma)))
return VM_FAULT_SIGBUS; /* mapping truncation does this. */

/*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
+ * We rely on the mmap_sem and the SMP-safe atomic PTE updates
+ * to synchronize with kswapd.
*/
- spin_lock(&mm->page_table_lock);
- pmd = pmd_alloc(mm, pgd, address);
+ if (unlikely(pgd_none(*pgd))) {
+ pmd_t *new = pmd_alloc_one(mm, address);
+ if (!new) return VM_FAULT_OOM;
+
+ /* Ensure that the update is done atomically */
+ if (!pgd_test_and_populate(mm, pgd, new)) pmd_free(new);
+ }
+
+ pmd = pmd_offset(pgd, address);
+
+ if (likely(pmd)) {
+ pte_t *pte;
+
+ if (!pmd_present(*pmd)) {
+ struct page *new;

- if (pmd) {
- pte_t * pte = pte_alloc_map(mm, pmd, address);
- if (pte)
+ new = pte_alloc_one(mm, address);
+ if (!new) return VM_FAULT_OOM;
+
+ if (!pmd_test_and_populate(mm, pmd, new))
+ pte_free(new);
+ else
+ inc_page_state(nr_page_table_pages);
+ }
+
+ pte = pte_offset_map(pmd, address);
+ if (likely(pte))
return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
}
- spin_unlock(&mm->page_table_lock);
return VM_FAULT_OOM;
}

Index: linux-2.6.9-final/include/asm-generic/pgtable.h
===================================================================
--- linux-2.6.9-final.orig/include/asm-generic/pgtable.h 2004-10-15 20:02:39.000000000 -0700
+++ linux-2.6.9-final/include/asm-generic/pgtable.h 2004-10-18 08:43:56.000000000 -0700
@@ -134,4 +134,75 @@
#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr)
#endif

+#ifndef __HAVE_ARCH_ATOMIC_TABLE_OPS
+/*
+ * If atomic page table operations are not available then use
+ * the page_table_lock to insure some form of locking.
+ * Note thought that low level operations as well as the
+ * page_table_handling of the cpu may bypass all locking.
+ */
+
+#ifndef __HAVE_ARCH_PTEP_XCHG_FLUSH
+#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
+({ \
+ pte_t __pte; \
+ spin_lock(&__vma->vm_mm->page_table_lock); \
+ __pte = *(__ptep); \
+ set_pte(__ptep, __pteval); \
+ flush_tlb_page(__vma, __address); \
+ spin_unlock(&__vma->vm_mm->page_table_lock); \
+ __pte; \
+})
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CMPXCHG
+#define ptep_cmpxchg(__vma, __addr, __ptep, __oldval, __newval) \
+({ \
+ int __rc; \
+ spin_lock(&__vma->vm_mm->page_table_lock); \
+ __rc = pte_same(*(__ptep), __oldval); \
+ if (__rc) set_pte(__ptep, __newval); \
+ spin_unlock(&__vma->vm_mm->page_table_lock); \
+ __rc; \
+})
+#endif
+
+#ifndef __HAVE_ARCH_PGP_TEST_AND_POPULATE
+#define pgd_test_and_populate(__mm, __pgd, __pmd) \
+({ \
+ int __rc; \
+ spin_lock(&__mm->page_table_lock); \
+ __rc = !pgd_present(*(__pgd)); \
+ if (__rc) pgd_populate(__mm, __pgd, __pmd); \
+ spin_unlock(&__mm->page_table_lock); \
+ __rc; \
+})
+#endif
+
+#ifndef __HAVE_PMD_TEST_AND_POPULATE
+#define pmd_test_and_populate(__mm, __pmd, __page) \
+({ \
+ int __rc; \
+ spin_lock(&__mm->page_table_lock); \
+ __rc = !pmd_present(*(__pmd)); \
+ if (__rc) pmd_populate(__mm, __pmd, __page); \
+ spin_unlock(&__mm->page_table_lock); \
+ __rc; \
+})
+#endif
+
+#else
+
+#ifndef __HAVE_ARCH_PTEP_XCHG_FLUSH
+#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
+({ \
+ pte_t __pte = ptep_xchg((__vma)->vm_mm, __ptep, __pteval); \
+ flush_tlb_page(__vma, __address); \
+ __pte; \
+})
+
+#endif
+
+#endif
+
#endif /* _ASM_GENERIC_PGTABLE_H */
Index: linux-2.6.9-final/mm/rmap.c
===================================================================
--- linux-2.6.9-final.orig/mm/rmap.c 2004-10-18 08:43:49.000000000 -0700
+++ linux-2.6.9-final/mm/rmap.c 2004-10-18 08:43:56.000000000 -0700
@@ -420,7 +420,10 @@
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the mm->page_table_lock if page
+ * is pointing to something that is known by the vm.
+ * The lock does not need to be held if page is pointing
+ * to a newly allocated page.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
@@ -562,11 +565,6 @@

/* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);

if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
@@ -576,10 +574,14 @@
*/
BUG_ON(!PageSwapCache(page));
swap_duplicate(entry);
- set_pte(pte, swp_entry_to_pte(entry));
+ pteval = ptep_xchg_flush(vma, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
- }
+ } else
+ pteval = ptep_clear_flush(vma, address, pte);

+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
atomic_dec(&mm->mm_rss);
page_remove_rmap(page);
page_cache_release(page);
@@ -666,15 +668,24 @@
if (ptep_clear_flush_young(vma, address, pte))
continue;

- /* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);
+ /*
+ * There would be a race with the handle_mm_fault code, which
+ * bypasses the page_table_lock to allow fast creation of ptes,
+ * if we zapped the pte before
+ * putting something into it. We also need the dirty flag
+ * from the value we replaced.
+ * The dirty flag may be set by the processor, so we had better
+ * use an atomic operation here.
+ */

/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
- set_pte(pte, pgoff_to_pte(page->index));
+ pteval = ptep_xchg_flush(vma, address, pte, pgoff_to_pte(page->index));
+ else
+ pteval = ptep_get_and_clear(pte);

- /* Move the dirty bit to the physical page now the pte is gone. */
+ /* Move the dirty bit to the physical page now that the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);