2002-08-06 23:11:43

by William Lee Irwin III

[permalink] [raw]
Subject: fix CONFIG_HIGHPTE

Minimalistic fix. Perhaps rough at the edges but I can clean the
ugliness ppl care about when they complain. 2.5.30 successfully booted
& ran userspace on a 16-way NUMA-Q with 16GB of RAM with this patch
and CONFIG_HIGHPTE enabled.



Cheers,
Bill


===== arch/i386/config.in 1.44 vs edited =====
--- 1.44/arch/i386/config.in Thu Jul 25 14:02:05 2002
+++ edited/arch/i386/config.in Fri Aug 2 22:56:10 2002
@@ -194,6 +194,10 @@
define_bool CONFIG_X86_PAE y
fi

+if [ "$CONFIG_HIGHMEM4G" = "y" -o "$CONFIG_HIGHMEM64G" = "y" ]; then
+ bool 'Allocate 3rd-level pagetables from highmem' CONFIG_HIGHPTE
+fi
+
bool 'Math emulation' CONFIG_MATH_EMULATION
bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR

===== include/asm-generic/rmap.h 1.2 vs edited =====
--- 1.2/include/asm-generic/rmap.h Tue Jul 16 14:46:30 2002
+++ edited/include/asm-generic/rmap.h Fri Aug 2 23:25:57 2002
@@ -39,16 +39,30 @@

static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
{
- struct page * page = virt_to_page(ptep);
+ struct page * page = kmap_to_page(ptep);
return (struct mm_struct *) page->mapping;
}

static inline unsigned long ptep_to_address(pte_t * ptep)
{
- struct page * page = virt_to_page(ptep);
+ struct page * page = kmap_to_page(ptep);
unsigned long low_bits;
low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
return page->index + low_bits;
}
+
+#if CONFIG_HIGHPTE
+static inline pte_addr_t ptep_to_paddr(pte_t *ptep)
+{
+ pte_addr_t paddr;
+ paddr = ((pte_addr_t)page_to_pfn(kmap_to_page(ptep))) << PAGE_SHIFT;
+ return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK);
+}
+#else
+static inline pte_addr_t ptep_to_paddr(pte_t *ptep)
+{
+ return (pte_addr_t)ptep;
+}
+#endif

#endif /* _GENERIC_RMAP_H */
===== include/asm-i386/fixmap.h 1.5 vs edited =====
--- 1.5/include/asm-i386/fixmap.h Thu Mar 14 02:11:25 2002
+++ edited/include/asm-i386/fixmap.h Fri Aug 2 23:32:53 2002
@@ -103,6 +103,7 @@
#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)

#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - (x)) >> PAGE_SHIFT)

extern void __this_fixmap_does_not_exist(void);

@@ -126,6 +127,12 @@
__this_fixmap_does_not_exist();

return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
}

#endif
===== include/asm-i386/highmem.h 1.7 vs edited =====
--- 1.7/include/asm-i386/highmem.h Wed Jun 5 01:48:57 2002
+++ edited/include/asm-i386/highmem.h Fri Aug 2 23:06:52 2002
@@ -122,6 +122,19 @@
preempt_enable();
}

+static inline struct page *kmap_to_page(void *ptr)
+{
+ unsigned long idx, vaddr = (unsigned long)ptr;
+ pte_t *pte;
+
+ if (vaddr < FIXADDR_START)
+ return virt_to_page(ptr);
+
+ idx = virt_to_fix(vaddr);
+ pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
+ return pte_page(*pte);
+}
+
#endif /* __KERNEL__ */

#endif /* _ASM_HIGHMEM_H */
===== include/asm-i386/kmap_types.h 1.8 vs edited =====
--- 1.8/include/asm-i386/kmap_types.h Sun Jun 16 15:50:19 2002
+++ edited/include/asm-i386/kmap_types.h Fri Aug 2 22:26:12 2002
@@ -19,7 +19,8 @@
D(6) KM_BIO_DST_IRQ,
D(7) KM_PTE0,
D(8) KM_PTE1,
-D(9) KM_TYPE_NR
+D(9) KM_PTE2,
+D(10) KM_TYPE_NR
};

#undef D
===== include/asm-i386/pgtable.h 1.17 vs edited =====
--- 1.17/include/asm-i386/pgtable.h Mon Jun 17 20:14:46 2002
+++ edited/include/asm-i386/pgtable.h Fri Aug 2 23:19:13 2002
@@ -264,6 +265,27 @@
((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address))
#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
+
+#if CONFIG_HIGHPTE
+#define rmap_ptep_map(pte_paddr) \
+({ \
+ unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); \
+ unsigned long idx = __pte_offset(((unsigned long)pte_paddr)); \
+ (pte_t *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + idx; \
+})
+
+#define rmap_ptep_unmap(pte) kunmap_atomic(pte, KM_PTE2)
+#else /* !CONFIG_HIGHPTE */
+static inline rmap_ptep_map(pte_addr_t pte_paddr)
+{
+ return (pte_t *)pte_paddr;
+}
+
+static inline rmap_ptep_unmap(pte_t *pte)
+{
+ return;
+}
+#endif /* !CONFIG_HIGHPTE */

/*
* The i386 doesn't have any external MMU info: the kernel page
===== include/linux/mm.h 1.66 vs edited =====
--- 1.66/include/linux/mm.h Thu Aug 1 12:30:06 2002
+++ edited/include/linux/mm.h Fri Aug 2 22:24:40 2002
@@ -161,7 +161,7 @@
union {
struct pte_chain * chain; /* Reverse pte mapping pointer.
* protected by PG_chainlock */
- pte_t * direct;
+ pte_addr_t direct;
} pte;
unsigned long private; /* mapping-private opaque data */

===== include/linux/types.h 1.4 vs edited =====
--- 1.4/include/linux/types.h Tue Jun 11 18:51:43 2002
+++ edited/include/linux/types.h Fri Aug 2 23:23:46 2002
@@ -11,6 +11,12 @@
#include <linux/posix_types.h>
#include <asm/types.h>

+#if CONFIG_HIGHPTE
+typedef u64 pte_addr_t;
+#else
+typedef pte_t *pte_addr_t;
+#endif
+
#ifndef __KERNEL_STRICT_NAMES

typedef __kernel_fd_set fd_set;
===== mm/rmap.c 1.7 vs edited =====
--- 1.7/mm/rmap.c Wed Jul 31 02:58:53 2002
+++ edited/mm/rmap.c Fri Aug 2 23:29:10 2002
@@ -49,7 +49,7 @@
*/
struct pte_chain {
struct pte_chain * next;
- pte_t * ptep;
+ pte_addr_t ptep;
};

static kmem_cache_t *pte_chain_cache;
@@ -74,13 +74,17 @@
referenced++;

if (PageDirect(page)) {
- if (ptep_test_and_clear_young(page->pte.direct))
+ pte_t *pte = rmap_ptep_map(page->pte.direct);
+ if (ptep_test_and_clear_young(pte))
referenced++;
+ rmap_ptep_unmap(pte);
} else {
/* Check all the page tables mapping this page. */
for (pc = page->pte.chain; pc; pc = pc->next) {
- if (ptep_test_and_clear_young(pc->ptep))
+ pte_t *pte = rmap_ptep_map(pc->ptep);
+ if (ptep_test_and_clear_young(pte))
referenced++;
+ rmap_ptep_unmap(pte);
}
}
return referenced;
@@ -97,7 +101,8 @@
void page_add_rmap(struct page * page, pte_t * ptep)
{
struct pte_chain * pte_chain;
- unsigned long pfn = pte_pfn(*ptep);
+ unsigned long pfn = page_to_pfn(page);
+ pte_addr_t pte_paddr = ptep_to_paddr(ptep);

#ifdef DEBUG_RMAP
if (!page || !ptep)
@@ -112,6 +117,9 @@
return;

#ifdef DEBUG_RMAP
+ /*
+ * This stuff needs help to get up to highmem speed.
+ */
pte_chain_lock(page);
{
struct pte_chain * pc;
@@ -141,11 +149,11 @@
if (page->pte.chain) {
/* Hook up the pte_chain to the page. */
pte_chain = pte_chain_alloc();
- pte_chain->ptep = ptep;
+ pte_chain->ptep = pte_paddr;
pte_chain->next = page->pte.chain;
page->pte.chain = pte_chain;
} else {
- page->pte.direct = ptep;
+ page->pte.direct = pte_paddr;
SetPageDirect(page);
}

@@ -167,6 +175,7 @@
{
struct pte_chain * pc, * prev_pc = NULL;
unsigned long pfn = page_to_pfn(page);
+ pte_addr_t paddr = ptep_to_paddr(ptep);

if (!page || !ptep)
BUG();
@@ -176,14 +185,14 @@
pte_chain_lock(page);

if (PageDirect(page)) {
- if (page->pte.direct == ptep) {
- page->pte.direct = NULL;
+ if (page->pte.direct == paddr) {
+ page->pte.direct = (pte_addr_t)NULL;
ClearPageDirect(page);
goto out;
}
} else {
for (pc = page->pte.chain; pc; prev_pc = pc, pc = pc->next) {
- if (pc->ptep == ptep) {
+ if (pc->ptep == paddr) {
pte_chain_free(pc, prev_pc, page);
/* Check whether we can convert to direct */
pc = page->pte.chain;
@@ -211,8 +220,8 @@
#endif

out:
- dec_page_state(nr_reverse_maps);
pte_chain_unlock(page);
+ dec_page_state(nr_reverse_maps);
return;
}

@@ -230,9 +239,10 @@
* pte_chain_lock page_launder()
* mm->page_table_lock try_to_unmap_one(), trylock
*/
-static int FASTCALL(try_to_unmap_one(struct page *, pte_t *));
-static int try_to_unmap_one(struct page * page, pte_t * ptep)
+static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
+static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
{
+ pte_t *ptep = rmap_ptep_map(paddr);
unsigned long address = ptep_to_address(ptep);
struct mm_struct * mm = ptep_to_mm(ptep);
struct vm_area_struct * vma;
@@ -246,8 +256,11 @@
* We need the page_table_lock to protect us from page faults,
* munmap, fork, etc...
*/
- if (!spin_trylock(&mm->page_table_lock))
+ if (!spin_trylock(&mm->page_table_lock)) {
+ rmap_ptep_unmap(ptep);
return SWAP_AGAIN;
+ }
+

/* During mremap, it's possible pages are not in a VMA. */
vma = find_vma(mm, address);
@@ -284,6 +297,7 @@
ret = SWAP_SUCCESS;

out_unlock:
+ rmap_ptep_unmap(ptep);
spin_unlock(&mm->page_table_lock);
return ret;
}


2002-08-07 00:42:38

by Andrew Morton

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

William Lee Irwin III wrote:
>
> Minimalistic fix. Perhaps rough at the edges but I can clean the
> ugliness ppl care about when they complain. 2.5.30 successfully booted
> & ran userspace on a 16-way NUMA-Q with 16GB of RAM with this patch
> and CONFIG_HIGHPTE enabled.

Thanks, Bill. It doesn't seem any uglier than anything else highmem-related.

> ...
> +#define rmap_ptep_map(pte_paddr) \
> +({ \
> + unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); \
> + unsigned long idx = __pte_offset(((unsigned long)pte_paddr)); \
> + (pte_t *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + idx; \
> +})

Could be an inline?

> +static inline rmap_ptep_map(pte_addr_t pte_paddr)
> +{
> + return (pte_t *)pte_paddr;
> +}

Better try compiling that ;)

> ...
> --- 1.66/include/linux/mm.h Thu Aug 1 12:30:06 2002
> +++ edited/include/linux/mm.h Fri Aug 2 22:24:40 2002
> @@ -161,7 +161,7 @@
> union {
> struct pte_chain * chain; /* Reverse pte mapping pointer.
> * protected by PG_chainlock */
> - pte_t * direct;
> + pte_addr_t direct;
> } pte;

Four more bytes into struct page. I bet that hurt.

> ...
> struct pte_chain {
> struct pte_chain * next;
> - pte_t * ptep;
> + pte_addr_t ptep;
> };

We'll get fifteen pte_addr_t's per pte_chain on a P4 with the
array-of-pteps-per-pte_chain patch.

And we'll need that, to reduce load on KM_PTECHAIN. Because
there's no point in pte_highmem without also having pte_chain_highmem,
yes?

Which means either going back to a custom allocator or teaching
slab about highmem and kmap_atomic. (Probably a custom allocator;
internal fragmentation on 32/64/128 byte pte_chains won't be tooooo
bad, presumably).

We're piling more and more crap in there to support these pte_chains.
How much is too much?

Is it likely that large pages and/or shared pagetables would allow us to
place pagetables and pte_chains in the direct-mapped region, avoid all
this?

2002-08-07 00:47:25

by Rik van Riel

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Tue, 6 Aug 2002, Andrew Morton wrote:

> Is it likely that large pages and/or shared pagetables would allow us to
> place pagetables and pte_chains in the direct-mapped region, avoid all
> this?

For all workloads we care about, yes.

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-07 01:08:29

by Anton Blanchard

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE


> We're piling more and more crap in there to support these pte_chains.
> How much is too much?
>
> Is it likely that large pages and/or shared pagetables would allow us to
> place pagetables and pte_chains in the direct-mapped region, avoid all
> this?

On ppc64 shared pagetables will require significant changes to the way
we handle the hardware hashtable. So add that to the "more and more crap
in there to support these pte_chains"

Will shared pagetables be a requirement or can we turn it on per arch?

Anton

2002-08-07 01:28:10

by William Lee Irwin III

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Tue, 6 Aug 2002, Andrew Morton wrote:
>> Is it likely that large pages and/or shared pagetables would allow us to
>> place pagetables and pte_chains in the direct-mapped region, avoid all
>> this?

On Tue, Aug 06, 2002 at 09:50:50PM -0300, Rik van Riel wrote:
> For all workloads we care about, yes.
> regards,
> Rik

Not the university workload. NFI what my employer thinks of it, but I
care about it for the sake of correctness in all cases.

Lynch me now.


Cheers,
Bill

2002-08-07 01:51:53

by William Lee Irwin III

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

At some point in the past, Andrew Morton wrote:
>> We're piling more and more crap in there to support these pte_chains.
>> How much is too much?
>> Is it likely that large pages and/or shared pagetables would allow us to
>> place pagetables and pte_chains in the direct-mapped region, avoid all
>> this?

On Wed, Aug 07, 2002 at 11:07:52AM +1000, Anton Blanchard wrote:
> On ppc64 shared pagetables will require significant changes to the way
> we handle the hardware hashtable. So add that to the "more and more crap
> in there to support these pte_chains"
> Will shared pagetables be a requirement or can we turn it on per arch?
> Anton

Actually shared pagetables require significant semantic changes in rmap,
e.g. every usage of ptep_to_mm() is broken by shared pagetables and
tracking down assumptions that the (pte, mm) relation is 1:1 is ugly
too. The existing patch for it is not prepared to cope with these.

If they're not already sitting in a back room in ozlabs or Austin
somewhere I'll ship the 3 or 4 singletask 64-bit pagetable OOM's to LTP
etc. to help dispel the 32-bit pagetable space myth, too.


Cheers,
Bill

2002-08-07 02:44:00

by Andrew Morton

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

Anton Blanchard wrote:
>
>
> > We're piling more and more crap in there to support these pte_chains.
> > How much is too much?
> >
> > Is it likely that large pages and/or shared pagetables would allow us to
> > place pagetables and pte_chains in the direct-mapped region, avoid all
> > this?
>
> On ppc64 shared pagetables will require significant changes to the way
> we handle the hardware hashtable. So add that to the "more and more crap
> in there to support these pte_chains"

Last I heard, pagetable sharing wasn't working out too well
because they all get unshared.

> Will shared pagetables be a requirement or can we turn it on per arch?

It's doubtful if per-arch would be an option.

How about this?

- We rely on large pages to solve the Oracle problem

- I'll do pte_chain_highmem and keep that and Bill's patch under test
in my tree on a wait-and-see basis. Could go ahead and submit it
but it's all more complexity, and it'd be nice to actually pull
something out for a change.

- We'll continue to suck for the University workload.

2002-08-07 03:57:16

by Rik van Riel

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Tue, 6 Aug 2002, William Lee Irwin III wrote:
> On Tue, 6 Aug 2002, Andrew Morton wrote:
> >> Is it likely that large pages and/or shared pagetables would allow us to
> >> place pagetables and pte_chains in the direct-mapped region, avoid all
> >> this?
>
> On Tue, Aug 06, 2002 at 09:50:50PM -0300, Rik van Riel wrote:
> > For all workloads we care about, yes.
>
> Not the university workload. NFI what my employer thinks of it, but I
> care about it for the sake of correctness in all cases.
>
> Lynch me now.

I agree with you, but you'll also have to confess that keeping
pagetables around at all (whether it's in highmem or not) will
potentially be a disaster for the university workload.

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-07 04:04:04

by Rik van Riel

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Wed, 7 Aug 2002, Anton Blanchard wrote:

> On ppc64 shared pagetables will require significant changes to the way
> we handle the hardware hashtable. So add that to the "more and more crap
> in there to support these pte_chains"
>
> Will shared pagetables be a requirement or can we turn it on per arch?

Sharing the logical page table doesn't mean you'll have to do
the same for the PPC hashed page table...

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-07 04:08:53

by William Lee Irwin III

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Tue, Aug 06, 2002 at 09:50:50PM -0300, Rik van Riel wrote:
>>> For all workloads we care about, yes.

On Tue, 6 Aug 2002, William Lee Irwin III wrote:
>> Not the university workload. NFI what my employer thinks of it, but I
>> care about it for the sake of correctness in all cases.
>> Lynch me now.

On Wed, Aug 07, 2002 at 01:00:28AM -0300, Rik van Riel wrote:
> I agree with you, but you'll also have to confess that keeping
> pagetables around at all (whether it's in highmem or not) will
> potentially be a disaster for the university workload.
> regards,
> Rik

That's pretty much the point of it, yes.

... coincidentally, this is also needed to properly handle the vastness
of 64-bit address spaces in comparison to physical memory.


Cheers,
Bill

2002-08-07 04:19:18

by Anton Blanchard

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE


> Sharing the logical page table doesn't mean you'll have to do
> the same for the PPC hashed page table...

We have an optimisation where we store information in the linux pte that
lets us find the hashtable pte. If that one to one relationship is lost
we may need to search the primary and secondary group which could be
up to 16 hypervisor calls when running in logical partitioned mode.

We could start sharing hashtable ptes but that's going to require a fair
amount of work.

Anton

2002-08-07 05:15:46

by Martin J. Bligh

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

> And we'll need that, to reduce load on KM_PTECHAIN. Because
> there's no point in pte_highmem without also having pte_chain_highmem,
> yes?

I'm not sure I agree that there's no point. If we shove half the
overhead into highmem (well, maybe 1/3 depending if on your PTE size),
we can fit a workload double the size. Not to be sniffed at. 50% of
the benefit at 5% of the cost.

No, it doesn't completely solve the problem, but it's another hammer
to give it a good sturdy whack over the head with.

M.

2002-08-08 14:42:05

by Ingo Oeser

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

Hi,

On Tue, Aug 06, 2002 at 07:57:07PM -0700, Andrew Morton wrote:
> - We'll continue to suck for the University workload.

Hope that's not a 2.6 option, because our University alone is
using Linux on 1000+ machines, on 500+ private machines and lots
of mission critical servers.

If Linux becomes crap for the CPU-Server-Load, we would be VERY
sorry here, since we are pushing it very hard[1].

Regards

Ingo Oeser

[1] All interpretations of this sentence apply.
--
Science is what we can tell a computer. Art is everything else. --- D.E.Knuth

2002-08-08 14:48:23

by Rik van Riel

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Wed, 7 Aug 2002, Ingo Oeser wrote:
> On Tue, Aug 06, 2002 at 07:57:07PM -0700, Andrew Morton wrote:
> > - We'll continue to suck for the University workload.
>
> Hop that's not an 2.6 option, because our University alone is
> using Linux on 1000+ machines, on 500+ private machines and lots
> of mission critical servers.
>
> If Linux becomes crap for the CPU-Server-Load, we would be VERY
> sorry here, since we are pushing it very hard[1].

Linux isn't yet up to having 500 simultaneous interactive
users, in fact I don't think it has ever been up to this
situation.

It'll probably work in many cases, but Linux just doesn't
have graceful degradation and code to cope with bad load
spikes (again, yet ... people are looking at handling this
stuff).

That doesn't mean Linux isn't working in your situation,
if it works right now it'll continue working right, chances
are it should run better in 2.6.

regards,

Rik
--
http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid. Go buy yourself a real t-shirt"

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-08 14:51:21

by Martin J. Bligh

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

>> - We'll continue to suck for the University workload.
>
> Hop that's not an 2.6 option, because our University alone is
> using Linux on 1000+ machines, on 500+ private machines and lots
> of mission critical servers.
>
> If Linux becomes crap for the CPU-Server-Load, we would be VERY
> sorry here, since we are pushing it very hard[1].

It'd be helpful if you benchmarked whatever workload you have
comparing 2.4 and 2.5 mainline then ;-) If it sucks, post some
profiling data, and carefully describe what your workload is.

M.

2002-08-08 18:38:18

by Alan

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Thu, 2002-08-08 at 15:51, Rik van Riel wrote:
> Linux isn't yet up to having 500 simultaneous interactive
> users, in fact I don't think it has ever been up to this
> situation.

It works surprisingly well. I know people who are doing it. It does not
work when those users are all running arbitrarily large jobs. In most
conventional (non student compile) type setups 500 is fine. The O(1)
scheduler and highio are pretty essential as is a real I/O subsystem.

2002-08-08 18:42:14

by Rik van Riel

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On 8 Aug 2002, Alan Cox wrote:
> On Thu, 2002-08-08 at 15:51, Rik van Riel wrote:
> > Linux isn't yet up to having 500 simultaneous interactive
> > users, in fact I don't think it has ever been up to this
> > situation.
>
> It works suprisingly well. I know people who are doing it. It does not
> work when those users are all running arbitarly large jobs. In most
> conventional (non student compile) type setups 500 is fine. The O(1)
> scheduler and highio are pretty essential as is a real I/O subsystem.

Agreed, it'll work when things are well behaved and the
system isn't overloaded.

However, having been a curious student myself I'm pretty
sure student workloads aren't always well behaved and do
have a tendency to overload the system once in a while.

I'm not sure Linux will be able to deal with the "I wonder
what happens if I ..." type students ;)

regards,

Rik
--
http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid. Go buy yourself a real t-shirt"

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-08 19:19:21

by William Lee Irwin III

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Thu, 2002-08-08 at 15:51, Rik van Riel wrote:
>> Linux isn't yet up to having 500 simultaneous interactive
>> users, in fact I don't think it has ever been up to this
>> situation.

On Thu, Aug 08, 2002 at 08:59:17PM +0100, Alan Cox wrote:
> It works suprisingly well. I know people who are doing it. It does not
> work when those users are all running arbitarly large jobs. In most
> conventional (non student compile) type setups 500 is fine. The O(1)
> scheduler and highio are pretty essential as is a real I/O subsystem.


Could you put me and/or Rik in contact with them so we can get a better
grip on their issues and characteristics of their workload?


Thanks,
Bill

2002-08-09 14:02:53

by Daniel Phillips

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Wednesday 07 August 2002 04:57, Andrew Morton wrote:
> Anton Blanchard wrote:
> > On ppc64 shared pagetables will require significant changes to the way
> > we handle the hardware hashtable. So add that to the "more and more crap
> > in there to support these pte_chains"
>
> Last I heard, pagetable sharing wasn't working out too well
> because they all get unshared.

That's only when you fork from a process with a minimal amount of VM mapped,
such as bash, which has 3 page tables allocated to it, all of which get
unshared. The situation is entirely different if you fork from a process
that has malloced more than a few meg, or beaten on a large mmap. Page table
sharing turns in a significant win there.

> > Will shared pagetables be a requirement or can we turn it on per arch?
>
> It's doubtful if per-arch would be an option.

It's currently expressed as a config option. As it's purely an optimization
there's no reason to do otherwise. Disabling it per-arch should be trivial.

> - We'll continue to suck for the University workload.

That seems likely ;-)

--
Daniel

2002-08-09 14:06:08

by Daniel Phillips

[permalink] [raw]
Subject: Re: fix CONFIG_HIGHPTE

On Wednesday 07 August 2002 20:43, Ingo Oeser wrote:
> On Tue, Aug 06, 2002 at 07:57:07PM -0700, Andrew Morton wrote:
> > - We'll continue to suck for the University workload.
>
> Hop that's not an 2.6 option, because our University alone is
> using Linux on 1000+ machines, on 500+ private machines and lots
> of mission critical servers.

Bill's university workload consists of 10,000 students logged into *one
machine*. I don't think you have to worry about that just now.

--
Daniel