High contention on the pagemap_lru lock seems to be a major
scalability problem for rmap at the moment. Based on wli's and
Rik's suggestions, I've made a first cut at a patch to split up the
lock into a per-page lock for each pte_chain.
This isn't ready to go yet - I'm not going to pretend it works. I'm
looking for feedback on the approach, and any obvious blunders
I've made in coding. I plan to move the lock in the page struct
into the flags field to save space once it's working reliably.
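Something like this is what I have in mind for the flags-field version
(a rough sketch only - the PG_chainlock bit and the helpers below are
made-up names, nothing that exists yet):

/* sketch for mm.h: spin on a page->flags bit instead of carrying a
 * whole spinlock_t in every struct page */
#define PG_chainlock            27      /* placeholder - needs a free bit */

static inline void pte_chain_lock(struct page *page)
{
        /* spin until we own the bit exclusively */
        while (test_and_set_bit(PG_chainlock, &page->flags)) {
                while (test_bit(PG_chainlock, &page->flags))
                        cpu_relax();
        }
}

static inline void pte_chain_unlock(struct page *page)
{
        smp_mb__before_clear_bit();
        clear_bit(PG_chainlock, &page->flags);
}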
If I may steal akpm's favourite disclaimer - "I know diddly squat
about ......" ;-) Flame away .....
Thanks,
Martin.
diff -urN linux-2.4.17-rmap/include/linux/mm.h linux-2.4.17-rmap-mjb/include/linux/mm.h
--- linux-2.4.17-rmap/include/linux/mm.h Thu Feb 14 17:53:17 2002
+++ linux-2.4.17-rmap-mjb/include/linux/mm.h Tue Feb 26 10:34:23 2002
@@ -160,6 +160,7 @@
protected by pagemap_lru_lock !! */
unsigned char age; /* Page aging counter. */
unsigned char zone; /* Memory zone the page belongs to. */
+ spinlock_t pte_chain_lock;
struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */
struct page **pprev_hash; /* Complement to *next_hash. */
struct buffer_head * buffers; /* Buffer maps us to a disk block. */
diff -urN linux-2.4.17-rmap/mm/filemap.c linux-2.4.17-rmap-mjb/mm/filemap.c
--- linux-2.4.17-rmap/mm/filemap.c Thu Feb 14 17:53:17 2002
+++ linux-2.4.17-rmap-mjb/mm/filemap.c Fri Feb 15 17:23:36 2002
@@ -235,8 +235,13 @@
static void truncate_complete_page(struct page *page)
{
/* Leave it on the LRU if it gets converted into anonymous buffers */
- if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0)))
+ pte_chain_lock(page);
+ if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0))) {
+ pte_chain_unlock(page);
lru_cache_del(page);
+ } else {
+ pte_chain_unlock(page);
+ }
/*
* We remove the page from the page cache _after_ we have
diff -urN linux-2.4.17-rmap/mm/page_alloc.c linux-2.4.17-rmap-mjb/mm/page_alloc.c
--- linux-2.4.17-rmap/mm/page_alloc.c Thu Feb 14 17:53:17 2002
+++ linux-2.4.17-rmap-mjb/mm/page_alloc.c Tue Feb 26 10:58:42 2002
@@ -127,8 +127,10 @@
BUG();
if (PageInactiveClean(page))
BUG();
+ pte_chain_lock(page);
if (page->pte_chain)
BUG();
+ pte_chain_unlock(page);
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
page->age = PAGE_AGE_START;
@@ -989,6 +991,7 @@
struct page *page = mem_map + offset + i;
set_page_zone(page, pgdat->node_id * MAX_NR_ZONES + j);
init_page_count(page);
+ page->pte_chain_lock = SPIN_LOCK_UNLOCKED;
__SetPageReserved(page);
memlist_init(&page->list);
if (j != ZONE_HIGHMEM)
diff -urN linux-2.4.17-rmap/mm/rmap.c linux-2.4.17-rmap-mjb/mm/rmap.c
--- linux-2.4.17-rmap/mm/rmap.c Thu Feb 14 17:53:17 2002
+++ linux-2.4.17-rmap-mjb/mm/rmap.c Tue Feb 26 16:50:55 2002
@@ -13,9 +13,8 @@
/*
* Locking:
- * - the page->pte_chain is protected by the pagemap_lru_lock,
- * we probably want to change this to a per-page lock in the
- * future
+ * - the page->pte_chain is protected by a per-page lock.
+ *
* - because swapout locking is opposite to the locking order
* in the page fault path, the swapout path uses trylocks
* on the mm->page_table_lock
@@ -53,13 +52,25 @@
static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *);
static void alloc_new_pte_chains(void);
+/* lock the pte_chain for this page */
+inline void pte_chain_lock(struct page *page)
+{
+ spin_lock(&page->pte_chain_lock);
+}
+
+/* unlock the pte_chain for this page */
+inline void pte_chain_unlock(struct page *page)
+{
+ spin_unlock(&page->pte_chain_lock);
+}
+
/**
* page_referenced - test if the page was referenced
* @page: the page to test
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of processes which referenced the page.
- * Caller needs to hold the pagemap_lru_lock.
+ * Caller needs to hold the page's pte_chain lock.
*/
int FASTCALL(page_referenced(struct page *));
int page_referenced(struct page * page)
@@ -95,7 +106,7 @@
if (!VALID_PAGE(page) || PageReserved(page))
return;
- spin_lock(&pagemap_lru_lock);
+ pte_chain_lock(page);
#ifdef DEBUG_RMAP
if (!page || !ptep)
BUG();
@@ -118,7 +129,7 @@
pte_chain->next = page->pte_chain;
page->pte_chain = pte_chain;
- spin_unlock(&pagemap_lru_lock);
+ pte_chain_unlock(page);
}
/**
@@ -141,7 +152,7 @@
if (!VALID_PAGE(page) || PageReserved(page))
return;
- spin_lock(&pagemap_lru_lock);
+ pte_chain_lock(page);
for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
if (pc->ptep == ptep) {
pte_chain_free(pc, prev_pc, page);
@@ -159,9 +170,8 @@
#endif
out:
- spin_unlock(&pagemap_lru_lock);
+ pte_chain_unlock(page);
return;
-
}
/**
@@ -240,7 +250,7 @@
* @page: the page to get unmapped
*
* Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path. Caller must hold pagemap_lru_lock
+ * page, used in the pageout path. Caller must hold pte_chain_lock
* and the page lock. Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
@@ -294,7 +304,7 @@
* we make the optimisation of only checking the first process
* in the pte_chain list, this should catch hogs while not
* evicting pages shared by many processes.
- * The caller needs to hold the pagemap_lru_lock.
+ * The caller needs to hold the page's pte_chain lock
*/
int FASTCALL(page_over_rsslimit(struct page *));
int page_over_rsslimit(struct page * page)
@@ -322,7 +332,7 @@
* This function unlinks pte_chain from the singly linked list it
* may be on and adds the pte_chain to the free list. May also be
* called for new pte_chain structures which aren't on any list yet.
- * Caller needs to hold the pagemap_lru_list.
+ * Caller needs to hold the page's pte_chain lock if page is not NULL
*/
static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page)
{
@@ -341,7 +351,7 @@
*
* Returns a pointer to a fresh pte_chain structure. Allocates new
* pte_chain structures as required.
- * Caller needs to hold the pagemap_lru_lock.
+ * Caller needs to hold the page's pte_chain lock
*/
static inline struct pte_chain * pte_chain_alloc(void)
{
diff -urN linux-2.4.17-rmap/mm/swap.c linux-2.4.17-rmap-mjb/mm/swap.c
--- linux-2.4.17-rmap/mm/swap.c Thu Feb 14 17:53:17 2002
+++ linux-2.4.17-rmap-mjb/mm/swap.c Fri Feb 15 17:35:53 2002
@@ -106,12 +106,14 @@
UnlockPage(page);
}
+ pte_chain_lock(page);
/* Make sure the page really is reclaimable. */
if (!page->mapping || PageDirty(page) || page->pte_chain ||
page->buffers || page_count(page) > 1)
deactivate_page_nolock(page);
else if (page_count(page) == 1) {
+ pte_chain_unlock(page);
ClearPageReferenced(page);
page->age = 0;
if (PageActive(page)) {
@@ -121,6 +123,8 @@
del_page_from_inactive_dirty_list(page);
add_page_to_inactive_clean_list(page);
}
+ } else {
+ pte_chain_unlock(page);
}
}
diff -urN linux-2.4.17-rmap/mm/vmscan.c linux-2.4.17-rmap-mjb/mm/vmscan.c
--- linux-2.4.17-rmap/mm/vmscan.c Thu Feb 14 17:53:17 2002
+++ linux-2.4.17-rmap-mjb/mm/vmscan.c Fri Feb 15 17:14:30 2002
@@ -48,6 +48,7 @@
page->age -= min(PAGE_AGE_DECL, (int)page->age);
}
+/* Must be called with the page's pte_chain lock held */
static inline int page_mapping_inuse(struct page * page)
{
struct address_space * mapping = page->mapping;
@@ -109,13 +110,16 @@
}
/* Page cannot be reclaimed ? Move to inactive_dirty list. */
+ pte_chain_lock(page);
if (unlikely(page->pte_chain || page->buffers ||
PageReferenced(page) || PageDirty(page) ||
page_count(page) > 1 || TryLockPage(page))) {
del_page_from_inactive_clean_list(page);
add_page_to_inactive_dirty_list(page);
+ pte_chain_unlock(page);
continue;
}
+ pte_chain_unlock(page);
/* OK, remove the page from the caches. */
if (PageSwapCache(page)) {
@@ -239,6 +243,7 @@
continue;
}
+ pte_chain_lock(page);
/*
* The page is in active use or really unfreeable. Move to
* the active list and adjust the page age if needed.
@@ -248,21 +253,26 @@
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
page->age = max((int)page->age, PAGE_AGE_START);
+ pte_chain_unlock(page);
continue;
}
/*
* Page is being freed, don't worry about it.
*/
- if (unlikely(page_count(page)) == 0)
+ if (unlikely(page_count(page)) == 0) {
+ pte_chain_unlock(page);
continue;
+ }
/*
* The page is locked. IO in progress?
* Move it to the back of the list.
*/
- if (unlikely(TryLockPage(page)))
+ if (unlikely(TryLockPage(page))) {
+ pte_chain_unlock(page);
continue;
+ }
/*
* Anonymous process memory without backing store. Try to
@@ -272,6 +282,7 @@
*/
if (page->pte_chain && !page->mapping && !page->buffers) {
page_cache_get(page);
+ pte_chain_unlock(page);
spin_unlock(&pagemap_lru_lock);
if (!add_to_swap(page)) {
activate_page(page);
@@ -282,6 +293,7 @@
}
page_cache_release(page);
spin_lock(&pagemap_lru_lock);
+ pte_chain_lock(page);
}
/*
@@ -295,6 +307,7 @@
goto page_active;
case SWAP_AGAIN:
UnlockPage(page);
+ pte_chain_unlock(page);
continue;
case SWAP_SUCCESS:
; /* try to free the page below */
@@ -319,6 +332,7 @@
page_cache_get(page);
spin_unlock(&pagemap_lru_lock);
+ pte_chain_unlock(page);
writepage(page);
page_cache_release(page);
@@ -348,6 +362,7 @@
*/
spin_lock(&pagemap_lru_lock);
UnlockPage(page);
+ pte_chain_unlock(page);
__lru_cache_del(page);
/* effectively free the page here */
@@ -369,6 +384,7 @@
} else {
/* failed to drop the buffers so stop here */
UnlockPage(page);
+ pte_chain_unlock(page);
page_cache_release(page);
spin_lock(&pagemap_lru_lock);
@@ -403,6 +419,7 @@
add_page_to_active_list(page);
UnlockPage(page);
}
+ pte_chain_unlock(page);
}
spin_unlock(&pagemap_lru_lock);
@@ -473,7 +490,9 @@
* bother with page aging. If the page is touched again
* while on the inactive_clean list it'll be reactivated.
*/
+ pte_chain_lock(page);
if (!page_mapping_inuse(page)) {
+ pte_chain_unlock(page);
drop_page(page);
continue;
}
@@ -497,9 +516,12 @@
list_add(page_lru, &zone->active_list);
} else {
deactivate_page_nolock(page);
- if (++nr_deactivated > target)
+ if (++nr_deactivated > target) {
+ pte_chain_unlock(page);
break;
+ }
}
+ pte_chain_unlock(page);
/* Low latency reschedule point */
if (current->need_resched) {
On Mon, 2002-03-04 at 19:04, Martin J. Bligh wrote:
> High contention on the pagemap_lru lock seems to be a major
> scalability problem for rmap at the moment. Based on wli's and
> Rik's suggestions, I've made a first cut at a patch to split up the
> lock into a per-page lock for each pte_chain.
>
> This isn't ready to go yet - I'm not going to pretend it works. I'm
> looking for feedback on the approach, and any obvious blunders
> I've made in coding. I plan to move the lock in the page struct
> into the flags field to save space once it's working reliably.
>
> If I may steal akpm's favourite disclaimer - "I know diddly squat
> about ......" ;-) Flame away .....
>
> Thanks,
>
> Martin.
>
Hi, knowing less than diddly squat about the code being discussed,
I would like to mention that I usually use some little macros
to get rid of code like:
        lock( my_lock);
        if ( exp) {
                unlock( my_lock);
                /* do fancy stuph */
        } else
                unlock( my_lock);
and make it look like this:
#define LOCK_EXP_F( exp, lock, f_lock, f_unlock) \
        ({ typeof( exp) e; \
           f_lock( lock); \
           e = (exp); \
           f_unlock( lock); \
           e; })

#define PAGELOCK_EXP( exp, page) \
        LOCK_EXP_F( exp, page, pte_chain_lock, pte_chain_unlock)

        if ( PAGELOCK_EXP( !page->pte_chain &&
                           (!page->buffers || do_flushpage( page, 0)), page))
                lru_cache_del( page);
If this is not an accepted coding style, so be it.
Another little thing I've been wondering about is why we keep using
LRU-style caches. Has anybody ever thought about using LRU-K
caches? I know they aren't O(1), but O(log(n)) isn't that bad
against the advantages (a rough sketch follows the list):
- easier to make concurrent (no head contention)
- better caching properties (takes low frequency
entries and cache sweeps into account)
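To make that concrete, here's a toy userspace sketch of the LRU-2
bookkeeping - everything in it is invented for illustration (it has
nothing to do with the kernel's lists), and a real version would keep
the entries in a heap or tree instead of scanning, which is where the
O(log(n)) comes from:

/* LRU-2 toy: evict the entry whose second-most-recent reference is
 * oldest, so entries touched only once (a cache sweep) lose to entries
 * with a real reference history. */
#include <stdio.h>

#define NENTRIES 8

struct lru2_entry {
        int in_use;
        int key;
        unsigned long last;     /* time of most recent reference */
        unsigned long prev;     /* reference before that, 0 = never */
};

static struct lru2_entry cache[NENTRIES];
static unsigned long now;

static struct lru2_entry *pick_victim(void)
{
        struct lru2_entry *victim = &cache[0];
        int i;

        /* linear scan here; a heap keyed on ->prev makes this O(log n) */
        for (i = 1; i < NENTRIES; i++)
                if (cache[i].prev < victim->prev)
                        victim = &cache[i];
        return victim;
}

static void reference(int key)
{
        int i;

        now++;
        for (i = 0; i < NENTRIES; i++)
                if (cache[i].in_use && cache[i].key == key) {
                        cache[i].prev = cache[i].last;
                        cache[i].last = now;
                        return;
                }

        for (i = 0; i < NENTRIES && cache[i].in_use; i++)
                ;
        if (i == NENTRIES) {
                struct lru2_entry *v = pick_victim();
                printf("evict %d\n", v->key);
                i = v - cache;
        }
        cache[i].in_use = 1;
        cache[i].key = key;
        cache[i].prev = 0;
        cache[i].last = now;
}

int main(void)
{
        int i;

        reference(1);
        reference(1);           /* key 1 now has two references */
        for (i = 100; i < 120; i++)
                reference(i);   /* one-off sweep evicts other sweep entries */
        reference(1);           /* key 1 survived the sweep under LRU-2 */
        return 0;
}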
just my 2ct.
Regards,
Peter Zijlstra
On Mon, Mar 04, 2002 at 10:04:51AM -0800, Martin J. Bligh wrote:
> High contention on the pagemap_lru lock seems to be a major
> scalability problem for rmap at the moment. Based on wli's and
> Rik's suggestions, I've made a first cut at a patch to split up the
> lock into a per-page lock for each pte_chain.
Some years ago when I put a spinlock in the page structure I was flamed :)
That was fair enough. Sometimes the theoretical maximum degree of
scalability isn't worth the additional RAM usage.
But anyway, can you show where you see this high contention on the
pagemap_lru lock? Maybe that's more a symptom that rmap is doing
something silly with the lock acquired, can you measure high contention
also on my tree on similar workloads? I think we should worry about the
pagecache_lock before the pagemap_lru lock. During heavy paging
activity the system should become I/O bound, and the spinlock there
shouldn't matter, while when the system time goes up the system usually
isn't running inside the VM but running from cache, and there the
pagecache_lock matters.
Andrea
--On Tuesday, March 05, 2002 3:02 AM +0100 Andrea Arcangeli
<[email protected]> wrote:
> On Mon, Mar 04, 2002 at 10:04:51AM -0800, Martin J. Bligh wrote:
>> High contention on the pagemap_lru lock seems to be a major
>> scalability problem for rmap at the moment. Based on wli's and
>> Rik's suggestions, I've made a first cut at a patch to split up the
>> lock into a per-page lock for each pte_chain.
>
> Some years ago when I put a spinlock in the page structure I was flamed :)
> That was fair enough. Sometimes the theoretical maximum degree of
> scalability isn't worth the additional RAM usage.
I'm starting to think that I used an extremely large power jackhammer
to break this up a little too much ;-) I'm not too broken-hearted about the
RAM usage - seems to me that the capacity of memory is growing faster than
its speed. John Stultz also pointed out to me today that the atomic ops to
grab these things aren't free either, but if we're only grabbing and
releasing the lock to do work on one page at a time anyway, then collating
the locking back up to something larger-scale won't amortise that cost.
> But anyway, can you show where you see this high contention on the
> pagemap_lru lock?
45.5% 72.0% 8.5us( 341us) 138us( 11ms)(44.3%) 3067414 28.0% 72.0% 0% pagemap_lru_lock
0.03% 31.7% 5.2us( 30us) 139us(3473us)(0.02%) 3204 68.3% 31.7% 0% deactivate_page+0xc
6.0% 78.3% 6.8us( 87us) 162us(9847us)( 9.4%) 510349 21.7% 78.3% 0% lru_cache_add+0x2c
6.7% 73.2% 7.6us( 180us) 120us(8138us)( 6.4%) 506534 26.8% 73.2% 0% lru_cache_del+0xc
12.7% 64.2% 7.1us( 151us) 140us( 10ms)(13.4%) 1023578 35.8% 64.2% 0% page_add_rmap+0x2c
20.1% 76.0% 11us( 341us) 133us( 11ms)(15.0%) 1023749 24.0% 76.0% 0% page_remove_rmap+0x3c
12-way NUMA-Q doing a kernel compile. Compile time increases by about 50%,
which surely isn't wholly due to this, but it does stick out like a sore
thumb in the lockmeter data. The NUMA characteristics of the machine make
this even worse - notice the 11ms wait time for a max hold time of 341us -
looks like the lock's getting bounced around unfairly within the node.
Average wait isn't too bad, but look at the count on it ;-(
I'll try John's MCS-type locking stuff one of these days when
I get a minute ... I suspect it'll help, but not be the proper answer.
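For anyone who hasn't seen the MCS idea, it's roughly the following
(a from-memory sketch, not John's actual code, memory barriers omitted):
each waiter spins on a flag in its own queue node, so the lock cacheline
only moves once per hand-off and waiters get it FIFO instead of whoever
happens to be closest.

/* rough MCS queue-lock sketch - not John's implementation */
struct mcs_node {
        struct mcs_node *next;
        volatile int locked;
};

struct mcs_lock {
        struct mcs_node *tail;          /* last waiter, or NULL if free */
};

static void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *me)
{
        struct mcs_node *prev;

        me->next = NULL;
        me->locked = 0;

        prev = xchg(&lock->tail, me);   /* atomically join the queue */
        if (prev) {
                prev->next = me;        /* let our predecessor find us */
                while (!me->locked)     /* spin on our own cacheline */
                        cpu_relax();
        }
}

static void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *me)
{
        if (!me->next) {
                /* nobody visibly queued behind us - try to swing the
                 * tail back to empty */
                if (cmpxchg(&lock->tail, me, NULL) == me)
                        return;
                while (!me->next)       /* a waiter raced in; wait for link */
                        cpu_relax();
        }
        me->next->locked = 1;           /* hand the lock straight to them */
}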
I suspect that once we've cracked this lock, there will be a few other
scalability things lying underneath, but that seems top of the pile
right now.
> Maybe that's more a symptom that rmap is doing
> something silly with the lock acquired,
It seems that we're reusing the pagemap_lru_lock for both the lru chain
and the pte chain locking, which is hurting somewhat. Maybe a per-zone
lock is enough to break this up (would also dispose of cross-node lock
cacheline bouncing) ... I still think the two chains need to be separated
from each other though.
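Roughly what I have in mind for the per-zone variant - sketch only, the
field and helper names are made up, and the pte_chain would keep its own
lock as in the patch above:

/* sketch: give each zone its own lock for its lru lists, so lru list
 * manipulation stops sharing a lock with pte_chain work and stops
 * bouncing one global cacheline across nodes */

/* new field in struct zone_struct, next to the existing lru lists: */
        spinlock_t              lru_lock;

/* helper - assumes some way of getting from a page back to its zone,
 * e.g. via the zone index this tree already keeps in struct page: */
static inline spinlock_t *page_lru_lock(struct page *page)
{
        return &page_zone(page)->lru_lock;
}

/* then code that currently takes pagemap_lru_lock just for lru list
 * work would do: */
        spin_lock(page_lru_lock(page));
        /* ... add/remove the page on this zone's lru lists ... */
        spin_unlock(page_lru_lock(page));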
> can you measure high contention
> also on my tree on similar workloads?
If I could boot your kernel ;-) When I have a free moment, I'll try
those fixes you already sent me to get the early ioremap working.
As the main users (both in count and duration) seem to be page_add_rmap
and page_remove_rmap, I doubt your kernel will have anything like the
same contention on that lock.
> I think we should worry about the
> pagecache_lock before the pagemap_lru lock. During heavy paging
> activity the system should become I/O bound, and the spinlock there
> shouldn't matter, while when the system time goes up the system usually
> isn't running inside the VM but running from cache, and there the
> pagecache_lock matters.
I think this problem is specific to rmap, thus for your kernel, the
priorities may be a little different. The usage pattern you describe
makes perfect sense to me, but would be a little difficult to measure
empirically - hacking lockmeter .... ;-)
Thanks,
Martin.
On Mon, 4 Mar 2002, Martin J. Bligh wrote:
> > Maybe that's more a symptom that rmap is doing
> > something silly with the lock acquired,
>
> It seems that we're reusing the pagemap_lru_lock for both the lru chain
> and the pte chain locking, which is hurting somewhat. Maybe a per-zone
> lock is enough to break this up (would also dispose of cross-node lock
> cacheline bouncing) ... I still think the two chains need to be separated
> from each other though.
Absolutely agreed. I'll happily accept patches for this,
but unfortunately don't have the time to implement this
myself right now.
The reason the pagemap_lru_lock protects both the lru
lists and the pte_chain lists is that it was the easiest
way to get -rmap running on SMP and I had (and still have)
a pretty large TODO list of -rmap and unrelated things...
regards,
Rik
--
Will hack the VM for food.
http://www.surriel.com/ http://distro.conectiva.com/
> But anyway, can you show where you see this high contention on the
> pagemap_lru lock? Maybe that's more a symptom that rmap is doing
> something silly with the lock acquired, can you measure high contention
> also on my tree on similar workloads? I think we should worry about the
> pagecache_lock before the pagemap_lru lock.
OK, we've now got some of the other problems on NUMA-Q fixed,
so I can see lock contention better. This is running on a
16-way NUMA-Q, which now has proper page_local allocation
working, compiling a kernel with a VM that's based on the
standard 2.4.18 (I haven't tried the latest -aa tree stuff
yet - if there's any particular part that might help this,
please let me know):
20.2% 57.1% 5.4us( 86us) 111us( 16ms)(14.7%) 1014988 42.9% 57.1% 0% pagemap_lru_lock
0.06% 17.2% 4.4us( 46us) 73us(2511us)(0.01%) 3751 82.8% 17.2% 0% activate_page+0xc
10.4% 55.9% 5.6us( 86us) 123us( 16ms)( 8.1%) 507541 44.1% 55.9% 0% lru_cache_add+0x1c
9.6% 58.5% 5.2us( 84us) 98us( 12ms)( 6.7%) 503696 41.5% 58.5% 0% lru_cache_del+0xc
Contention is bad, max hold time is good, max wait time is bad.
Looks like it's getting bounced around inside one node - I'll
try John Stultz's msclock stuff on this to fix that, but this
lock's still getting hammered, and could do with some medical
aid ;-) I have something to break it up per zone for rmap, but
I think your tree has a global lru, if I'm not mistaken?
Not sure what we can do about that one.
The pagecache_lock doesn't look good either, but it's less of a problem:
17.5% 31.3% 7.5us( 99us) 52us(4023us)( 2.4%) 631988 68.7% 31.3% 0% pagecache_lock
13.5% 33.2% 7.7us( 82us) 53us(3663us)( 1.9%) 475603 66.8% 33.2% 0% __find_get_page+0x18
0.12% 14.7% 2.8us( 54us) 43us(1714us)(0.02%) 11570 85.3% 14.7% 0% __find_lock_page+0x10
0.10% 10.7% 6.6us( 58us) 43us( 354us)(0.00%) 4270 89.3% 10.7% 0% add_to_page_cache_unique+0x18
3.8% 28.1% 8.0us( 99us) 52us(4023us)(0.43%) 127606 71.9% 28.1% 0% do_generic_file_read+0x1a4
0.00% 16.9% 8.1us( 34us) 42us( 108us)(0.00%) 65 83.1% 16.9% 0% do_generic_file_read+0x370
0.02% 16.1% 0.9us( 20us) 20us( 161us)(0.00%) 5580 83.9% 16.1% 0% filemap_fdatasync+0x20
0.02% 16.6% 0.9us( 21us) 20us( 145us)(0.00%) 5580 83.4% 16.6% 0% filemap_fdatawait+0x14
0.00% 1.7% 1.0us( 13us) 19us( 26us)(0.00%) 173 98.3% 1.7% 0% find_or_create_page+0x44
0.00% 1.7% 3.0us( 45us) 41us( 89us)(0.00%) 173 98.3% 1.7% 0% find_or_create_page+0x88
0.00% 2.4% 1.9us( 25us) 47us( 211us)(0.00%) 663 97.6% 2.4% 0% remove_inode_page+0x18
0.00% 16.7% 4.0us( 22us) 108us( 306us)(0.00%) 42 83.3% 16.7% 0% truncate_inode_pages+0x3c
0.00% 2.6% 0.8us( 30us) 80us( 864us)(0.00%) 663 97.4% 2.6% 0% truncate_list_pages+0x204
None of the hold times look too bad, but the lock's still getting
hammered. Would be nice to break this up per zone/node.
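Something along these lines is what I mean - purely a sketch, the names
and the hash are made up, and the real work is making every current
pagecache_lock user agree on which bucket lock to take:

/* sketch: replace the single global pagecache_lock with a small array
 * of locks picked by hashing the mapping (could equally hash on
 * (mapping, index), or go per zone/node) */
#define NR_PAGECACHE_LOCKS      64      /* arbitrary */

static spinlock_t pagecache_locks[NR_PAGECACHE_LOCKS] =
        { [0 ... NR_PAGECACHE_LOCKS-1] = SPIN_LOCK_UNLOCKED };

static inline spinlock_t *pagecache_lock_for(struct address_space *mapping)
{
        unsigned long h = (unsigned long)mapping;

        h ^= h >> 9;                    /* cheap hash on the pointer */
        return &pagecache_locks[h % NR_PAGECACHE_LOCKS];
}

/* then e.g. __find_get_page() would take
 *      spin_lock(pagecache_lock_for(mapping));
 * instead of spin_lock(&pagecache_lock). */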
dcache_lock is also a pain. I'm working with Hanna's stuff on that,
and making good progress. Still some way to go.
BKL is a pain in the rear, as usual ;-)
M.